Beta
Table of Contents
The outline of your notebook will show up here. You can include headings in any text cell by starting a line with `#`, `##`, `###`, etc., depending on the desired title hierarchy.
import glob
# Load every .txt document found under `path` into `text`, one string per file.
path = 'Data2/'
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# document order (and so the row order of anything built from `text`)
# reproducible between runs and machines.
all_files = sorted(glob.glob(path + "/*.txt"))
#print(all_files)
text = []
for file in all_files:
    # ISO-8859-1 maps every byte to a character, so decoding never raises;
    # a file saved in another encoding produces mojibake rather than an error.
    with open(file, 'r', encoding='ISO-8859-1') as f:
        text.append(f.read())
print(text)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Corpus-specific tokens to drop in addition to scikit-learn's English list.
# NOTE(review): with the vectorizer's default settings tokens are lowercased
# single words, so 'JWM' and 'risk management' probably never match - confirm.
custom_stop_words = ['http', 'https', 'amp', 'com','risk', 'project', 'risk management', 'risks','management','share', 'like', 'itõs','JWM','jwm','___']
stop_words = ENGLISH_STOP_WORDS.union(custom_stop_words)
# Current scikit-learn validates `stop_words` as 'english', a list or None and
# rejects the frozenset produced by union(); pass a (sorted, hence
# deterministic) list instead while leaving `stop_words` itself unchanged.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), smooth_idf=True)
# under the hood - lowercasing,removing special chars,removing stop words
# .toarray() gives a plain ndarray. The previous .todense() returned an
# np.matrix, which scikit-learn estimators no longer accept as input.
input_matrix = vectorizer.fit_transform(text).toarray()
# Latent semantic analysis: 4 components, fixed seed for repeatable output.
svd_modeling = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=100, random_state=122)
svd_modeling.fit(input_matrix)
components = svd_modeling.components_
# get_feature_names() has been removed from scikit-learn; prefer the
# replacement and fall back only on old releases that do not provide it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
# Module-level result list; it is read by the word-cloud code further down.
topic_word_list = []


def get_topics(components, vocabulary=None, n_terms=7):
    """Build one space-separated string of top-weighted terms per component.

    Args:
        components: Iterable of weight sequences, one per topic, where each
            weight lines up positionally with a vocabulary term.
        vocabulary: Terms aligned with the weights. Defaults to the
            module-level ``vocab``.
        n_terms: How many of the highest-weighted terms to keep per topic.

    Returns:
        The module-level ``topic_word_list``, holding exactly one string per
        component. Each string starts with two spaces, followed by the terms
        separated by single spaces (the original output format).

    The list is replaced in place rather than appended to, so calling the
    function again does not leave stale or duplicated topics behind.
    """
    terms = vocab if vocabulary is None else vocabulary
    topics = []
    for comp in components:
        # sorted() is stable, so equally weighted terms keep vocabulary order.
        ranked = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)
        topics.append(" " + "".join(" " + term for term, _ in ranked[:n_terms]))
    # Slice assignment keeps the identity of the shared list intact for any
    # code that already holds a reference to it.
    topic_word_list[:] = topics
    print(topic_word_list)
    return topic_word_list
# Build the topic strings from the SVD components; this fills topic_word_list.
get_topics(components)
!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Render one word cloud per extracted topic, each in its own figure.
# Iterating over the list itself (rather than a hard-coded range(4)) keeps the
# plots in step with however many topics were produced and avoids an
# IndexError when fewer than four exist.
for topic_number, topic_text in enumerate(topic_word_list, start=1):
    wc = WordCloud(
        width=1000,
        height=600,
        margin=3,
        prefer_horizontal=0.7,
        scale=1,
        background_color='black',
        relative_scaling=0,
    ).generate(topic_text)
    plt.imshow(wc)
    plt.title(f"Topic{topic_number}")
    plt.axis("off")
    plt.show()