SVD Topic Modeling

Beta

import glob
# Load every .txt document found under the corpus directory into `text`,
# one string per file.
path = 'Data2/'
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# document order (and therefore the TF-IDF row order) reproducible across runs.
all_files = sorted(glob.glob(path + "/*.txt"))
#print(all_files)
text = []
for file in all_files:
    # ISO-8859-1 maps every byte to a character, so decoding never raises.
    # NOTE(review): the custom stop word 'itõs' used later in this file looks
    # like a mis-decoded "it's" - the files may really be Mac Roman or
    # cp1252 encoded. Confirm before changing the encoding.
    with open(file, 'r', encoding='ISO-8859-1') as f:
        text.append(f.read())
print(text)

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Corpus-specific noise terms layered on top of scikit-learn's English list.
# NOTE(review): with the default analyzer (lowercasing, single-word tokens)
# the entries 'risk management' and 'JWM' can never match a token; they are
# kept as-is because 'risk', 'management' and 'jwm' already cover them.
custom_stop_words = ['http', 'https', 'amp', 'com','risk', 'project', 'risk management', 'risks','management','share', 'like', 'itõs','JWM','jwm','___']
stop_words = ENGLISH_STOP_WORDS.union(custom_stop_words)

# Recent scikit-learn releases validate `stop_words` as 'english', a list or
# None, so the frozenset built above is passed as a (sorted) list.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), smooth_idf=True)
# under the hood - lowercasing,removing special chars,removing stop words
# Keep the TF-IDF matrix sparse: TruncatedSVD accepts sparse input directly,
# and `.todense()` yields an np.matrix, which current scikit-learn rejects.
input_matrix = vectorizer.fit_transform(text)

svd_modeling = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=100, random_state=122)
svd_modeling.fit(input_matrix)
# One row per topic, one column per vocabulary term.
components = svd_modeling.components_
try:
    vocab = vectorizer.get_feature_names_out()
except AttributeError:
    # scikit-learn < 1.0 only provides the older accessor.
    vocab = vectorizer.get_feature_names()

topic_word_list = []


def get_topics(components, feature_names=None, top_n=7):
    """Build one space-separated string of top terms per SVD component.

    Args:
        components: Iterable of per-topic weight rows, each aligned
            position-by-position with ``feature_names``.
        feature_names: Vocabulary terms matching the columns of
            ``components``. Defaults to the module-level ``vocab``.
        top_n: Number of highest-weighted terms to keep for each topic.

    Returns:
        The module-level ``topic_word_list``, refilled in place with one
        string per component, so calling this again does not accumulate
        duplicate topics.
    """
    if feature_names is None:
        feature_names = vocab

    topics = []
    for comp in components:
        # Rank terms by their weight in this component, largest first.
        ranked = sorted(zip(feature_names, comp), key=lambda pair: pair[1], reverse=True)
        topics.append(" ".join(term for term, _ in ranked[:top_n]))

    # Replace the contents rather than rebinding so existing references to
    # topic_word_list keep seeing the current topics.
    topic_word_list[:] = topics
    print(topic_word_list)
    return topic_word_list
# Build the topic strings from the fitted SVD components; get_topics also
# fills the module-level topic_word_list that the word-cloud loop reads.
get_topics(components)

!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Render one word cloud per topic string. Iterating the list itself (rather
# than a hard-coded range(4)) keeps this in step with n_components.
for topic_number, topic_text in enumerate(topic_word_list, start=1):
  # relative_scaling=0: font sizes depend on word rank only, not frequency.
  wc = WordCloud(width=1000, height=600, margin=3,  prefer_horizontal=0.7,scale=1,background_color='black', relative_scaling=0).generate(topic_text)
  plt.imshow(wc)
  plt.title(f"Topic{topic_number}")
  plt.axis("off")
  plt.show()