SVD Topic Modeling
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner
    import glob

    # Gather every .txt document in the corpus directory.
    path = 'Data2/'
    all_files = glob.glob(path + "/*.txt")
    #print(all_files)

    def _read_document(filename):
        # Corpus files are Latin-1 encoded; decode with ISO-8859-1 so no
        # byte sequence can raise a UnicodeDecodeError.
        with open(filename, 'r', encoding='ISO-8859-1') as handle:
            return handle.read()

    # One string per document, in glob order.
    text = [_read_document(name) for name in all_files]
    print(text)
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    # Domain-specific noise terms. 'itõs' is the ISO-8859-1 mojibake of
    # "it's" exactly as it appears in the decoded corpus, so it must stay
    # byte-identical; multi-word entries such as 'risk management' never
    # match single-token features but are kept to document intent.
    custom_stop_words = ['http', 'https', 'amp', 'com','risk', 'project', 'risk management', 'risks','management','share', 'like', 'itõs','JWM','jwm','___']
    stop_words = ENGLISH_STOP_WORDS.union(custom_stop_words)

    # TfidfVectorizer lowercases, strips special characters and drops stop
    # words under the hood. It expects stop_words as a list — passing the
    # frozenset from .union() triggers a warning in recent scikit-learn.
    vectorizer = TfidfVectorizer(stop_words=list(stop_words), smooth_idf=True)

    # Keep the TF-IDF matrix sparse: TruncatedSVD is designed to work on
    # sparse input, and the previous .todense() call produced the
    # deprecated np.matrix type that newer scikit-learn rejects.
    input_matrix = vectorizer.fit_transform(text)

    # 4 latent topics; fixed random_state keeps the randomized solver
    # reproducible across runs.
    svd_modeling = TruncatedSVD(n_components=4, algorithm='randomized',
                                n_iter=100, random_state=122)
    svd_modeling.fit(input_matrix)
    components = svd_modeling.components_

    # get_feature_names() was removed in scikit-learn 1.2; the
    # get_feature_names_out() replacement returns the same vocabulary
    # (as an ndarray, which zip() handles identically downstream).
    vocab = vectorizer.get_feature_names_out()
    # Human-readable term summary for each SVD topic. get_topics() extends
    # this module-level list (kept for backward compatibility with the
    # plotting code below) and also returns it.
    topic_word_list = []

    def get_topics(components, feature_names=None, n_terms=7):
        """Summarize each SVD component as its top-weighted vocabulary terms.

        Parameters
        ----------
        components : iterable of per-topic weight vectors (e.g.
            ``TruncatedSVD.components_``), each aligned with the vocabulary.
        feature_names : sequence of str, optional
            Vocabulary terms; defaults to the module-level ``vocab`` built
            by the vectorizer above.
        n_terms : int, optional
            Number of top terms kept per topic (default 7, as before).

        Returns
        -------
        list of str
            One space-joined string of top terms per component. Side
            effect: the same strings are appended to the module-level
            ``topic_word_list``.
        """
        names = vocab if feature_names is None else feature_names
        for comp in components:
            # Rank vocabulary terms by their weight in this component.
            ranked = sorted(zip(names, comp), key=lambda pair: pair[1], reverse=True)
            top_terms = [term for term, _weight in ranked[:n_terms]]
            # join() replaces the old quadratic concatenation and drops the
            # stray leading spaces (WordCloud tokenizes, so spacing is inert).
            topic_word_list.append(" ".join(top_terms))
        # Log once after the loop instead of re-printing the growing list
        # on every iteration.
        print(topic_word_list)
        return topic_word_list
    # Build the per-topic term summaries. Rebinding is a no-op when
    # get_topics returns the module-level list, but makes the data flow
    # explicit for readers.
    topic_word_list = get_topics(components)

    # wordcloud is a third-party dependency; install it once from a shell
    # or notebook cell ("pip install wordcloud") — the original "!pip"
    # line was IPython magic and a SyntaxError in plain Python.
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    # Render one word cloud per topic; iterating the list (rather than the
    # old hard-coded range(4)) keeps this in sync with n_components.
    for topic_number, topic_words in enumerate(topic_word_list, start=1):
        wc = WordCloud(width=1000, height=600, margin=3, prefer_horizontal=0.7,
                       scale=1, background_color='black',
                       relative_scaling=0).generate(topic_words)
        plt.imshow(wc)
        plt.title(f"Topic{topic_number}")
        plt.axis("off")
        plt.show()