Kateryna Babenko/

Topic Identification with Tf-idf


Topic Identification with Tf-idf

Identify and visualize the words that characterize a text within a collection of texts. Tf–idf, short for term frequency–inverse document frequency, is an information retrieval technique that reflects how important a word is to a document in a collection of documents. A tf-idf value of 0 indicates generic terms; higher values indicate words that uniquely identify a given document.

!pip install wordcloud
# Load packages
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import os 
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
# Upload your data as .txt files to the data folder.
# Every file becomes one document: `reference` maps the file's base name to
# its text, and `corpus` holds the same texts in the same order.
reference = dict()
corpus = []

# Sort the paths so the document order is identical on every run
# (glob makes no promise about the order of its matches).
for filepath in sorted(glob.glob("data/*.txt")):
    # Strip only the final extension; str.replace would also remove any
    # '.txt' that happens to occur in the middle of the name.
    basename = os.path.splitext(os.path.basename(filepath))[0]
    with open(filepath, 'r', encoding='utf-8') as f:
        # Join lines with a space so the last word of one line does not fuse
        # with the first word of the next one.
        content = f.read().replace("\n", ' ')
    reference[basename] = content
    corpus.append(content)
# Create a tf-idf matrix: one row per document, one column per term
vectorizer = TfidfVectorizer(
    stop_words='english',   # drop common English stop words
    ngram_range=(1, 1),     # single words only (unigrams)
    max_df=.6,              # ignore terms found in more than 60% of the documents
    min_df=.01,             # ignore terms found in fewer than 1% of the documents
)

# Fit on the corpus; the result is a sparse document-term matrix
X = vectorizer.fit_transform(corpus)

# Expand the sparse matrix into plain nested lists for pandas
dense = X.todense()
denselist = dense.tolist()

# Label each column with the term it stands for
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=denselist, columns=feature_names)
# Find the top words in each document.
# After the transpose, `data` has one row per term and one column per
# document, so each column holds the tf-idf scores of a single document.
data = df.transpose()
data.columns = list(reference.keys())

# Keep the 30 highest-scoring (word, score) pairs of every document
top_dict = {}
for name in data.columns:
    top = data[name].sort_values(ascending=False).head(30)
    top_dict[name] = list(zip(top.index, top.values))

# Print the top 15 words of each document, labelled with the document name.
# The slice is [:15]; the earlier [0:14] silently dropped the 15th word.
for president, top_words in top_dict.items():
    words = ', '.join(word for word, _score in top_words[:15])
    print(f"{president}: {words}")
# Prep TF-IDF matrix for word clouds: terms as rows, one column per document,
# with the columns named after the keys of `reference`
data = df.T.set_axis(list(reference.keys()), axis="columns")

# Color callback that paints every word in the same very dark shade
def black_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return one fixed, almost-black HSL color string.

    Every argument is accepted and ignored, so the result is the same for
    all words. The function is handed to ``wc.recolor`` as ``color_func``.
    """
    near_black = "hsl(0,100%, 1%)"
    return near_black

# Build the word cloud from the tf-idf scores of a single document.
# NOTE(review): the original statement was truncated (the WordCloud(...) call
# was never closed and its max_words line was lost). max_words below is the
# wordcloud library default; confirm the value that was intended.
wc = WordCloud(
    background_color="white",   # select background color
    width=3000,                 # set width
    height=2000,                # set height
    max_words=200,              # set max amount of words
).generate_from_frequencies(data['biden'])  # choose column for wordcloud

wc.recolor(color_func=black_color_func)     # set the word color to black
plt.figure(figsize=[15, 10])                # set the figsize
plt.imshow(wc, interpolation="bilinear")    # plot the wordcloud
plt.axis("off")                             # remove plot axes
plt.savefig('wordcloud.png')                # pick name and save as png
  • AI Chat
  • Code