Text Analysis

This is a pre-processing example: we prepare text that we will later analyse and make predictions on. The dataset was imported from Kaggle and contains tweets about the FIFA World Cup 2022 posted on Twitter.

%%capture
!pip install demoji
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import plotly.express as px
import pandas as pd
import demoji
import string

demoji.download_codes()  # Deprecated (and unnecessary) on demoji >= 1.0, where the emoji codes ship with the package

df = pd.read_csv('fifa_world_cup_2022_tweets[1].csv')
df = df.rename(columns={'Unnamed: 0':'ID'}) # Rename column because it's meant to be the tweet ID
df['Hashtag'] = [[] for i in range(len(df['Sentiment']))] # Create a column for future use
df.head(5)
# Check for missing values, as this is a real-world dataset
print(df.isnull().sum())

print('\n') # Separate the outputs

# Check the data types
print(df.dtypes)
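If the check above turns up any missing values, a minimal way to handle them (a sketch, assuming we would simply drop incomplete rows rather than impute) would be:

# Hypothetical clean-up step: drop rows with any missing values and report how many were removed
before = len(df)
df = df.dropna().reset_index(drop=True)
print(f'Dropped {before - len(df)} incomplete rows')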

Checks complete, now on to the data displays.

In this code, we strip emojis and punctuation and lowercase the text, so we can count the most popular hashtags.

# Function for removing emojis
def remove_em(text):
    dem = demoji.findall(text)  # Maps each emoji found in the text to its description
    for item in dem.keys():
        text = text.replace(item, '')
    return text
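A quick sanity check of remove_em on a made-up example (the sample string below is ours, not from the dataset):

sample = 'Goal!!! ⚽🔥 #WorldCup'
print(remove_em(sample))  # The emojis should be stripped, leaving 'Goal!!!  #WorldCup'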

# Get all the hashtags
hashtags = []

for tweet in df['Tweet']:
    for word in tweet.split():
        if '#' in word:
            word = word.lower()
            word = word.translate(str.maketrans('', '', string.punctuation))
            word = remove_em(word)
            if word:  # Skip tokens that were nothing but punctuation/emoji
                hashtags.append('#' + word)

# Build a hashtag -> frequency table directly from the Counter
all_hash = pd.Series(Counter(hashtags), name='frequency').to_frame()

all_hash = all_hash.sort_values('frequency',ascending=False)

pop_hash = all_hash.head(10).reset_index()
non_popular_hash = all_hash.tail(-10)[all_hash.tail(-10)['frequency']>100]

# DataFrame.append was removed in pandas 2.0, so use pd.concat to add the 'Other' row
pop_hash = pd.concat([pop_hash, pd.DataFrame([{'index':'Other', 'frequency':non_popular_hash['frequency'].sum()}])], ignore_index=True)

pop_hash = pop_hash.set_index('index').sort_values('frequency', ascending=False)
pop_hash

Here we visualize the most popular hashtags as a pie chart. You can interact with it to see the percentage share of each of the most common tags.

# Visualize the data

title = '10 most popular hashtags'
note = (f"Note: 'Other' consists of {len(non_popular_hash)} other hashtags<br>"
        "(with a frequency > 100), their<br>"
        "frequency added together")

fig = px.pie(pop_hash.reset_index().rename(columns={'index':'Hashtag'}),
             values='frequency', names='Hashtag', title=title, hole=0.4)

# A single annotation with <br> line breaks replaces three hand-positioned ones
fig.add_annotation(x=0.02, y=1, xref='paper', yref='paper',
                   text=note, showarrow=False, align='left')

fig.show()
# Count how many hashtags contain a given substring

def merge(big_dict, key1, key2, new_name):
    big_dict[new_name] = big_dict[key1] + big_dict[key2]
    # Delete the merged keys - we don't need them anymore
    del big_dict[key1]
    del big_dict[key2]
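A quick illustration of merge on a toy dictionary (the keys and counts are made up):

toy = {'a': 2, 'b': 3, 'c': 1}
merge(toy, 'a', 'b', 'a/b')
print(toy)  # {'c': 1, 'a/b': 5}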

contains_dict = {'worldcup' : 0,
                 'kook' : 0,
                 'fifa' : 0,
                 'qatar' : 0,
                 'football' : 0,
                 'soccer' : 0,
                 '2022' : 0,
                 'ecuador' : 0,
                 'jung' : 0
                }

for tag in hashtags:
    for key in contains_dict:
        if key in tag:
            contains_dict[key] += 1

merge(contains_dict, 'football', 'soccer', 'football/soccer') # Merge `football` and `soccer` since they name the same sport
merge(contains_dict, 'jung', 'kook', '`jung` or `kook`')

contains_dict = dict(sorted(contains_dict.items(), key=lambda kv: kv[1], reverse=True)) # Sort by count, descending (dicts keep insertion order in Python 3.7+)

print('Count of common substrings in different hashtags: |')
print('_' * 50 + '|')

for n, (key, count) in enumerate(contains_dict.items(), start=1):
    line = f"{n}. '{key}' with a count of '{count}'"
    print(line + (50 - len(line)) * ' ' + '|')

print('_' * 51)
    
contains_df = pd.DataFrame(contains_dict.values(), index=contains_dict.keys(), columns=['frequency']).reset_index().rename(columns={'index':'substring'})

fig = px.pie(contains_df, values='frequency', names='substring', title='Frequency of substrings in different hashtags', hole=0.5)
fig.show()
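Next we move from hashtags to the tweet text itself, scoring terms with TF-IDF (term frequency weighted by inverse document frequency), so words that appear in almost every tweet count for less than distinctive ones. As a quick illustration of what TfidfVectorizer produces, here is a sketch on a made-up three-document corpus:

toy_corpus = ['the cup is here', 'the final is close', 'football is life']
toy_tv = TfidfVectorizer()
toy_matrix = toy_tv.fit_transform(toy_corpus)
print(toy_tv.get_feature_names_out())  # The learned vocabulary
print(toy_matrix.toarray().round(2))   # One row per document, one tf-idf weight per term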
# Reload the raw data so the TF-IDF stage starts from a clean copy of the tweets
df = pd.read_csv('fifa_world_cup_2022_tweets[1].csv')
df = df.rename(columns={'Unnamed: 0':'ID'}) # Rename column because it's meant to be the tweet ID
df['Hashtag'] = [[] for i in range(len(df['Sentiment']))] # Create a column for future use

# Redefine remove_em to replace emojis with a space this time, so adjacent words don't fuse together
def remove_em(text):
    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, ' ')
    return text

tv = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,3))

tweets = df['Tweet'].tolist()

for i in range(len(tweets)):
    words = tweets[i].split()
    for j in range(len(words)):
        words[j] = remove_em(words[j]).lower()
        words[j] = words[j].translate(str.maketrans('', '', string.punctuation))
    tweets[i] = ' '.join(words)
        
df['tweets_clean'] = tweets
tv_transformed = tv.fit_transform(tweets)

tv_array = tv_transformed.toarray()

tv_df = pd.DataFrame(
    tv_array,
    columns=tv.get_feature_names_out(),  # get_feature_names() was removed in scikit-learn 1.2
)

df = pd.concat([df,tv_df], axis=1, sort=False)

total = tv_df.sum().sort_values(ascending=False).head(10)

print(total)

fig = px.pie(title='10 most important words of the tweets',
             names=total.index,
             values=total.values,
             hole=0.5)
fig.show()
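Finally, we tag each tweet with its single highest-weighted term. pandas' DataFrame.idxmax(axis=1) returns, for each row, the name of the column holding that row's maximum, which is exactly the per-tweet argmax we need. A toy illustration with made-up weights:

toy = pd.DataFrame({'cup': [0.1, 0.7], 'goal': [0.9, 0.2]})
print(toy.idxmax(axis=1))  # Row 0 -> 'goal', row 1 -> 'cup'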
# Tag each tweet with its most important (highest tf-idf) feature
df['most_important_word'] = tv_df.idxmax(axis=1)

most_important_word_count = dict(Counter(df['most_important_word'].tolist()).most_common(10))

fig = px.pie(values=list(most_important_word_count.values()), names=list(most_important_word_count.keys()), title='How many times each word is chosen as the most important', hole=0.4)
fig.show()