This notebook walks through pre-processing of text that we will later analyse and make predictions on. The dataset comes from Kaggle and consists of tweets about the FIFA World Cup 2022 posted on Twitter.
%%capture
!pip install demoji
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import numpy as np
import demoji
import string
demoji.download_codes()  # Needed only for older demoji versions; releases >= 1.0 bundle the emoji codes
df = pd.read_csv('fifa_world_cup_2022_tweets[1].csv')
df = df.rename(columns={'Unnamed: 0':'ID'}) # Rename column because it's meant to be the tweet ID
df['Hashtag'] = [[] for i in range(len(df['Sentiment']))] # Create a column for future use
df.head(5)
# We have to check for missing values as this is a real-world dataset
print(df.isnull().sum())
print('\n') # Separate outputs
# Check the data types
print(df.dtypes)
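If the check above had reported missing values, a minimal (hypothetical) cleanup would be to drop the affected rows before continuing; a sketch:
# Hypothetical cleanup step, only needed if the isnull() check finds gaps
df = df.dropna(subset=['Tweet', 'Sentiment']).reset_index(drop=True)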
The checks pass, so we can move on to exploring and displaying the data.
In the next cell we strip emojis and punctuation from every tweet and lower-case the text, so that variants of the same hashtag are counted together when we tally the most popular ones.
# Function for removing emojis
def remove_em(text):
    dem = demoji.findall(text)  # Map each emoji in the text to its description
    for item in dem.keys():
        text = text.replace(item, '')
    return text
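# Quick sanity check on a hypothetical string (not from the dataset);
# this should print 'Goal ' with the emoji removed:
print(remove_em('Goal ⚽'))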
# Get all the hashtags
hashtags = []
for i in df['Tweet']:
    words = i.split()
    for j in words:
        if '#' in j:
            j = j.lower()
            j = j.translate(str.maketrans('', '', string.punctuation))  # This also strips the leading '#'
            j = remove_em(j)
            hashtags.append('#' + j)  # Re-attach the '#' after cleaning
# Build a frequency table from the hashtag counts
all_hash = pd.DataFrame.from_dict(Counter(hashtags), orient='index', columns=['frequency'])
all_hash = all_hash.sort_values('frequency', ascending=False)
# Keep the 10 most popular hashtags and collapse every remaining hashtag
# with a frequency above 100 into a single 'Other' row
pop_hash = all_hash.head(10).reset_index()
non_popular_hash = all_hash.iloc[10:]
non_popular_hash = non_popular_hash[non_popular_hash['frequency'] > 100]
other_row = pd.DataFrame([{'index': 'Other', 'frequency': non_popular_hash['frequency'].sum()}])
pop_hash = pd.concat([pop_hash, other_row], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
pop_hash = pop_hash.set_index('index').sort_values('frequency', ascending=False)
pop_hash
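The same frequency table can also be produced in one step with pandas value_counts; a minimal equivalent sketch:
# Equivalent to the Counter-based table built above
alt_hash = pd.Series(hashtags).value_counts().rename('frequency')
print(alt_hash.head(10))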
Here we visualize the most popular hashtags as a pie chart. You can interact with it to compare the shares, as percentages, of the most common tags.
# Visualize the data
title = '10 most popular hashtags'
text1 = f"Note: 'Other' consists of {len(non_popular_hash)} other hashtags"
text2 = '(with a frequency > 100), their'
text3 = 'frequency added together'
fig = px.pie(pop_hash.reset_index().rename(columns={'index':'Hashtag'}), values='frequency', names='Hashtag', title=title, hole=0.4)
fig.add_annotation(x=0.05, y=1, text=text1, showarrow=False)
fig.add_annotation(x=-0.01, y=0.95, text=text2, showarrow=False)
fig.add_annotation(x=-0.04, y=0.894, text=text3, showarrow=False)
fig.show()
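Optionally, the interactive chart can be saved as a standalone HTML file (the filename here is just an example):
# Optional: export the interactive figure for viewing outside the notebook
fig.write_html('popular_hashtags.html')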
# Number of hashtags that contain <word>
def merge(big_dict, key1, key2, new_name):
    big_dict[new_name] = big_dict[key1] + big_dict[key2]
    del big_dict[key1]  # Delete the merged keys; they are no longer needed
    del big_dict[key2]
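# For example (hypothetical input): merge({'a': 1, 'b': 2}, 'a', 'b', 'a/b')
# turns the dictionary into {'a/b': 3}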
contains_dict = {'worldcup' : 0,
'kook' : 0,
'fifa' : 0,
'qatar' : 0,
'football' : 0,
'soccer' : 0,
'2022' : 0,
'ecuador' : 0,
'jung' : 0
}
for i in hashtags:
    for key in contains_dict:
        if key in i:
            contains_dict[key] += 1
merge(contains_dict, 'football', 'soccer', 'football/soccer')  # `football` and `soccer` refer to the same sport
merge(contains_dict, 'jung', 'kook', '`jung` or `kook`')
contains_dict = dict(sorted(contains_dict.items(), key=lambda kv: kv[1], reverse=True))  # Sort by count, descending
print('Count of common substrings in different hashtags: |')
print('_' * 50 + '|')
for n, (key, count) in enumerate(contains_dict.items(), start=1):
    line = f"{n}. '{key}' with a count of `{count}`"
    print(line + (50 - len(line)) * ' ' + '|')
print('_' * 51)
contains_df = pd.DataFrame({'substring': list(contains_dict.keys()), 'frequency': list(contains_dict.values())})
fig = px.pie(contains_df, values='frequency', names='substring', title='Frequency of substrings in different hashtags', hole=0.5)
fig.show()
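As a sanity check, a few of the substring counts can be reproduced with vectorized pandas string matching; a minimal sketch (the three keys are picked arbitrarily):
# Cross-check the loop-based counts with Series.str.contains
s = pd.Series(hashtags)
for key in ['worldcup', 'fifa', 'qatar']:
    print(key, int(s.str.contains(key, regex=False).sum()))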
# Reload the raw data so the TF-IDF section starts from a clean copy
df = pd.read_csv('fifa_world_cup_2022_tweets[1].csv')
df = df.rename(columns={'Unnamed: 0':'ID'}) # Rename column because it's meant to be the tweet ID
df['Hashtag'] = [[] for i in range(len(df['Sentiment']))] # Create a column for future use
# Redefine remove_em: this time each emoji is replaced with a space so that
# adjacent words are not glued together
def remove_em(text):
    dem = demoji.findall(text)
    for item in dem.keys():
        text = text.replace(item, ' ')
    return text
tv = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 3))
# Clean every tweet: strip emojis, lower-case, and remove punctuation
tweets = df['Tweet'].tolist()
for i in range(len(tweets)):
    words = tweets[i].split()
    for j in range(len(words)):
        words[j] = remove_em(words[j]).lower()
        words[j] = words[j].translate(str.maketrans('', '', string.punctuation))
    tweets[i] = ' '.join(words)
df['tweets_clean'] = tweets
tv_transformed = tv.fit_transform(tweets)
tv_array = tv_transformed.toarray()
tv_df = pd.DataFrame(
tv_array,
    columns=tv.get_feature_names_out(),  # get_feature_names() was removed in scikit-learn 1.2
)
df = pd.concat([df,tv_df], axis=1, sort=False)
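# df now carries one extra column per TF-IDF feature (up to 1000) alongside the original fields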
total = tv_df.sum().sort_values(ascending=False).head(10)
print(total)
fig = px.pie(title='10 most important words of the tweets',
names=total.index,
values=total.values,
hole=0.5)
fig.show()
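To make these scores more concrete, here is a tiny toy example on an invented three-sentence corpus (purely illustrative, not from the dataset); a term that appears in fewer documents gets a higher weight:
# 'final' occurs in only one toy document, so it scores higher than
# 'the', 'world' and 'cup', which are shared by two documents
toy_corpus = ['the world cup final', 'the world cup', 'football is life']
toy_tv = TfidfVectorizer()
toy_matrix = toy_tv.fit_transform(toy_corpus).toarray()
print(dict(zip(toy_tv.get_feature_names_out(), toy_matrix[0].round(2))))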
# Now we attach each tweet's highest-weighted feature as a column of our tweets df.
# idxmax(axis=1) returns, for every row, the name of the column holding the largest
# TF-IDF score, i.e. the most important word (or n-gram) of that tweet.
df['most_important_word'] = tv_df.idxmax(axis=1)
most_important_word_count = dict(Counter(df['most_important_word'].tolist()).most_common(10))
fig = px.pie(values=list(most_important_word_count.values()), names=list(most_important_word_count.keys()), title='Count of how many times each important word was chosen', hole=0.4)
fig.show()