# Beta
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Draw every figure below with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')
from matplotlib.pyplot import figure
# Default size for the plots we will create.
matplotlib.rcParams.update({'figure.figsize': (12, 8)})
# Load the movie dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='movies.csv')
# Preview the first five rows
df.head(n=5)
# Let's see if there is any missing data
# Per-column flag: True when the column contains at least one null.
print(df.isnull().any())
# Per-column share of null values. Despite the name, `pct_missing` is a
# fraction between 0.0 and 1.0 (mean of a boolean mask), not a percentage.
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print(f'{col} - {pct_missing}')
# Data types for our columns
df.dtypes
# Scratch experiment on string replacement.
# `df2 = df` would only bind a second name to the same DataFrame, so adding a
# column through df2 would also add it to df (and to every frame derived from
# df later on). Take a real copy to keep the experiment isolated.
df2 = df.copy()
df2['new'] = 'x'
# Series.str.replace returns a new Series and leaves the column untouched, so
# the result has to be assigned back. regex=False: "x" is a literal, not a pattern.
df2['new'] = df2['new'].str.replace("x", "y", regex=False)
# Fill missing numeric values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise on the text columns. Missing values in non-numeric
# columns are left as they are.
df_cleaned = df.fillna(df.mean(numeric_only=True))
# Store the money columns as whole numbers.
# NOTE(review): assumes 'budget' and 'gross' are numeric columns whose gaps
# were filled above; astype('int64') fails if any NaN remains - confirm.
df_cleaned['budget'] = df_cleaned['budget'].astype('int64')
df_cleaned['gross'] = df_cleaned['gross'].astype('int64')
#df_cleaned['year'] = pd.to_datetime(df_cleaned['released'], format='%B %d, %Y')
# sort_values(inplace=False) returns a new frame; keep it, otherwise the sort
# has no effect on df_cleaned.
df_cleaned = df_cleaned.sort_values(by=['gross'], ascending=True)
df_cleaned
#df_cleaned['company'].drop_duplicates().sort_values(ascending=True)
# drop_duplicates also returns a new frame; assign it so fully duplicated rows
# are actually removed before the analysis below.
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned
# Educated guess: budget & company would have high correlation with gross revenue
# Scatter plot with budget on the x-axis and gross on the y-axis; the axis
# labels must follow that order.
plt.scatter(x=df_cleaned['budget'], y=df_cleaned['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earnings')
plt.show()
# Same relationship with a fitted regression line (blue translucent points, red line).
sns.regplot(x='budget', y='gross', data=df_cleaned, scatter_kws={"color": "blue", 'alpha':0.3},line_kws={'color': 'red'})
# Correlation between the numeric columns only. df_cleaned still holds text
# columns here, and DataFrame.corr raises on those in recent pandas unless
# numeric_only=True is passed.
df_cleaned.corr(method='spearman', numeric_only=True) #pearson, spearman, kendall
# High correlation between gross and (budget, votes)
correlation_matrix = df_cleaned.corr(method='pearson', numeric_only=True)
# Heatmap of the Pearson matrix with the coefficient written in each cell.
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Look at company: sorted list of the distinct company names
df_cleaned['company'].sort_values().unique()
# Encode every text (object) column as integer category codes so it can take
# part in the correlation matrix. Work on a copy: `df_numerized = df_cleaned`
# would only alias the frame, and the encoding below would then overwrite the
# text columns of df_cleaned as well.
df_numerized = df_cleaned.copy()
for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        # Category codes are arbitrary integer labels; missing values become -1.
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes
# Pearson correlation over all columns, now that the text ones are encoded.
correlation_matrix = df_numerized.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Flatten the correlation matrix into one long Series with an entry per pair
# of features, then order the entries from lowest to highest coefficient.
corr_pairs = correlation_matrix.unstack(level=-1)
sorted_pairs = corr_pairs.sort_values(ascending=True)
sorted_pairs