# Movie Correlation Project
#
# Beta

# Import libraries

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
# Use matplotlib's built-in 'ggplot' style sheet for every figure drawn below.
plt.style.use('ggplot')
# NOTE(review): `figure` is not referenced in the visible part of this file.
from matplotlib.pyplot import figure

# Default (width, height) in inches for figures created after this line.
matplotlib.rcParams['figure.figsize'] = (12,8)

# Load the movie dataset from the working directory

df = pd.read_csv('movies.csv')
df.head()

# Missing-data report: first a per-column flag saying whether any value is
# null, then the fraction of null entries in each column.
null_mask = df.isnull()
print(null_mask.any())
for col in null_mask.columns:
    pct_missing = null_mask[col].mean()
    print(f'{col} - {pct_missing}')

# Data types for our columns

df.dtypes

# Scratch experiment with string replacement.
# Work on an independent copy: `df2 = df` would only create a second name for
# the same DataFrame, so the throwaway 'new' column would end up in `df` and
# in everything derived from it further down.
df2 = df.copy()
df2['new'] = 'x'
# Series.str.replace returns a new Series rather than modifying in place, so
# the result must be assigned back. regex=False treats "x" as a literal.
df2['new'] = df2['new'].str.replace("x", "y", regex=False)

# Fill gaps in the numeric columns with that column's mean.
# numeric_only=True is required: averaging text columns is meaningless, and
# pandas >= 2.0 raises instead of silently skipping them.
df_cleaned = df.fillna(df.mean(numeric_only=True))

# Budget and gross are whole-dollar amounts; cast them to integers now that
# the NaNs (which force a float dtype) have been filled.
df_cleaned['budget'] = df_cleaned['budget'].astype('int64')
df_cleaned['gross'] = df_cleaned['gross'].astype('int64')

#df_cleaned['year'] = pd.to_datetime(df_cleaned['released'], format='%B %d, %Y')

# sort_values(inplace=False) returns a new frame, so keep the result;
# otherwise the sort is a no-op.
df_cleaned = df_cleaned.sort_values(by=['gross'], ascending=True)
df_cleaned

#df_cleaned['company'].drop_duplicates().sort_values(ascending=True)
# drop_duplicates also returns a new frame; assign it so fully duplicated
# rows are actually removed.
df_cleaned = df_cleaned.drop_duplicates()

# Educated guess: budget & company would have high correlation with gross revenue

# Scatter plot with budget on the x-axis and gross earnings on the y-axis;
# the axis labels must match that orientation.
plt.scatter(x=df_cleaned['budget'], y=df_cleaned['gross'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earnings')

plt.show()

# Same relationship with a fitted regression line drawn on top.
sns.regplot(x='budget', y='gross', data=df_cleaned, scatter_kws={"color": "blue", 'alpha':0.3},line_kws={'color': 'red'})
# Show this figure explicitly; otherwise, when run as a script, the next plot
# would be drawn onto the same axes.
plt.show()

# Correlation is only defined for numeric data. Select those columns
# explicitly: pandas >= 2.0 no longer drops text columns silently in corr()
# and raises instead.
numeric_features = df_cleaned.select_dtypes(include=['number', 'bool'])

numeric_features.corr(method='spearman') #pearson, spearman, kendall
# High correlation between gross and (budget, votes)

correlation_matrix = numeric_features.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

df_cleaned['company'].sort_values().unique()

# Look at company
# Encode every text column as integer category codes so it can take part in
# the correlation matrix. Encode a copy: `df_numerized = df_cleaned` would
# alias the same frame and overwrite the original text values in df_cleaned.
df_numerized = df_cleaned.copy()
for col_name in df_numerized.columns:
    column = df_numerized[col_name]
    # Cover both the classic 'object' dtype and pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # Missing values receive the code -1.
        df_numerized[col_name] = column.astype('category').cat.codes

correlation_matrix = df_numerized.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

# Flatten the square correlation matrix into one Series indexed by
# (feature, feature) pairs, then order the pairs from the most negative to
# the most positive correlation.
corr_pairs = correlation_matrix.unstack(level=-1)
sorted_pairs = corr_pairs.sort_values(ascending=True)
sorted_pairs

‌
‌
‌