Movie Correlation Project
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner
    # Import libraries
    
    import pandas as pd
    import seaborn as sns
    import numpy as np
    
    import matplotlib
    import matplotlib.pyplot as plt
    # Use the ggplot look for every chart produced below
    plt.style.use('ggplot')
    from matplotlib.pyplot import figure

    # Default figure size as (width, height) for the plots we will create
    matplotlib.rcParams.update({'figure.figsize': (12, 8)})
    
    # Load the dataset from movies.csv and preview the first rows

    df = pd.read_csv('movies.csv')
    df.head()
    # Missing-data check: first a per-column yes/no, then the fraction per column
    missing_mask = df.isnull()
    print(missing_mask.any())
    for col in df.columns:
        # The mean of a boolean null-mask is the fraction of rows that are null
        pct_missing = missing_mask[col].mean()
        print(f'{col} - {pct_missing}')
    # Show the dtype pandas assigned to each column

    df.dtypes
    # Scratch demo of string replacement on a throwaway column.
    # Work on a copy: `df2 = df` would only bind a second name to the same
    # frame, so the 'new' column would also appear in df and in every frame
    # later derived from it.
    df2 = df.copy()
    df2['new'] = 'x'
    # str.replace returns a new Series, so assign it back to keep the result.
    # regex=False makes the literal match explicit regardless of pandas version.
    df2['new'] = df2['new'].str.replace("x", "y", regex=False)
    df2['new']
    # Fill missing values with the per-column mean, then cast budget and gross
    # to whole numbers (astype('int64') fails if any NaN is left in them).
    # numeric_only=True is required: taking the mean of a frame that contains
    # string columns raises TypeError on pandas >= 2.0. Only the columns that
    # have a mean are filled; missing values elsewhere are left as they are.
    column_means = df.mean(numeric_only=True)
    df_cleaned = df.fillna(column_means)
    df_cleaned = df_cleaned.astype({'budget': 'int64', 'gross': 'int64'})
    
    #df_cleaned['year'] = pd.to_datetime(df_cleaned['released'], format='%B %d, %Y')
    # sort_values and drop_duplicates return new frames instead of modifying
    # df_cleaned, so the results are assigned back; without the assignment the
    # sort and the de-duplication are silently thrown away.
    df_cleaned = df_cleaned.sort_values(by=['gross'], ascending=True)
    #df_cleaned['company'].drop_duplicates().sort_values(ascending=True)
    df_cleaned = df_cleaned.drop_duplicates()
    df_cleaned
    # Educated guess: budget & company would have high correlation with gross revenue

    # Budget is plotted on the x-axis and gross on the y-axis, so the axis
    # labels below follow that same order.
    plt.scatter(x=df_cleaned['budget'], y=df_cleaned['gross'])
    plt.title('Budget vs Gross Earnings')
    plt.xlabel('Budget for Film')
    plt.ylabel('Gross Earnings')

    plt.show()
    # Budget vs gross again, this time with a fitted regression line:
    # translucent blue points, red line
    point_style = {"color": "blue", 'alpha': 0.3}
    fit_line_style = {'color': 'red'}
    sns.regplot(
        data=df_cleaned,
        x='budget',
        y='gross',
        scatter_kws=point_style,
        line_kws=fit_line_style,
    )
    # Correlation is computed over the numeric columns only. Calling corr()
    # directly on a frame that still holds string columns raises on
    # pandas >= 2.0, so the numeric subset is selected explicitly (this also
    # works on older pandas versions).
    numeric_features = df_cleaned.select_dtypes(include='number')
    numeric_features.corr(method='spearman') #pearson, spearman, kendall
    # High correlation between gross and (budget, votes)
    correlation_matrix = numeric_features.corr(method='pearson')
    sns.heatmap(correlation_matrix, annot=True)
    plt.title('Correlation Matrix for Numeric Features')
    plt.xlabel('Movie Features')
    plt.ylabel('Movie Features')
    plt.show()
    # Look at company
    df_cleaned['company'].sort_values().unique()
    # Build an all-numeric version of the data so text columns can take part in
    # the correlation. Copy first: `df_numerized = df_cleaned` would only alias
    # the frame and the encoding below would overwrite df_cleaned's text values.
    df_numerized = df_cleaned.copy()
    for col_name in df_numerized.columns:
        col_dtype = df_numerized[col_name].dtype
        # Encode object columns and, for pandas versions that infer a dedicated
        # string dtype, string columns as well.
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            # Each distinct value becomes an integer code; missing values become -1.
            df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes
    
    # Pearson correlation across every column of the numerized frame, drawn as
    # an annotated heatmap
    correlation_matrix = df_numerized.corr(method='pearson')
    sns.heatmap(data=correlation_matrix, annot=True)
    plt.title('Correlation Matrix for Numeric Features')
    for set_axis_label in (plt.xlabel, plt.ylabel):
        set_axis_label('Movie Features')
    plt.show()
    # Flatten the matrix into (column, column) -> correlation pairs and order
    # them from the lowest to the highest value
    corr_pairs = correlation_matrix.unstack()
    sorted_pairs = corr_pairs.sort_values(ascending=True)
    sorted_pairs