Skip to content
Linear Regression
  • AI Chat
  • Code
  • Report
  • Spinner

    Make Predictions with Linear Regression

    This recipe shows how to perform linear regression on your data. You can either experiment with the provided Boston housing data (source) or load your own data and adjust input_cols and output_col to match. The regression itself uses the LinearRegression class from the scikit-learn package.

    # Load packages
    import numpy as np 
    import pandas as pd 
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style("darkgrid", rc={"axes.facecolor": ".9"})
    # Read the housing data from its CSV file and preview the first rows
    df = pd.read_csv("housing_data.csv")
    df.head()
    # Show the variable explanation table; widen the column display limit
    # first so long text in its cells is not cut off
    pd.set_option("display.max_colwidth", 100)
    pd.read_csv('variable_explanation.csv', index_col=0)
    # Choose the input (feature) columns and the output (target) column.
    # Edit both lists to match your own data.
    input_cols = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
        'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT',
    ]
    output_col = ['PRICE']
    X = df.loc[:, input_cols]
    y = df.loc[:, output_col]

    # Hold out 30% of the rows as test data; the fixed random_state keeps
    # the split reproducible between runs
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=44
    )
    # Plot the output column against the input columns, splitting the inputs
    # over two figures so each one stays readable
    half = len(input_cols) // 2
    first_inputs, second_inputs = input_cols[:half], input_cols[half:]

    fig1 = sns.pairplot(data=df, x_vars=first_inputs, y_vars=output_col)
    fig2 = sns.pairplot(data=df, x_vars=second_inputs, y_vars=output_col)
    # Helper that collapses a 2D list into a flat one so plotly can use it
    def flatten(l):
        """Return the items of every sub-sequence in ``l`` as one flat list."""
        flat = []
        for sublist in l:
            flat.extend(sublist)
        return flat
    
    # Create the linear regressor and train it on the training split
    # (fit() returns the fitted estimator, so both steps can be chained)
    lin_reg = LinearRegression().fit(X_train, y_train)

    # Flatten the expected test values and the model's predictions
    expected = flatten(y_test.values)
    predicted = flatten(lin_reg.predict(X_test))
    %matplotlib inline
    # Import plotting package
    import plotly.express as px
    
    # Put data to plot in dataframe
    df_plot = pd.DataFrame({'expected':expected, 'predicted':predicted})
    
    # Make scatter plot from data
    fig = px.scatter(
        df_plot, 
        x='expected', 
        y='predicted',
        title='Predicted vs. Actual Values')
    
    # Add straight line indicating perfect model
    fig.add_shape(type="line",
        x0=0, y0=0, x1=50, y1=50,
        line=dict(
            color="Red",
            width=4,
            dash="dot",
        )
    )
    
    # Show figure
    fig.show()
    # Root mean square error (RMS) between the predictions and expected values
    residuals = np.asarray(predicted) - np.asarray(expected)
    error = np.sqrt(np.square(residuals).mean())
    print(f"RMS: {error:.4f} ")

    # R2 score of the predictions against the expected values
    r2 = r2_score(y_true=expected, y_pred=predicted)
    print(f"R2: {round(r2,4)}")