Project: Predicting Temperature in London

    As the climate changes, predicting the weather becomes ever more important for businesses. Because the weather depends on many different factors, you will want to run many experiments to determine which modelling approach predicts it best. In this project, you will run experiments for several regression models that predict the mean temperature, using a combination of scikit-learn and MLflow.
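
    If you have not used MLflow tracking before, the core pattern is small: start a run, fit a model, and log its parameters and metrics. Here is a minimal sketch of that pattern (the toy data, model, and logged values are illustrative placeholders, not part of this project's pipeline):

    import mlflow
    from sklearn.linear_model import LinearRegression

    # Toy example, purely to illustrate the MLflow tracking calls
    with mlflow.start_run(run_name="example_run"):
        model = LinearRegression().fit([[0], [1], [2]], [1.0, 2.1, 2.9])
        mlflow.log_param("fit_intercept", True)  # hyperparameters go in params
        mlflow.log_metric("rmse", 0.1)           # evaluation scores go in metrics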

    You will be working with data stored in london_weather.csv, which contains the following columns:

    • date - recorded date of measurement, stored as an integer in YYYYMMDD format - (int)
    • cloud_cover - cloud cover measurement in oktas - (float)
    • sunshine - sunshine measurement in hours (hrs) - (float)
    • global_radiation - irradiance measurement in Watt per square meter (W/m2) - (float)
    • max_temp - maximum temperature recorded in degrees Celsius (°C) - (float)
    • mean_temp - mean temperature in degrees Celsius (°C) - (float)
    • min_temp - minimum temperature recorded in degrees Celsius (°C) - (float)
    • precipitation - precipitation measurement in millimeters (mm) - (float)
    • pressure - pressure measurement in Pascals (Pa) - (float)
    • snow_depth - snow depth measurement in centimeters (cm) - (float)
    # Run this cell to import the modules you require
    import pandas as pd
    import numpy as np
    import mlflow
    import mlflow.sklearn
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    
    # Import the data and perform exploratory data analysis
    weather = pd.read_csv('london_weather.csv')
    weather.info()
    
    weather['date'] = pd.to_datetime(weather['date'], format='%Y%m%d')
    weather['year'] = weather['date'].dt.year
    weather['month'] = weather['date'].dt.month
    weather_metrics = ['cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp', 'min_temp', 'precipitation', 'pressure', 'snow_depth']
    weather_per_month = weather.groupby(['year', 'month'], as_index=False)[weather_metrics].mean()
    
    sns.lineplot(x="year", y="mean_temp", data=weather_per_month, errorbar=None)
    plt.show()
    sns.barplot(x='month', y='precipitation', data=weather)
    plt.show()
    # Restrict the correlation matrix to numeric columns, since 'date' is now a datetime
    sns.heatmap(weather.corr(numeric_only=True), annot=True)
    plt.show()
    
    # Choose features, define the target, and drop null values
    feature_selection = ['month', 'cloud_cover', 'sunshine', 'precipitation', 'pressure', 'global_radiation']
    target_var = 'mean_temp'
    weather = weather.dropna(subset=[target_var])
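
    Only rows missing the target are dropped here; missing feature values are kept and handled later by the imputer. You can quickly check how many feature values remain missing:

    # Count remaining missing values in the selected features (imputed later)
    print(weather[feature_selection].isna().sum())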
    
    # Preprocess the data: split, impute, and scale the features
    def preprocess_df(df, feature_selection, target_var):
        """
        Split dataframe into X and y, and train and test consecutively. Then impute and scale both train and test features. Returns the train and test sets
        """
        # Subset the data
        X = df[feature_selection]    
        y = df[target_var]
        
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
        
        # Impute missing values
        imputer = SimpleImputer(strategy="mean")
        # Fit on the training data
        X_train = imputer.fit_transform(X_train)
        # Transform on the test data
        X_test = imputer.transform(X_test)
        
        # Scale the data
        scaler = StandardScaler()
        # Fit on the training data
        X_train = scaler.fit_transform(X_train)
        # Transform on the test data
        X_test = scaler.transform(X_test)
        
        return X_train, X_test, y_train, y_test
    
    X_train, X_test, y_train, y_test = preprocess_df(weather, feature_selection, target_var)
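
    As a quick sanity check, you can confirm that the shapes line up and that the scaling behaved as expected; after fitting StandardScaler on the train split, the train features should have roughly zero mean and unit variance:

    # Optional sanity check on the preprocessed arrays
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print(X_train.mean(axis=0).round(2))  # ~0 for every scaled feature
    print(X_train.std(axis=0).round(2))   # ~1 for every scaled feature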
    
    # Predict on the test set and evaluate performance
    def predict_and_evaluate(model, x_test, y_test):
        """
        Predict on the test set, then calculate and return the root mean squared error (RMSE).
        """
        y_pred = model.predict(x_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))    
        return rmse
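
    To give the RMSE values below some context, you can compare each model against a naive baseline that always predicts the training-set mean. DummyRegressor is not part of this project's model lineup; it is only an illustrative yardstick:

    from sklearn.dummy import DummyRegressor

    # Naive baseline: always predict the mean of y_train
    baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
    print(f"Baseline RMSE: {predict_and_evaluate(baseline, X_test, y_test):.2f}")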
      
    # Create the experiment (reuse it if it already exists, so this cell can be rerun)
    EXPERIMENT_NAME = "weather_pred"
    existing = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    EXPERIMENT_ID = existing.experiment_id if existing else mlflow.create_experiment(EXPERIMENT_NAME)
    
    # Predict, evaluate, and log the parameters and metrics of your models
    for idx, depth in enumerate([1, 2, 5, 10, 20]):
        parameters = {
            'max_depth': depth
        }    
        run_name = f"run_{idx}"
        with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=run_name):
            # Create models
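            # Note: LinearRegression has no max_depth, so its RMSE is the same in every run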
            lin_reg = LinearRegression().fit(X_train, y_train)
            tree_reg = DecisionTreeRegressor(random_state=42, max_depth=depth).fit(X_train, y_train)
            forest_reg = RandomForestRegressor(random_state=42, max_depth=depth).fit(X_train, y_train)
            # Log models
            mlflow.sklearn.log_model(lin_reg, "lin_reg")
            mlflow.sklearn.log_model(tree_reg, "tree_reg")
            mlflow.sklearn.log_model(forest_reg, "forest_reg")
            # Evaluate performance
            lin_reg_rmse = predict_and_evaluate(lin_reg, X_test, y_test)
            tree_reg_rmse = predict_and_evaluate(tree_reg, X_test, y_test)
            forest_reg_rmse = predict_and_evaluate(forest_reg, X_test, y_test)
            # Log the hyperparameters and performance metrics
            mlflow.log_params(parameters)
            mlflow.log_metric("rmse_lr", lin_reg_rmse)
            mlflow.log_metric("rmse_tr", tree_reg_rmse)
            mlflow.log_metric("rmse_fr", forest_reg_rmse)
            
    # Search the runs for the experiment's results
    experiment_results = mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
    experiment_results
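
    search_runs returns a pandas DataFrame in which logged values appear as metrics.* and params.* columns, so you can sort it to find the best configuration. For example, using the random forest metric logged above:

    # Best run by random forest RMSE (lower is better)
    best_run = experiment_results.sort_values("metrics.rmse_fr").iloc[0]
    print(best_run["params.max_depth"], best_run["metrics.rmse_fr"])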