Supervised Learning with scikit-learn

    Run the hidden code cell below to import the data used in this course.

    # Importing pandas
    import pandas as pd
    
    # Importing the course datasets 
    diabetes = pd.read_csv('datasets/diabetes_clean.csv')
    music = pd.read_csv('datasets/music_clean.csv')
    advertising = pd.read_csv('datasets/advertising_and_sales_clean.csv')
    telecom = pd.read_csv("datasets/telecom_churn_clean.csv")


    Supervised learning

    • classification: discrete target variables (with only two possible values, the classification is called binary)
    • regression: continuous target variables
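
    As a quick illustration with the datasets loaded above: the churn column of the telecom data is a discrete (binary) target, while the sales column of the advertising data is continuous (column names as used later in these notes):

    # Discrete target -> classification (binary: churn is 0/1)
    print(telecom["churn"].unique())
    
    # Continuous target -> regression
    print(advertising["sales"].head())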

    Naming conventions

    • feature, also called predictor variable or independent variable
    • target variable, also called dependent variable or response variable
    • Requirements
      • no missing values
      • data in numeric format
      • data stored in a pandas DataFrame or NumPy array
    • Perform Exploratory Data Analysis (EDA) first to confirm these requirements (see the sketch below)
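
    A minimal sketch of those checks on the telecom data loaded above (assuming the cleaned course dataset):

    # Check the modelling requirements on the telecom churn data
    print(telecom.isna().sum().sum())  # no missing values: should print 0
    print(telecom.dtypes.unique())     # every column should have a numeric dtype
    print(type(telecom))               # data is stored in a pandas DataFrame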

    Classifying labels of unseen data

    1. Build a model
    2. Model learns from the labeled data (training data) we pass to it
    3. Pass unlabeled data to the model as input
    4. Model predicts the label of the unseen data

    scikit-learn requires the features to be in a 2D array where each column is a feature and each row is a distinct observation. Likewise, the target variable must be in a one-dimensional array. Moreover, both arrays must contain the same number of observations.
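
    A minimal sketch of the expected shapes (toy values, hypothetical names):

    import numpy as np
    
    X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # shape (3, 2): 3 observations, 2 features
    y_demo = np.array([0, 1, 0])                             # shape (3,): one label per observation
    print(X_demo.shape, y_demo.shape)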

    # Import NumPy and KNeighborsClassifier
    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier
    
    # churn_df is the telecom churn dataset loaded above
    churn_df = telecom
    
    # Create arrays for the features and the target variable
    y = churn_df["churn"].values
    X = churn_df[["account_length", "customer_service_calls"]].values
    
    # Create a KNN classifier with 6 neighbors
    knn = KNeighborsClassifier(n_neighbors=6)
    
    # Fit the classifier to the data
    knn.fit(X, y)
    
    X_new = np.array([[30.0, 17.5],
                      [107.0, 24.1],
                      [213.0, 10.9]])
    
    # Predict the labels for the X_new
    y_pred = knn.predict(X_new)
    
    # Print the predictions for X_new
    print("Predictions: {}".format(y_pred)) 
    # Import the module
    from sklearn.model_selection import train_test_split
    
    X = churn_df.drop("churn", axis=1).values
    y = churn_df["churn"].values
    
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    knn = KNeighborsClassifier(n_neighbors=5)
    
    # Fit the classifier to the training data
    knn.fit(X_train, y_train)
    
    # Print the accuracy
    print(knn.score(X_test, y_test))
    
    # Create neighbors
    neighbors = np.arange(1, 13)
    train_accuracies = {}
    test_accuracies = {}
    
    for neighbor in neighbors:
    
        # Set up a KNN classifier
        knn = KNeighborsClassifier(n_neighbors=neighbor)
    
        # Fit the model
        knn.fit(X_train, y_train)
    
        # Compute accuracy on the training and test sets
        train_accuracies[neighbor] = knn.score(X_train, y_train)
        test_accuracies[neighbor] = knn.score(X_test, y_test)
    
    print(neighbors, '\n', train_accuracies, '\n', test_accuracies)
    
    # Import matplotlib.pyplot
    import matplotlib.pyplot as plt
    
    # Add a title
    plt.title("KNN: Varying Number of Neighbors")
    
    # Plot training accuracies
    plt.plot(neighbors, train_accuracies.values(), label="Training Accuracy")
    
    # Plot test accuracies
    plt.plot(neighbors, test_accuracies.values(), label="Testing Accuracy")
    
    plt.legend()
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Accuracy")
    
    # Display the plot
    plt.show()
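
    To read the best value of k off the curves programmatically, one convenient option (not from the course, just a convenience):

    # Pick the k with the highest test accuracy
    best_k = max(test_accuracies, key=test_accuracies.get)
    print("Best n_neighbors: {}".format(best_k))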
    import numpy as np
    
    # sales_df is the advertising dataset loaded above
    sales_df = advertising
    
    # Create X from the radio column's values
    X = sales_df["radio"].values
    
    # Create y from the sales column's values
    y = sales_df["sales"].values
    
    # Reshape X into a 2D array
    X = X.reshape(-1,1)
    
    # Check the shape of the features and targets
    print(X.shape, y.shape)
    
    # Import LinearRegression
    from sklearn.linear_model import LinearRegression
    
    # Create the model
    reg = LinearRegression()
    
    # Fit the model to the data
    reg.fit(X,y)
    
    # Make predictions
    predictions = reg.predict(X)
    
    print(predictions[:5])
    
    # Import matplotlib.pyplot
    import matplotlib.pyplot as plt
    
    # Create scatter plot
    plt.scatter(X, y, color="blue")
    
    # Create line plot
    plt.plot(X, predictions, color="red")
    plt.xlabel("Radio Expenditure ($)")
    plt.ylabel("Sales ($)")
    
    # Display the plot
    plt.show()
    
    # Create X and y arrays
    X = sales_df.drop("sales", axis=1).values
    y = sales_df["sales"].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Instantiate the model
    reg = LinearRegression()
    
    # Fit the model to the data
    reg.fit(X_train, y_train)
    
    # Make predictions
    y_pred = reg.predict(X_test)
    print("Predictions: {}, Actual Values: {}".format(y_pred[:2], y_test[:2]))
    
    # Import mean_squared_error
    from sklearn.metrics import mean_squared_error
    
    # Compute R-squared
    r_squared = reg.score(X_test, y_test)
    
    # Compute RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Print the metrics
    print("R^2: {}".format(r_squared))
    print("RMSE: {}".format(rmse))
    # Import the necessary modules
    from sklearn.model_selection import cross_val_score, KFold
    
    # Create a KFold object
    kf = KFold(n_splits=6, shuffle=True, random_state=5)
    
    reg = LinearRegression()
    
    # Compute 6-fold cross-validation scores
    cv_scores = cross_val_score(reg, X, y, cv=kf)
    
    # Print scores
    print(cv_scores)
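
    The individual fold scores are typically summarised with their mean, standard deviation, and a 95% interval (a short sketch in the spirit of the course):

    # Summarise the cross-validation performance
    print(np.mean(cv_scores), np.std(cv_scores))
    print(np.quantile(cv_scores, [0.025, 0.975]))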
    
    # Import Ridge
    from sklearn.linear_model import Ridge
    alphas = [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
    ridge_scores = []
    for alpha in alphas:
      
      # Create a Ridge regression model
      ridge = Ridge(alpha=alpha)
      
      # Fit the data
      ridge.fit(X_train, y_train)
      
      # Obtain R-squared
      score = ridge.score(X_test, y_test)
      ridge_scores.append(score)
    print(ridge_scores)
    
    # Import Lasso
    from sklearn.linear_model import Lasso
    
    # Instantiate a lasso regression model
    lasso = Lasso(alpha=0.3)
    
    # Fit the model to the data
    lasso.fit(X, y)
    
    # Retrieve and print the coefficients (the model is already fit)
    lasso_coef = lasso.coef_
    print(lasso_coef)
    
    # sales_columns holds the feature names (defined here so the cell runs standalone)
    sales_columns = sales_df.drop("sales", axis=1).columns
    plt.bar(sales_columns, lasso_coef)
    plt.xticks(rotation=45)
    plt.show()
    # Import confusion matrix
    from sklearn.metrics import classification_report, confusion_matrix
    
    knn = KNeighborsClassifier(n_neighbors=6)
    
    # Fit the model to the training data
    knn.fit(X_train, y_train)
    
    # Predict the labels of the test data: y_pred
    y_pred = knn.predict(X_test)
    
    # Generate the confusion matrix and classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    
    # Import LogisticRegression
    from sklearn.linear_model import LogisticRegression
    
    # Instantiate the model
    logreg = LogisticRegression()
    
    # Fit the model
    logreg.fit(X_train, y_train)
    
    # Predict probabilities
    y_pred_probs = logreg.predict_proba(X_test)[:, 1]
    
    print(y_pred_probs[:10])
    
    # Import roc_curve
    from sklearn.metrics import roc_curve
    
    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
    
    plt.plot([0, 1], [0, 1], 'k--')
    
    # Plot tpr against fpr
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve for Diabetes Prediction')
    plt.show()
    
    # Import roc_auc_score
    from sklearn.metrics import roc_auc_score
    
    # Calculate roc_auc_score
    print(roc_auc_score(y_test, y_pred_probs))
    
    # Calculate the confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # Calculate the classification report
    print(classification_report(y_test, y_pred))
    
    
    # Import GridSearchCV
    from sklearn.model_selection import GridSearchCV
    
    # Set up the parameter grid
    param_grid = {"alpha": np.linspace(0.00001, 1, 20)}
    
    # Instantiate lasso_cv
    lasso_cv = GridSearchCV(lasso, param_grid, cv=kf)
    
    # Fit to the training data
    lasso_cv.fit(X_train, y_train)
    print("Tuned lasso paramaters: {}".format(lasso_cv.best_params_))
    print("Tuned lasso score: {}".format(lasso_cv.best_score_))
    
    # Import RandomizedSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    
    # Create the parameter space
    # (note: the default "lbfgs" solver only supports the "l2" penalty;
    # pass solver="liblinear" to LogisticRegression if "l1" should be searchable)
    params = {"penalty": ["l1", "l2"],
              "tol": np.linspace(0.0001, 1.0, 50),
              "C": np.linspace(0.1, 1.0, 50),
              "class_weight": ["balanced", {0: 0.8, 1: 0.2}]}
    
    # Instantiate the RandomizedSearchCV object
    logreg_cv = RandomizedSearchCV(logreg, params, cv=kf)
    
    # Fit the data to the model
    logreg_cv.fit(X_train, y_train)
    
    # Print the tuned parameters and score
    print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
    print("Tuned Logistic Regression Best Accuracy Score: {}".format(logreg_cv.best_score_))