Matheus Cerqueira/

Supervised Learning with scikit-learn


Supervised Learning with scikit-learn

Run the hidden code cell below to import the data used in this course.

# Importing pandas
import pandas as pd

# Load the four cleaned course datasets, in a fixed order, from the
# local "datasets" directory.
diabetes, music, advertising, telecom = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/diabetes_clean.csv",
        "datasets/music_clean.csv",
        "datasets/advertising_and_sales_clean.csv",
        "datasets/telecom_churn_clean.csv",
    )
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Single-feature regression: predict sales from radio expenditure.
# reshape(-1, 1) turns the 1-D column into the 2-D (n_samples, 1) layout
# that the estimator is given as its feature matrix.
X = advertising["radio"].to_numpy().reshape(-1, 1)
y = advertising["sales"].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Instantiate the model
reg = LinearRegression()

# Fit the model to the training data. Without this step the estimator is
# unfitted and the predict() call below raises.
reg.fit(X_train, y_train)

# Make predictions
y_pred = reg.predict(X_test)
print("Predictions: {}, Actual Values: {}".format(y_pred[:2], y_test[:2]))
from matplotlib import pyplot as plt

# Observed test-set points.
plt.scatter(X_test, y_test, color="blue")

# Model predictions for the same inputs, drawn as a line over the scatter.
plt.plot(X_test, y_pred, color="red")
plt.xlabel("Radio Expenditure ($)")
plt.ylabel("Sales ($)")

# Display the plot
plt.show()
from sklearn.metrics import mean_squared_error

# Compute R-squared
r_squared = reg.score(X_test, y_test)

# Compute RMSE as the square root of the mean squared error. The
# `squared=False` shortcut is deprecated in recent scikit-learn releases
# and later removed, so take the root explicitly to stay
# version-independent.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# Print the metrics
print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))
from sklearn.model_selection import KFold, cross_val_score

# 6-fold cross-validation with shuffling; the fixed seed makes the fold
# assignment reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

# One score per fold, computed on the full X / y with the regressor's
# default scoring.
cv_scores = cross_val_score(reg, X, y, cv=kf)

# Print scores
print(cv_scores)


High precision = lower false positive rate

High Recall = lower false negative rate

# Import confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier

# Features are every column except the "diabetes" label.
X = diabetes.drop("diabetes", axis=1)
y = diabetes["diabetes"].to_numpy()

# Hold out 25% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

knn = KNeighborsClassifier(n_neighbors=6)

# Fit the model to the training data. Without this step the classifier
# is unfitted and the predict() call below raises.
knn.fit(X_train, y_train)

# Predict the labels of the test data: y_pred
y_pred = knn.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression and ROC

Logistic regression outputs probabilities

# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Features are every column except the "churn" label.
X = telecom.drop("churn", axis=1)
y = telecom["churn"].to_numpy()

# Hold out 25% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Instantiate the model
logreg = LogisticRegression()

# Fit the model. Without this step the estimator is unfitted and
# predict_proba() below raises.
logreg.fit(X_train, y_train)

# Predict probabilities; keep only the second column of the output.
# NOTE(review): presumably that column is the positive (churned) class -
# confirm against logreg.classes_.
y_pred_probs = logreg.predict_proba(X_test)[:, 1]


ROC curve shows how the true positive rate and false positive rate vary as the decision threshold changes

# Import roc_curve
from sklearn.metrics import roc_curve

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')

# Plot tpr against fpr
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The labels and probabilities plotted here come from the telecom churn
# model, so the title names churn rather than diabetes.
plt.title('ROC Curve for Churn Prediction')

# Display the plot
plt.show()

  • AI Chat
  • Code