Beta
Predicting Breast Cancer with Machine Learning
# Import the modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score ,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.datasets import load_breast_cancer
Loading and checking the data
# Fetch the scikit-learn breast cancer dataset and wrap it in a DataFrame:
# one column per feature, plus the target values under "diagnosis".
bc = load_breast_cancer()
data = pd.DataFrame(data=bc.data, columns=bc.feature_names).assign(diagnosis=bc.target)
# Preview the first rows
data.head()
# Count the missing values in each column
data.isna().sum()
Features and target
# Features: every column except the target column 'diagnosis'.
# Selected by name so the split does not depend on column order.
X = data.drop(columns="diagnosis").to_numpy()
# Target: kept 1-D, shape (n_samples,). scikit-learn estimators expect a 1-D y
# and emit a DataConversionWarning when given an (n_samples, 1) column vector.
y = data["diagnosis"].to_numpy()
Logistic Regression or Decision Tree?
# Candidate classifiers to compare; the keys double as the boxplot tick labels
models = {
    "Logistic regression": LogisticRegression(),
    "Decision tree classifier": DecisionTreeClassifier(),
}
# 5-fold cross-validation scores: one array of scores per model, in dict order
results = [cross_val_score(model, X, y, cv=5) for model in models.values()]
# One box per model; pass a concrete list of labels rather than a dict view
plt.boxplot(results, labels=list(models))
Model building
# Model to use
model = LogisticRegression()
# Scaler to use
scaler = StandardScaler()
# Hold out 30% of the samples for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Scale features (the target is left untouched).
# The scaler is fitted on the training split ONLY and then applied to the test
# split: re-fitting on the test data would leak test statistics into the
# evaluation and scale the two splits inconsistently.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit the model
model.fit(X_train, y_train)
# Predict the target for the held-out samples
y_pred = model.predict(X_test)
# Compute and report accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.3f}")
# Draw confusion matrix
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator(model, X_test, y_test).
plot_confusion_matrix(model, X_test, y_test)