Predicting Breast Cancer with Machine Learning
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Predicting Breast Cancer with Machine Learning

    # Import the modules 
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split, cross_val_score ,GridSearchCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import plot_confusion_matrix, accuracy_score
    from sklearn.datasets import load_breast_cancer

    Loading and checking the data

    # Fetch the breast cancer dataset shipped with scikit-learn
    bc = load_breast_cancer()
    # One column per feature, with the target appended as the last column, "diagnosis"
    data = pd.DataFrame(bc.data, columns=bc.feature_names).assign(diagnosis=bc.target)
    # Preview the first rows
    data.head()
    # Count missing values per column
    data.isna().sum()

    Features and target

    # Define features (every column except the last one, 'diagnosis') and target (column 'diagnosis')
    X = data.iloc[:, :-1].values
    # Keep the target as a flat 1-D array: scikit-learn estimators expect labels of
    # shape (n_samples,) and emit a DataConversionWarning for an (n_samples, 1) column vector.
    y = data["diagnosis"].values

    Logistic Regression or Decision Tree?

    # Candidate classifiers, keyed by the label used on the plot
    models = {
        "Logistic regression": LogisticRegression(),
        "Decision tree classifier": DecisionTreeClassifier(),
    }

    # One array of 5-fold cross-validation scores per candidate, in dictionary order
    results = [cross_val_score(estimator, X, y, cv=5) for estimator in models.values()]

    # Compare the score distributions of the candidates side by side
    plt.boxplot(results, labels=models.keys())

    Model building

    # Model to use
    model = LogisticRegression()
    # Scaler to use
    scaler = StandardScaler()
    # Hold out 30% of the samples for evaluation; the fixed seed makes the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Scale features (target doesn't need to be scaled, since it's binary).
    # The scaler is fitted on the training set only ...
    X_train = scaler.fit_transform(X_train)
    # ... and the same training statistics are applied to the test set. Refitting on the
    # test set would scale it differently from the data the model was trained on and
    # leak test-set information into preprocessing.
    X_test = scaler.transform(X_test)
    # Fit the model; np.ravel flattens the labels in case y is an (n_samples, 1) column vector
    model.fit(X_train, np.ravel(y_train))
    # Predict the target
    y_pred = model.predict(X_test)
    # Compute accuracy on the held-out test set
    model.score(X_test, y_test)
    # Draw confusion matrix
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
    # ConfusionMatrixDisplay.from_estimator is the replacement - confirm the installed version.
    plot_confusion_matrix(model, X_test, y_test)