# Workspace: Abd Elaziz Elsayed Mohamed
#
# Diabetes Prediction Using Random Forest Algorithm
#
#Importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this silences every warning for the rest of the script,
# including library deprecation notices; consider narrowing it.
warnings.filterwarnings('ignore')

# Loading the data
diab_df = pd.read_csv("diabetes-dataset.csv")

# Quick look at the raw table: first rows, last rows, dimensions, column
# dtypes and per-column missing-value counts.
print(diab_df.head())

print(diab_df.tail())

print(diab_df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
diab_df.info()

print(diab_df.isnull().sum())
# In these measurement columns a 0 is converted to NaN so that it is counted
# as missing and imputed below.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diab_df[zero_as_missing_cols] = diab_df[zero_as_missing_cols].replace(0, np.nan)
print(diab_df.isnull().sum())

# Per-column histograms, drawn after zeros became NaN but before imputation.
# The returned array of Axes is not printed: its repr is just noise on stdout.
diab_df.hist(figsize=(11, 11), color="#008080")
plt.show()

# Impute: mean for Glucose and BloodPressure, median for the other three.
# The result is assigned back to the column instead of calling
# `diab_df[col].fillna(..., inplace=True)`: that form is chained assignment,
# which pandas deprecates and which no longer updates the parent DataFrame
# under Copy-on-Write.
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the script.
diab_df['Glucose'] = diab_df['Glucose'].fillna(diab_df['Glucose'].mean())
diab_df['BloodPressure'] = diab_df['BloodPressure'].fillna(diab_df['BloodPressure'].mean())
diab_df['SkinThickness'] = diab_df['SkinThickness'].fillna(diab_df['SkinThickness'].median())
diab_df['Insulin'] = diab_df['Insulin'].fillna(diab_df['Insulin'].median())
diab_df['BMI'] = diab_df['BMI'].fillna(diab_df['BMI'].median())
print(diab_df.isnull().sum())

# info() prints its own report and returns None, so it is not wrapped in print().
diab_df.info()


# Bar chart of the target: number of rows for each Outcome value.
plt.figure(figsize=(10,5))
plt.title('Diabetes Plot Yes/No', fontsize=14)
# seaborn 0.13 deprecates passing `palette` without `hue`, so the x variable is
# also assigned to `hue`. `dodge=False` keeps each bar centred on its tick on
# seaborn versions that would otherwise offset hue levels side by side.
count_ax = sns.countplot(x="Outcome", hue="Outcome", data=diab_df,
                         palette=('#23C552','#C52219'), dodge=False)
# The hue only repeats the x axis, so a legend adds nothing; remove it when the
# installed seaborn version draws one.
outcome_legend = count_ax.get_legend()
if outcome_legend is not None:
    outcome_legend.remove()
plt.xlabel("Diabetes (0 = No, 1= Yes)", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# 2x4 grid of box plots: one panel per feature, split by Outcome.
fig, axes = plt.subplots(2, 4, figsize=(18, 12))
fig.suptitle('Diabetes Outcome Distribution WRT All Independent Variables', fontsize=16)

# Panels are filled row by row (axes.flat is row-major), in this column order.
boxplot_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for panel, feature in zip(axes.flat, boxplot_features):
    sns.boxplot(ax=panel, x=diab_df['Outcome'], y=diab_df[feature],
                hue=diab_df['Outcome'], palette=('#23C552','#C52219'))
    # set_title() returns a Text artist; it is not printed, since its repr
    # is only noise on stdout.
    panel.set_title(f"Diabetes Outcome vs {feature}", fontsize=12)
plt.show()
# Pairwise scatter/distribution grid over all columns, coloured by Outcome.
# The returned grid object is not printed: its repr carries no information.
sns.pairplot(diab_df, hue='Outcome', palette=('#23C552','#C52219'))
plt.show()

# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12,10))
sns.heatmap(diab_df.corr(), annot=True, cmap='RdYlGn')
plt.title("Feature Correlation Matrix",fontsize=20)
plt.show()


# Summary statistics and class counts of the cleaned table.
print(diab_df.describe())

print(diab_df['Outcome'].value_counts())

# Features are every column except the target; the target is Outcome.
x = diab_df.drop(['Outcome'],axis=1)
y = diab_df['Outcome']


# Hold out 30% of the rows for testing. The split is done BEFORE scaling so the
# scaler never sees the test rows; with the same random_state the row
# partition is the same as when splitting pre-scaled data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Fit the scaler on the training rows only, then apply that same
# transformation to the test rows. Fitting on the full dataset would leak
# test-set mean/variance into training.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

print(x_train.shape, y_train.shape)

print(x_test.shape, y_test.shape)


# Applying random forest
# max_features='sqrt' replaces the former 'auto': for classifiers 'auto' meant
# sqrt(n_features), and it was deprecated in scikit-learn 1.1 and removed in
# 1.3, where passing it raises an error.
random_forest = RandomForestClassifier(criterion = "gini",
                                       min_samples_leaf = 1,
                                       min_samples_split = 10,
                                       n_estimators=100,
                                       max_features='sqrt',
                                       oob_score=True,
                                       random_state=1,
                                       n_jobs=-1)

# Train on the training split, then predict labels for the held-out split.
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

# Confusion matrix of true vs. predicted labels on the test split. The label
# order is pinned to the classifier's classes_ so the printed matrix and the
# plotted one are the same array with the same row/column order.
confmat1 = confusion_matrix(y_test, y_pred, labels=random_forest.classes_)
print(confmat1)

# Plot the same matrix; the display object returned by plot() is not printed,
# since its repr is only noise on stdout.
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confmat1,
                                    display_labels=random_forest.classes_)
cm.plot(cmap="magma")
plt.show()

# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_test, y_pred))