Unsupervised Learning in Python
👋 Welcome to your new workspace! Here, you can experiment with the data you used in Unsupervised Learning in Python and practice your newly learned skills with a challenge. You can find out more about DataCamp Workspace here.
Below is a code cell that imports the course packages and loads in the course datasets as pandas DataFrames.
🏃To execute the code, click inside the cell to select it and click "Run" or the ► icon. You can also use Shift-Enter to run a selected cell and automatically switch to the next cell.
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import scipy.stats
# Load the course datasets, one pandas DataFrame per CSV file.
# Read with pandas' default header handling:
grains = pd.read_csv('datasets/grains.csv')
wine = pd.read_csv('datasets/wine.csv')
eurovision = pd.read_csv('datasets/eurovision-2016.csv')
# Same, but the first column of the file becomes the row index.
stocks = pd.read_csv(
    'datasets/company-stock-movements-2010-2015-incl.csv',
    index_col=0,
)
# Read with header=None, so the columns get integer names (0, 1, 2, ...).
fish = pd.read_csv('datasets/fish.csv', header=None)
digits = pd.read_csv('datasets/lcd-digits.csv', header=None)
# Show the grains DataFrame as the cell output.
grains
# Cluster the grain samples with k-means.
# The number of clusters is set to the number of varieties (per the original
# note); both label columns are removed so only the remaining feature
# columns are used for clustering.
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import seaborn as sns
# Optional exploration: sns.histplot(grains['variety'])
data = grains.drop(columns=['variety_number', 'variety']).to_numpy()
print(data)
# Optional exploration: sns.pairplot(data = grains)
# fit() returns the estimator itself, so construction and fitting chain.
model = KMeans(n_clusters=3).fit(data)
# Cluster index assigned to each sample.
labels = model.predict(data)
# Centroid coordinates, one row per cluster.
print(model.cluster_centers_)
print(labels)
# Distinct cluster indices that were actually assigned.
print(np.unique(labels))
Measuring the quality of the clustering
- Can check the correspondence to labels, if known
- Measure quality of clustering
- Use cross tabulation to check the label correspondence
# Check how the numeric variety codes line up with the variety names.
import pandas as pd
grains = pd.read_csv('datasets/grains.csv')
# Numeric variety code and variety name of every sample.
labels = grains['variety_number'].values
kind = grains['variety'].values
df = pd.DataFrame({'labels': labels, 'variety': kind})
print(df.head())
# Both crosstab inputs come from df: the 'labels' column is created in df
# above, and nothing in this file adds such a column to grains.
ct = pd.crosstab(df['labels'], df['variety'])
print(ct)
# Cluster the grain features and compare the clusters with the varieties.
from sklearn.cluster import KMeans
# Feature matrix: every column except the two label columns.
data_grain = grains.drop(columns=['variety', 'variety_number']).to_numpy()
# fit() returns the estimator itself, so construction and fitting chain.
model = KMeans(n_clusters=3).fit(data_grain)
prediction = model.predict(data_grain)
# Rows: predicted cluster index; columns: variety name.
ct2 = pd.crosstab(index=prediction, columns=grains['variety'])
print(ct2)
# Shorter version of the same comparison: fit_predict() fits the model and
# returns the cluster label of every sample in a single call.
model = KMeans(n_clusters=3)
labels = model.fit_predict(data_grain)
# Pair each sample's cluster label with its variety name.
df = pd.DataFrame({'labels': labels, 'varieties': grains['variety']})
# Count the samples for every (cluster label, variety) combination.
ct = pd.crosstab(index=df['labels'], columns=df['varieties'])
print(ct)
Most datasets come without cluster labels, so the quality of a clustering has to be measured using only the samples and their cluster labels. A good clustering has tight clusters. Inertia measures clustering quality - lower is better. Inertia measures how far the samples are from the centroid of their cluster (the sum of squared distances). k-means attempts to minimize the inertia when choosing clusters.
A good number of clusters gives low inertia without using too many clusters: choose the "elbow" of the inertia plot, where the inertia starts to decrease more slowly.
# Inertia plot for the grains data: k-means inertia for 1..10 clusters.
import pandas as pd
grains = pd.read_csv('datasets/grains.csv')
# Feature matrix: every column except the two label columns.
grains_data = grains.drop(['variety_number', 'variety'], axis=1).values
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Candidate cluster counts, shared by the loop, the x values and the ticks
# so the three can never drift apart.
cluster_range = range(1, 11)
inertia = []
for clusters in cluster_range:
    # A fresh model is fitted for every candidate cluster count and its
    # inertia recorded.
    model = KMeans(n_clusters=clusters)
    model.fit(grains_data)
    inertia.append(model.inertia_)
plt.plot(cluster_range, inertia)
plt.xlabel('n_clusters')
plt.ylabel('Inertia')
plt.xticks(cluster_range)
plt.title('Best number of clusters is 3')
plt.show()
# Look for a good number of clusters for the fish data via inertia_.
fish = pd.read_csv('datasets/fish.csv', header=None)
print(fish.head())
# Column 0 is split off as the labels; the remaining columns are the features.
labels = fish.iloc[:, 0]
fish_feat = fish.drop(columns=[0]).to_numpy()
print(labels)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
inertias = []
for k in range(1, 11):
    # fit() returns the estimator, so the fitted model is bound directly.
    model = KMeans(n_clusters=k).fit(fish_feat)
    inertias.append(model.inertia_)
# Inertia as a function of the number of clusters.
plt.plot(range(1, 11), inertias)
plt.show()
# Cluster the fish measurements without any preprocessing.
import pandas as pd
from sklearn.cluster import KMeans
fish = pd.read_csv('datasets/fish.csv', header=None)
# Column 0 is split off as the labels; the remaining columns are the features.
labels = fish.iloc[:, 0]
fish_feat = fish.drop(columns=[0]).to_numpy()
model = KMeans(n_clusters=4)
# Fit and get the cluster index of every sample in one call.
predict = model.fit_predict(fish_feat)
df = pd.DataFrame({'labels': labels, 'prediction': predict})
print(df)
# Rows: predicted cluster index; columns: the label from column 0.
print(pd.crosstab(index=df['prediction'], columns=df['labels']))
# Note: plain k-means on the raw features does not cluster the fish correctly.
Transforming features for better clustering. Clustering is affected by the variance of the features: if the variances are very different, the features with large variance dominate and the clustering suffers. To give every feature a chance, the data should be transformed so that all features have equal variance. StandardScaler transforms every feature to have mean 0 and variance 1.
# Cluster the fish measurements after standardizing every feature.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
fish = pd.read_csv('datasets/fish.csv', header=None)
# Column 0 is split off as the labels; the remaining columns are the features.
labels = fish.iloc[:, 0]
fish_feat = fish.drop(columns=[0]).to_numpy()
scaler = StandardScaler()
# Learn the scaling parameters and apply them to the same data in one call.
fish_scaled = scaler.fit_transform(fish_feat)
# fit() returns the estimator itself, so construction and fitting chain.
model = KMeans(n_clusters=4).fit(fish_scaled)
predictions = model.predict(fish_scaled)
df = pd.DataFrame({'labels': labels, 'predictions': predictions})
# Rows: the label from column 0; columns: predicted cluster index.
pd.crosstab(index=df['labels'], columns=df['predictions'])
# Scaled data appears to work much better for clustering the fish.
The same procedure (scaling followed by k-means) can be written more compactly using a pipeline.
# Scale-then-cluster for the fish data, expressed as a single pipeline.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
scaler = StandardScaler()
kmeans = KMeans(n_clusters=4)
# Prepare the data: column 0 is split off as the labels, the remaining
# columns are the features.
fish = pd.read_csv('datasets/fish.csv', header=None)
fish_feat = fish.drop([0], axis=1).values
labels = fish.iloc[:, 0]
# Build the pipeline: the scaler runs first, then k-means on its output.
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(scaler, kmeans)
pipeline.fit(fish_feat)
prediction = pipeline.predict(fish_feat)
# Tabulate the pipeline's own output (`prediction`), so the table reflects
# this cell's result and the cell does not depend on a variable defined
# in another cell.
df = pd.DataFrame({'labels': labels, 'predictions': prediction})
pd.crosstab(df['labels'], df['predictions'])