Unsupervised Learning in Python
Run the hidden code cell below to import the data used in this course.
# Import the course packages
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import sklearn
import scipy.stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.manifold import TSNE
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from mpl_toolkits.mplot3d import Axes3D
# Import the course datasets
grains = pd.read_csv('datasets/grains.csv')
fish = pd.read_csv('datasets/fish.csv', header=None)
wine = pd.read_csv('datasets/wine.csv')
eurovision = pd.read_csv('datasets/eurovision-2016.csv')
stocks = pd.read_csv('datasets/company-stock-movements-2010-2015-incl.csv', index_col=0)
digits = pd.read_csv('datasets/lcd-digits.csv', header=None)
BEGINNING OF UNSUPERVISED LEARNING
# NOTE: points and new_points are not defined in this workspace; the sketch below supplies stand-ins
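A minimal sketch to make the snippet below runnable, assuming synthetic 2-D blobs are an acceptable substitute for the missing course arrays; make_blobs and the sample sizes/seeds here are my choice, not course data.
from sklearn.datasets import make_blobs
# Hypothetical stand-ins for the undefined course arrays (assumption, not course data)
points, _ = make_blobs(n_samples=300, centers=3, random_state=42)
new_points, _ = make_blobs(n_samples=100, centers=3, random_state=7)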
# Create a KMeans instance with 3 clusters: model
model = KMeans(n_clusters=3)
# Fit model to points
model.fit(points)
# Determine the cluster labels of new_points: labels
labels = model.predict(new_points)
# Print cluster labels of new_points
print(labels)
# Assign the columns of new_points: xs and ys
xs = new_points[:,0]
ys = new_points[:,1]
# Make a scatter plot of xs and ys, using labels to define the colors
plt.scatter(xs,ys, c=labels, alpha=0.5)
# Assign the cluster centers: centroids
centroids = model.cluster_centers_
# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:,0]
centroids_y = centroids[:,1]
# Make a scatter plot of centroids_x and centroids_y
plt.scatter(centroids_x, centroids_y, marker="D", s=50)
plt.show()
gr_sample = grains.iloc[:, 0:7].values
varieties = list(grains['variety'])
ks = range(1, 6)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters=k)
    # Fit model to samples
    model.fit(gr_sample)
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
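If you want to pick k programmatically rather than by eye, one crude heuristic (a sketch, not part of the course) is the point of maximum curvature in the inertia curve, found via second differences.
# Crude elbow heuristic: the elbow is roughly where the second difference of inertia peaks
second_diff = np.diff(inertias, 2)
best_k = ks[int(np.argmax(second_diff)) + 1]
print('suggested k:', best_k)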
# Create a KMeans model with 3 clusters: model
model = KMeans(n_clusters=3)
# Use fit_predict to fit model and obtain cluster labels: labels
labels = model.fit_predict(gr_sample)
# Create a DataFrame with labels and varieties as columns: df
df = pd.DataFrame({'labels': labels, 'varieties': varieties})
# Create crosstab: ct
ct = pd.crosstab(df['labels'],df['varieties'])
# Display ct
print(ct)
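The crosstab can also be summarized in a single agreement score; scikit-learn's adjusted_rand_score ignores the arbitrary numbering of cluster labels. Using it here is my addition, not course code.
from sklearn.metrics import adjusted_rand_score
# 1.0 = perfect agreement with the true varieties, ~0.0 = random labeling
print(adjusted_rand_score(varieties, labels))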
print(wine.head())
samples = wine.iloc[:, 2:10].values
# Create scaler: scaler
scaler = StandardScaler()
# Create KMeans instance: kmeans
kmeans = KMeans(n_clusters=4)
# Create pipeline: pipeline
pipeline = make_pipeline(scaler, kmeans)
# Fit the pipeline to samples
pipeline.fit(samples)
# Define species
species = wine['class_label'].values
# Calculate the cluster labels: labels
labels = pipeline.predict(samples)
# Create a DataFrame with labels and species as columns: df
df = pd.DataFrame({'labels':labels,'species':species})
# Create crosstab: ct
ct = pd.crosstab(df['labels'], df['species'])
# Display ct
print(ct)
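Another way to judge the clustering without true labels is the silhouette score; pulling the scaled features back out of the pipeline via named_steps is an assumption about how you'd reproduce its input, not course code.
from sklearn.metrics import silhouette_score
# Score the clustering on the scaled features (higher is better, range -1 to 1)
scaled = pipeline.named_steps['standardscaler'].transform(samples)
print(silhouette_score(scaled, labels))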
# Compute the linkage matrix first: mergings is needed by fcluster below
mergings = linkage(samples, method='complete')
# Use fcluster to extract labels at a height of 20: labels
labels = fcluster(mergings, 20, criterion='distance')
# Create a DataFrame with labels and species as columns: df
df = pd.DataFrame({'labels': labels, 'species': species})
# Create crosstab: ctf
ctf = pd.crosstab(df['labels'], df['species'])
# Display ctf
print(ctf)
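fcluster can also cut the same tree into a target number of flat clusters instead of using a distance threshold, via scipy's criterion='maxclust' option; this variant is my addition for comparison.
# Cut the same tree into exactly 3 flat clusters instead of thresholding by height
labels_3 = fcluster(mergings, 3, criterion='maxclust')
print(pd.crosstab(labels_3, species))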
wine.head()
HIERARCHICAL CLUSTERING AND t-SNE (t-distributed stochastic neighbor embedding)
# Use the wine class labels (species) to label the dendrogram leaves
# Calculate the linkage: mergings
mergings = linkage(samples, method='complete')
# Plot the dendrogram, using species as labels
dendrogram(mergings,
           labels=species,
           leaf_rotation=90,
           leaf_font_size=6,
           )
plt.show()
# Repeat with single linkage for comparison
# Calculate the linkage: mergings
mergings = linkage(samples, method='single')
# Plot the dendrogram, using species as labels
dendrogram(mergings,
           labels=species,
           leaf_rotation=90,
           leaf_font_size=6,
           )
plt.show()
# Create a TSNE instance: model
model = TSNE(learning_rate=50)
# Apply fit_transform to samples: tsne_features
tsne_features = model.fit_transform(samples)
# Select the 0th feature: xs
xs = tsne_features[:, 0]
# Select the 1st feature: ys
ys = tsne_features[:, 1]
# Scatter plot, colored by wine class
plt.scatter(xs, ys, c=species, alpha=0.5)
plt.show()
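The original comment mentioned normalized_movements, which comes from the course's stock-movements exercise; a sketch of that setup, using the stocks DataFrame loaded above and scikit-learn's Normalizer, might look like this (labeling points with the CSV's index is an assumption).
from sklearn.preprocessing import Normalizer
# Normalize each company's daily movements to unit norm, then embed with t-SNE
normalized_movements = Normalizer().fit_transform(stocks.values)
tsne = TSNE(learning_rate=50)
movement_features = tsne.fit_transform(normalized_movements)
plt.scatter(movement_features[:, 0], movement_features[:, 1], alpha=0.5)
for name, x, y in zip(stocks.index, movement_features[:, 0], movement_features[:, 1]):
    plt.annotate(name, (x, y), fontsize=6)
plt.show()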
wine.head()
PRINCIPAL COMPONENT ANALYSIS (PCA) FOR DATA DIMENSION REDUCTION
# Principal components = directions of maximal variance in the data
samples2 = wine[['total_phenols','od280']].values
phenols = wine['total_phenols']
od280 = wine['od280']
plt.scatter(phenols, od280)
plt.axis('equal')
plt.title('scatter plot of phenols vs od280')
plt.show()
correlation, pvalue = pearsonr(phenols, od280)
print(correlation)
model = PCA()
pca_features = model.fit_transform(samples2)
xs = pca_features[:, 0]
ys = pca_features[:, 1]
plt.scatter(xs, ys)
plt.axis('equal')
plt.title('PCA-transformed scatter plot of phenols vs od280')
plt.show()
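A quick sanity check that PCA has decorrelated the two features: the Pearson correlation of the transformed columns should be approximately zero.
# PCA features should be (nearly) uncorrelated
correlation, pvalue = pearsonr(xs, ys)
print(correlation)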
# Intrinsic dimension = number of features needed to approximate the dataset
# an essential idea behind dimension reduction
# can be detected with PCA
# Make a scatter plot of the untransformed points
plt.scatter(samples2[:,0], samples2[:,1])
# Create a PCA instance: model
model = PCA()
# Fit model to points
model.fit(samples2)
# Get the mean of the grain samples: mean
mean = model.mean_
# Get the first principal component: first_pc
first_pc = model.components_[0,:]
# Plot first_pc as an arrow, starting at mean
plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)
# Keep axes on same scale
plt.axis('equal')
plt.show()
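The same idea extends to every component; this sketch (an extension of the exercise, not course code) draws both principal components as arrows from the mean.
# Draw each principal component as an arrow starting at the mean of the data
plt.scatter(samples2[:, 0], samples2[:, 1])
for pc in model.components_:
    plt.arrow(mean[0], mean[1], pc[0], pc[1], color='red', width=0.01)
plt.axis('equal')
plt.show()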
# Create scaler: scaler
scaler = StandardScaler()
# Create a PCA instance: pca
pca = PCA()
# Create pipeline: pipeline
pipeline = make_pipeline(scaler,pca)
# Fit the pipeline to 'samples'
pipeline.fit(samples)
# Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()
# This last plot shows that, depending on the variance threshold you set, the wine dataset could have 3, 4, or even 5 intrinsic dimensions
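One common way to make that "how many dimensions?" call explicit is the cumulative explained-variance ratio; the 90% threshold below is an arbitrary choice, not from the course.
# Number of components needed to explain at least 90% of the variance (threshold is arbitrary)
cumulative = np.cumsum(pca.explained_variance_ratio_)
n_dims = int(np.argmax(cumulative >= 0.90)) + 1
print('intrinsic dimension estimate:', n_dims)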