    Unsupervised Learning in Python

    Run the hidden code cell below to import the data used in this course.

    # Import the course packages
    import pandas as pd
    import numpy as np
    from numpy import random
    import matplotlib.pyplot as plt
    import sklearn
    import scipy.stats 
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline
    from scipy.cluster.hierarchy import linkage, dendrogram
    from scipy.cluster.hierarchy import fcluster
    from sklearn.manifold import TSNE
    from scipy.stats import pearsonr
    from sklearn.decomposition import PCA
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from mpl_toolkits.mplot3d import Axes3D
    
    # Import the course datasets 
    grains = pd.read_csv('datasets/grains.csv')
    fish = pd.read_csv('datasets/fish.csv', header=None)
    wine = pd.read_csv('datasets/wine.csv')
    eurovision = pd.read_csv('datasets/eurovision-2016.csv')
    stocks = pd.read_csv('datasets/company-stock-movements-2010-2015-incl.csv', index_col=0)
    digits = pd.read_csv('datasets/lcd-digits.csv', header=None)

    K-MEANS CLUSTERING

    # points and new_points were supplied by the course environment and are not
    # among the datasets above; as a stand-in, generate comparable synthetic
    # 2-D data with make_blobs and split it (an assumption, not the course data)
    from sklearn.datasets import make_blobs
    X_blobs, _ = make_blobs(n_samples=400, centers=3, random_state=42)
    points, new_points = X_blobs[:300], X_blobs[300:]
    
    # Create a KMeans instance with 3 clusters: model
    model = KMeans(n_clusters=3)
    
    # Fit model to points
    model.fit(points)
    
    # Determine the cluster labels of new_points: labels
    labels = model.predict(new_points)
    
    # Print cluster labels of new_points
    print(labels)
    
    # Assign the columns of new_points: xs and ys
    xs = new_points[:,0]
    ys = new_points[:,1]
    
    # Make a scatter plot of xs and ys, using labels to define the colors
    plt.scatter(xs, ys, c=labels, alpha=0.5)
    
    # Assign the cluster centers: centroids
    centroids = model.cluster_centers_
    
    # Assign the columns of centroids: centroids_x, centroids_y
    centroids_x = centroids[:,0]
    centroids_y = centroids[:,1]
    
    # Make a scatter plot of centroids_x and centroids_y
    plt.scatter(centroids_x, centroids_y, marker="D", s=50)
    plt.show()
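    
    # KMeans keeps its fitted centroids, so predict() can assign cluster
    # labels to data the model was never fitted on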
    
    
    
    gr_sample = grains.iloc[:, 0:7].values
    varieties = list(grains['variety'])
    
    ks = range(1, 6)
    inertias = []
    
    for k in ks:
        # Create a KMeans instance with k clusters: model
        model = KMeans(n_clusters=k)
        
        # Fit model to samples
        model.fit(gr_sample)
        
        # Append the inertia to the list of inertias
        inertias.append(model.inertia_)
        
    # Plot ks vs inertias
    plt.plot(ks, inertias, '-o')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
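    
    # Choose k at the "elbow", where inertia stops falling sharply; here that
    # suggests k = 3, matching the three grain varieties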
    
    # Create a KMeans model with 3 clusters: model
    model = KMeans(n_clusters=3)
    
    # Use fit_predict to fit model and obtain cluster labels: labels
    labels = model.fit_predict(gr_sample)
    
    # Create a DataFrame with labels and varieties as columns: df
    df = pd.DataFrame({'labels': labels, 'varieties': varieties})
    
    # Create crosstab: ct
    ct = pd.crosstab(df['labels'], df['varieties'])
    
    # Display ct
    print(ct)
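    # In a good clustering, each variety falls almost entirely into a single
    # cluster row of the crosstab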
    print(wine.head())
    
    samples = wine.iloc[:, 2:10].values
    
    # Create scaler: scaler
    scaler = StandardScaler()
    
    # Create KMeans instance: kmeans
    kmeans = KMeans(n_clusters=4)
    
    # Create pipeline: pipeline
    pipeline = make_pipeline(scaler, kmeans)
    
    # Fit the pipeline to samples
    pipeline.fit(samples)
    
    # Define species
    species = wine['class_label'].values
    
    # Calculate the cluster labels: labels
    labels = pipeline.predict(samples)
    
    # Create a DataFrame with labels and species as columns: df
    df = pd.DataFrame({'labels': labels, 'species': species})
    
    # Create crosstab: ct
    ct = pd.crosstab(df['labels'], df['species'])
    
    # Display ct
    print(ct)
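    
    # Standardizing matters here: the wine features sit on very different
    # scales, so without StandardScaler the k-means distances would be
    # dominated by the largest-scale features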
    
    # fcluster needs a linkage matrix; compute one here (the hierarchical
    # clustering section below derives it as well)
    mergings = linkage(samples, method='complete')
    
    # Use fcluster to extract flat cluster labels at height 20: labels
    labels = fcluster(mergings, 20, criterion='distance')
    
    # Create a DataFrame with labels and species as columns: df
    df = pd.DataFrame({'labels': labels, 'species': species})
    
    # Create crosstab: ctf
    ctf = pd.crosstab(df['labels'], df['species'])
    
    # Display ctf
    print(ctf)
    
    wine.head()

    HIERARCHICAL CLUSTERING AND t-SNE (t-distributed stochastic neighbor embedding)

    # Calculate the linkage on the wine samples: mergings
    mergings = linkage(samples, method='complete')
    
    # Plot the dendrogram, using the class labels as leaf labels
    dendrogram(mergings,
               labels=species,
               leaf_rotation=90,
               leaf_font_size=6,
    )
    plt.show()
    
    # Recompute the linkage with single linkage: mergings
    mergings = linkage(samples, method='single')
    
    # Plot the dendrogram, using the class labels as leaf labels
    dendrogram(mergings,
               labels=species,
               leaf_rotation=90,
               leaf_font_size=6,
    )
    plt.show()
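    
    # Complete linkage measures the distance between two clusters' farthest
    # members; single linkage uses the closest pair, which tends to chain
    # points into long, straggly clusters
    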
    # Create a TSNE instance: model
    model = TSNE(learning_rate=50)
    
    # Apply fit_transform to samples: tsne_features
    tsne_features = model.fit_transform(samples)
    
    
    # Select the 0th feature: xs
    xs = tsne_features[:,0]
    
    # Select the 1st feature: ys
    ys = tsne_features[:,1]
    
    # Scatter plot of the 2-D t-SNE embedding, colored by class label
    plt.scatter(xs, ys, c=species, alpha=0.5)
    plt.show()
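    
    # Note: TSNE provides only fit_transform (there is no separate transform
    # method), and learning_rate normally works in the 50-200 range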
    
    
    wine.head()
    
    

    PRINCIPAL COMPONENT ANALYSIS (PCA) FOR DATA DIMENSION REDUCTION

    #principal components = directions of variance
    
    samples2 = wine[['total_phenols','od280']].values
    
    phenols = wine['total_phenols']
    od280 = wine['od280']
    
    plt.scatter(phenols, od280)
    plt.axis('equal')
    plt.title('scatter plot of phenols vs od280')
    plt.show()
    
    correlation, pvalue = pearsonr(phenols, od280)
    print(correlation)
    
    model = PCA()
    pca_features = model.fit_transform(samples2)
    xs = pca_features[:,0]
    ys = pca_features[:,1]
    
    plt.scatter(xs, ys)
    plt.axis('equal')
    plt.title('PCA-transformed scatter plot of phenols vs od280')
    plt.show()
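    
    # Quick check (a small addition, using the variables above): the PCA
    # features should be decorrelated, so pearsonr now reports roughly 0
    correlation, pvalue = pearsonr(xs, ys)
    print(correlation)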
    
    
    #Intrinsic dimension = number of features needed to approximate the dataset
    #an essential idea behind dimension reduction
    #can be detected with PCA
    
    # Make a scatter plot of the untransformed points
    plt.scatter(samples2[:,0], samples2[:,1])
    
    # Create a PCA instance: model
    model = PCA()
    
    # Fit model to points
    model.fit(samples2)
    
    # Get the mean of the samples: mean
    mean = model.mean_
    
    # Get the first principal component: first_pc
    first_pc = model.components_[0,:]
    
    # Plot first_pc as an arrow, starting at mean
    plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)
    
    # Keep axes on same scale
    plt.axis('equal')
    plt.show()
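    
    # The arrow traces the first principal component: the direction of
    # greatest variance, anchored at the mean of the data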
    
    # Create scaler: scaler
    scaler = StandardScaler()
    
    # Create a PCA instance: pca
    pca = PCA()
    
    # Create pipeline: pipeline
    pipeline = make_pipeline(scaler,pca)
    
    # Fit the pipeline to 'samples'
    pipeline.fit(samples)
    
    # Plot the explained variances
    features = range(pca.n_components_)
    plt.bar(features, pca.explained_variance_)
    plt.xlabel('PCA feature')
    plt.ylabel('variance')
    plt.xticks(features)
    plt.show()
    
    #this last plot suggests an intrinsic dimension of roughly 3 to 5 for the
    #wine features: only the first few PCA components carry substantial variance
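    
    # A minimal follow-up sketch (not from the course code): keep the smallest
    # number of components whose cumulative explained variance ratio clears a
    # threshold, e.g. 90%
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    n_components_90 = int(np.argmax(cumulative >= 0.90)) + 1
    print('components needed for 90% of variance:', n_components_90)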