Eric Manuel Villegas Gómez














Sign up
Course Notes: Working with Categorical Data in Python
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Course Notes

    Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! For courses that use data, the datasets will be available in the datasets folder.

    # Import any packages you want to use here
    
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import os
    
    # Show which files are available in the workspace's datasets folder.
    print(os.listdir('datasets/'))
    # Load each course CSV into its own DataFrame; no dtype is passed here, so
    # pandas infers the column types.
    adults = pd.read_csv('datasets/adult.csv')
    used_cars = pd.read_csv('datasets/cars.csv')
    reviews = pd.read_csv('datasets/lasvegas_tripadvisor.csv')
    dogs = pd.read_csv('datasets/ShelterDogs.csv')
    

    Pasos para trabajar con datos categóricos

    Paso 1: Verificar toda tu información: .head(), .describe(), .info(), .value_counts(normalize=True)

    Paso 2: Convertir la información a tipo categoría y definir el orden de las categorías

    Paso 3: Agrupar los datos por categoría y calcular estadísticas

    Paso 4: Agregar categorias

    Paso 5: Mover o Remover categorias

    Paso 6: Renombrar o colapsar las categorias

    Paso 7: Ordenar / Reordenar las categorias

    Paso 8: **Limpieza de datos categóricos**

    Paso 9: Buscar data / Filtrar Data

    Paso 10: Graficar data categorica

    Paso 11: Crear matriz de graficas

    Paso 12: Trampas de los datos categóricos: (1) codificación de etiquetas (label encoding): usar números en vez de categorías o strings para ahorrar memoria; (2) búsqueda de palabras dentro de las etiquetas; (3) one-hot encoding para ML

    # Step 1: inspect the data. Only the first rows are displayed; the
    # commented-out lines are alternative checks to enable one at a time.
    display(adults.head())
    # Frequency of each income bracket:
    #display(adults['Above/Below 50k'].value_counts())
    # Column dtypes and non-null counts. NOTE(review): without the call
    # parentheses this displays the bound method, not the summary — use
    # adults.info() instead.
    #display(adults.info)
    # Summary statistics:
    #display(adults.describe())
    # Missing values per column:
    #display(adults.isna().sum())
    # Absolute and relative frequency of each country:
    #display(adults['Country'].value_counts())
    #display(adults['Country'].value_counts(normalize=True))

    Paso 2

    # Example 1: build a categorical Series directly from raw data.
    #my_series1 = pd.Series(my_data, dtype="category")
    #print(my_series1)
    
    
    # Example 2: create an ordered categorical column with explicit categories.
    #adults['Above/Below 50k_categories'] = pd.Categorical(adults['Above/Below 50k'], categories=[">50K","<=50K"],ordered=True)
    #print(adults)
    
    # Example 3 (good practice): declare the categorical columns up front and
    # let read_csv apply the dtype while loading.
    adult_dtypes = dict.fromkeys(
        ["Workclass", "Education", "Relationship", "Above/Below 50k"],
        "category",
    )
    adults = pd.read_csv('datasets/adult.csv', dtype=adult_dtypes)
    #print(adults.dtypes)

    Paso 3

    # Example 1
    # Group the adults dataset by income bracket and sex.
    # observed=False is passed explicitly so that, when a grouping key is
    # categorical, unobserved category combinations are still reported; newer
    # pandas versions warn when this is left implicit.
    gb = adults.groupby(by=["Above/Below 50k", "Sex"], observed=False)
    # Print out how many rows are in each created group
    print(gb.size())
    # Print out the mean of each group for the numeric columns only.
    # numeric_only=True is required because the frame also holds
    # categorical/string columns, for which a mean is undefined (pandas >= 2.0
    # raises a TypeError instead of silently dropping them).
    print(gb.mean(numeric_only=True))
    
    # Example 2
    # Create a list of user-selected grouping variables
    user_list = ["Education", "Above/Below 50k"]
    # Create a GroupBy object using this list
    gb = adults.groupby(by=user_list, observed=False)
    # Select "Hours/Week" before aggregating so only that column is averaged
    display(gb["Hours/Week"].mean())

    Paso 4

    # Frequency table before the change, with missing values counted too.
    print(dogs["keep_in"].value_counts(dropna=False))
    
    # Categories to register on the column.
    new_categories = ["Unknown History", "Open Yard (Countryside)"]
    
    # Convert the column to categorical and append the new categories in a
    # single chained expression.
    dogs["keep_in"] = (
        dogs["keep_in"]
        .astype("category")
        .cat.add_categories(new_categories)
    )
    
    # Frequency table after the change; value_counts lists every category of
    # a categorical column, so the new ones appear here as well.
    print(dogs["keep_in"].value_counts(dropna=False))

    Paso 5

    # Work on a categorical version of the column.
    dogs["likes_children"] = dogs["likes_children"].astype("category")
    
    # Move every "maybe" answer into the existing "no" category.
    dogs.loc[dogs["likes_children"].eq("maybe"), "likes_children"] = "no"
    
    # Show the categories that are still registered on the column.
    print(dogs["likes_children"].cat.categories)
    
    # Show how many rows fall into each category.
    print(dogs["likes_children"].value_counts())
    
    # Optional follow-up: drop the "maybe" category itself.
    #dogs["likes_children"] = dogs["likes_children"].cat.remove_categories(removals=["maybe"])
    #print(dogs["likes_children"].value_counts())

    Paso 6

    # Example 1
    # Mapping of old category label -> new category label.
    my_changes = {"Maybe?":"Maybe"}
    
    # Rename the categories listed in the my_changes dictionary.
    # rename_categories operates on the category labels themselves (labels not
    # present in the mapping are left untouched); Series.replace on a
    # categorical column is deprecated for this purpose in recent pandas.
    dogs["likes_children"] = dogs["likes_children"].cat.rename_categories(my_changes)
    
    # Use a lambda function to convert all categories to uppercase using upper()
    dogs["likes_children"] =  dogs["likes_children"].cat.rename_categories(lambda c: c.upper())
    
    # Print the list of categories
    print(dogs["likes_children"].cat.categories)
    
    # Example 2
    # Coat types that should be folded into the "medium" group.
    update_coats = {"wirehaired": "medium", "medium-long": "medium"}
    
    # Build the collapsed column and make it categorical in one step.
    dogs["coat_collapsed"] = dogs["coat"].replace(update_coats).astype("category")
    
    # Show how many dogs fall into each collapsed coat category.
    print(dogs["coat_collapsed"].value_counts())

    Paso 7