Python continued (Pandas notes)

    Simple descriptors in pandas

    # Print the head of the homelessness data
    print(homelessness.head())
    
    # Print information about homelessness
    print(homelessness.info())
    
    # Print the shape of homelessness: shape is an attribute, hence no brackets at the end
    print(homelessness.shape)
    
    # Print a description of homelessness: .describe() gives simple summary statistics for the numeric columns
    print(homelessness.describe())
    
    # Print the values of homelessness
    print(homelessness.values)
    
    # Print the column index/column names of homelessness
    print(homelessness.columns)
    
    # Print the row index/row names of homelessness
    print(homelessness.index)
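
    #A minimal, self-contained sketch of the calls above. The real homelessness DataFrame comes from the course data, so this stand-in uses made-up values:
    import pandas as pd

    homelessness = pd.DataFrame({
        "region": ["Pacific", "Mountain", "Pacific"],
        "state": ["Alaska", "Arizona", "California"],
        "individuals": [1000.0, 7000.0, 100000.0],
        "family_members": [500.0, 2500.0, 20000.0],
    })

    print(homelessness.head())      # first rows
    print(homelessness.info())      # prints dtypes and non-null counts (returns None)
    print(homelessness.shape)       # (3, 4) tuple of (rows, columns)
    print(homelessness.describe())  # count/mean/std/min/quartiles/max of numeric columns
    print(homelessness.values)      # the underlying NumPy array
    print(homelessness.columns)     # the column Index
    print(homelessness.index)       # the row RangeIndex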

    Sorting continued...

    # Sort homelessness by region, then descending family members
    homelessness_reg_fam = homelessness.sort_values(["region","family_members"], ascending=[True, False])
    
    # Print the top few rows
    print(homelessness_reg_fam.head())
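
    #Same sort as a runnable sketch, reusing a made-up stand-in for homelessness:
    import pandas as pd

    homelessness = pd.DataFrame({
        "region": ["Pacific", "Mountain", "Pacific"],
        "state": ["Alaska", "Arizona", "California"],
        "family_members": [500.0, 2500.0, 20000.0],
    })

    # Ascending by region, then descending family_members within each region
    homelessness_reg_fam = homelessness.sort_values(
        ["region", "family_members"], ascending=[True, False]
    )
    print(homelessness_reg_fam.head())
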
    #Subsetting columns:
    
    #When working with data, you may not need all of the variables in your dataset. Square brackets ([]) can be used to select only the columns that matter to you in an order that makes sense to you. To select only "col_a" of the DataFrame df, use
    
    df["col_a"]
    
    #To select "col_a" and "col_b" of df, use
    
    df[["col_a", "col_b"]]
    #Subsetting rows:
    #There are many ways to subset a DataFrame; perhaps the most common is to use relational operators to return True or False for each row, then pass that Boolean Series inside square brackets.
    
    dogs[dogs["height_cm"] > 60]
    dogs[dogs["color"] == "tan"]
    
    #You can filter for multiple conditions at once by using the "bitwise and" operator, &.
    
    dogs[(dogs["height_cm"] > 60) & (dogs["color"] == "tan")]
    
    #subsetting rows by categorical variables: use of ".isin()"
    colors = ["brown", "black", "tan"]
    condition = dogs["color"].isin(colors)
    dogs[condition]
    #for comparison, the same subsetting done with the "bitwise or" operator, |, instead of .isin()
    south_mid_atlantic = homelessness[(homelessness["region"]=="South Atlantic") | (homelessness["region"]=="Mid-Atlantic")]
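
    #A runnable sketch with a made-up dogs table, showing that the .isin() mask and the chained | comparisons pick out the same rows:
    import pandas as pd

    dogs = pd.DataFrame({
        "name": ["Bella", "Max", "Stella", "Lucy"],
        "color": ["brown", "black", "tan", "grey"],
        "height_cm": [56, 61, 66, 46],
    })

    # Single condition
    print(dogs[dogs["height_cm"] > 60])
    # Two conditions combined with & (each condition needs its own parentheses)
    print(dogs[(dogs["height_cm"] > 60) & (dogs["color"] == "tan")])
    # .isin() versus chained "or" comparisons: both return the same subset
    print(dogs[dogs["color"].isin(["brown", "black", "tan"])])
    print(dogs[(dogs["color"] == "brown") | (dogs["color"] == "black") | (dogs["color"] == "tan")])
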
    #adding new columns to a dataframe:
    # Add total col as sum of individuals and family_members
    homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
    
    # Add p_individuals col as proportion of total that are individuals
    homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]

    SUMMARY STATISTICS, e.g. .mean(), .min(), .mode(), .quantile(), .agg(), etc.

    #While pandas and NumPy have tons of functions, sometimes you may need a different function to summarize your data.
    
    #The .agg() method allows you to apply your own custom functions to a DataFrame, as well as apply functions to more than one column of a DataFrame at once, making your aggregations super-efficient. For example,
    
    df['column'].agg(function)
    
    #for example:
    # A custom IQR function:
    def iqr(column):
        return column.quantile(0.75) - column.quantile(0.25)
        
    # Print IQR of the temperature_c column
    print(sales["temperature_c"].agg(iqr))
    
    #for multiple columns:
    # Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
    print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
    
    # Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment: note the list of functions inside agg():
    print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
    # Drop duplicate store/type combinations
    store_types = sales.drop_duplicates(subset=["store","type"])
    print(store_types.head())
    
    # Drop duplicate store/department combinations
    store_depts = sales.drop_duplicates(subset=["store","department"])
    print(store_depts.head())
    
    # Subset the rows where is_holiday is True and drop duplicate dates
    holiday_dates = sales[sales["is_holiday"]].drop_duplicates("date")
    
    # Print date col of holiday_dates
    print(holiday_dates["date"])
    #use of .groupby()
    # Group by type; calculate total weekly sales
    sales_by_type = sales.groupby("type")["weekly_sales"].sum()
    
    # Group by multiple columns; calculate total weekly sales
    sales_by_type_is_holiday = sales.groupby(["type","is_holiday"])["weekly_sales"].sum()
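
    #Runnable sketch of the two groupby aggregations on a made-up sales table:
    import pandas as pd

    sales = pd.DataFrame({
        "type": ["A", "A", "B", "B"],
        "is_holiday": [False, True, False, True],
        "weekly_sales": [100.0, 150.0, 80.0, 120.0],
    })

    # Total weekly sales per store type
    print(sales.groupby("type")["weekly_sales"].sum())
    # Total weekly sales per type/holiday combination (result has a MultiIndex)
    print(sales.groupby(["type", "is_holiday"])["weekly_sales"].sum())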
    
    #use of pivot_table instead of .groupby()
    # Import NumPy as np
    import numpy as np
    
    # Pivot for mean and median weekly_sales for each store type
    mean_med_sales_by_type = sales.pivot_table(values = "weekly_sales", index = "type", aggfunc = [np.mean, np.median])
    
    # Pivot for mean weekly_sales by store type and holiday 
    mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns = "is_holiday")
    
    sales.pivot_table(values="weekly_sales",index="department",columns="type", fill_value=0)
    sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True)

    Indexing by one column, or multiple columns

    # Set the index of temperatures to city
    temperatures_ind = temperatures.set_index("city")
    # Reset the temperatures_ind index, keeping its contents
    print(temperatures_ind.reset_index())
    # Reset the temperatures_ind index, dropping its contents
    print(temperatures_ind.reset_index(drop=True))
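
    #Runnable sketch of set_index/reset_index on a made-up temperatures table:
    import pandas as pd

    temperatures = pd.DataFrame({
        "city": ["Abidjan", "Lahore", "Moscow"],
        "country": ["Cote D'Ivoire", "Pakistan", "Russia"],
        "avg_temp_c": [27.3, 24.2, 6.6],
    })

    temperatures_ind = temperatures.set_index("city")
    print(temperatures_ind)
    print(temperatures_ind.reset_index())           # the index moves back into a "city" column
    print(temperatures_ind.reset_index(drop=True))  # the old index is discarded entirely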
    
    # Index temperatures by country & city
    temperatures_ind = temperatures.set_index(["country","city"])
    # List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
    rows_to_keep = [("Brazil","Rio De Janeiro"),("Pakistan","Lahore")]
    # Subset for rows to keep
    print(temperatures_ind.loc[rows_to_keep])
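
    #Runnable sketch of subsetting a MultiIndex with a list of (outer, inner) tuples, on made-up data:
    import pandas as pd

    temperatures = pd.DataFrame({
        "country": ["Brazil", "Pakistan", "Russia"],
        "city": ["Rio De Janeiro", "Lahore", "Moscow"],
        "avg_temp_c": [25.8, 24.2, 6.6],
    })
    temperatures_ind = temperatures.set_index(["country", "city"])

    # Each tuple names one (country, city) pair; .loc keeps only those rows
    rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
    print(temperatures_ind.loc[rows_to_keep])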
    
    #indexing and sorting by index:
    # Sort temperatures_ind by index values
    print(temperatures_ind.sort_index())
    # Sort temperatures_ind by index values at the city level
    print(temperatures_ind.sort_index(level = "city"))
    # Sort temperatures_ind by country then descending city
    print(temperatures_ind.sort_index(level = ["country", "city"], ascending = [True,False]))
    
    #slicing by index: you need to SORT first!!!
    # Sort the index of temperatures_ind
    temperatures_srt = temperatures_ind.sort_index()
    # Subset rows from Pakistan to Russia
    print(temperatures_srt.loc["Pakistan":"Russia"])
    # Trying to slice from Lahore to Moscow using only the inner-level labels doesn't raise an error, but silently slices the outer (country) level instead, giving the wrong rows:
    print(temperatures_srt.loc["Lahore":"Moscow"])
    # Subset rows from Pakistan, Lahore to Russia, Moscow... this is the correct way to use inner indexes:
    print(temperatures_srt.loc[("Pakistan","Lahore"):("Russia","Moscow")])
    
    # Subset rows from India, Hyderabad to Iraq, Baghdad
    print(temperatures_srt.loc[("India","Hyderabad"):("Iraq","Baghdad")])
    # Subset columns from date to avg_temp_c
    print(temperatures_srt.loc[:,"date":"avg_temp_c"])
    # Subset in both directions at once
    print(temperatures_srt.loc[("India","Hyderabad"):("Iraq","Baghdad"),"date":"avg_temp_c"])
    
    #subsetting by dates:
    # Use Boolean conditions to subset temperatures for rows in 2010 and 2011
    temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temperatures["date"] <= "2011-12-31")]
    print(temperatures_bool)
    # Set date as the index and sort the index
    temperatures_ind = temperatures.set_index("date").sort_index()
    # Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
    print(temperatures_ind.loc["2010":"2011"])
    # Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
    print(temperatures_ind.loc["2010-08":"2011-02"])