Beta
Table of Contents
The outline of your notebook will show up here. You can include headings in any text cell by starting a line with #
, ##
, ###
, etc., depending on the desired title hierarchy.
Simple descriptors in pandas
# Print the head of the homelessness data
print(homelessness.head())
# Print information about homelessness
print(homelessness.info())
# Print the shape of homelessness: shape is an attribute, hence no brackets at the end
print(homelessness.shape)
# Print a description of homelessness: .describe() gives simple summary statistics for all numerical? columns
print(homelessness.describe())
# Print the values of homelessness
print(homelessness.values)
# Print the column index/column names of homelessness
print(homelessness.columns)
# Print the row index/row names of homelessness
print(homelessness.index)
Sorting continued...
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region","family_members"], ascending=[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())
#Subsetting columns:
#When working with data, you may not need all of the variables in your dataset. Square brackets ([]) can be used to select only the columns that matter to you in an order that makes sense to you. To select only "col_a" of the DataFrame df, use
df["col_a"]
#To select "col_a" and "col_b" of df, use
df[["col_a", "col_b"]]
#Subsetting rows:
#There are many ways to subset a DataFrame, perhaps the most common is to use relational operators to return True or False for each row, then pass that inside square brackets.
dogs[dogs["height_cm"] > 60]
dogs[dogs["color"] == "tan"]
#You can filter for multiple conditions at once by using the "bitwise and" operator, &.
dogs[(dogs["height_cm"] > 60) & (dogs["color"] == "tan")]
#subsetting rows by categorical variables: use of ".isin()"
colors = ["brown", "black", "tan"]
condition = dogs["color"].isin(colors)
dogs[condition]
#for comparison, subsetting rows by use of or "|" instead of .isin()
south_mid_atlantic = homelessness[(homelessness["region"]=="South Atlantic") | (homelessness["region"]=="Mid-Atlantic")]
#adding new columns to a dataframe:
# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness["individuals"] / homelessness["total"]
SUMMARY STATISTICS e.g.: .mean() .min() .mode() .quantile() .agg() etc
#While pandas and NumPy have tons of functions, sometimes, you may need a different function to summarize your data.
#The .agg() method allows you to apply your own custom functions to a DataFrame, as well as apply functions to more than one column of a DataFrame at once, making your aggregations super-efficient. For example,
df['column'].agg(function)
#for example:
# A custom IQR function:
def iqr(column):
return column.quantile(0.75) - column.quantile(0.25)
# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))
#for multiple columns:
# Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment: note the list of functions inside agg():
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store","type"])
print(store_types.head())
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store","department"])
print(store_depts.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates("date")
# Print date col of holiday_dates
print(holiday_dates["date"])
#use of .groupby()
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
# Group by multiple columns; calculate total weekly sales
sales_by_type_is_holiday = sales.groupby(["type","is_holiday"])["weekly_sales"].sum()
#use of pivot_table instead of .groupby()
# Import NumPy as np
import numpy as np
# Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table(values = "weekly_sales", index = "type", aggfunc = [np.mean, np.median])
# Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns = "is_holiday")
sales.pivot_table(values="weekly_sales",index="department",columns="type", fill_value=0)
sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value=0, margins=True)
Indexing by one column, or multiple columns
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
# Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
# Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop=True))
# Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country","city"])
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil","Rio De Janeiro"),("Pakistan","Lahore")]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])
#indexing and sorting by index:
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level = "city"))
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level = ["country", "city"], ascending = [True,False]))
#slicing by index: you need to SORT first!!!
# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
# Try to subset rows from Lahore to Moscow, by inner indexes:
print(temperatures_srt.loc["Lahore":"Moscow"])
# Subset rows from Pakistan, Lahore to Russia, Moscow... this is the correct way to use inner indexes:
print(temperatures_srt.loc[("Pakistan","Lahore"):("Russia","Moscow")])
# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[("India","Hyderabad"):("Iraq","Baghdad")])
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:,"date":"avg_temp_c"])
# Subset in both directions at once
print(temperatures_srt.loc[("India","Hyderabad"):("Iraq","Baghdad"),"date":"avg_temp_c"])
#subsetting by dates:
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures["date"] >= "2010-01-01") & (temperatures["date"] <= "2011-12-31")]
print(temperatures_bool)
# Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010":"2011"])
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])