# Importing the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Locations of the course datasets. The first two are remote CSVs served
# from assets.datacamp.com; the third is a relative path to a local CSV.
USFARMERS_URL = 'https://assets.datacamp.com/production/repositories/3841/datasets/efdbc5d7c7b734f0b091d924605c4ad2664ef830/markets_cleaned.csv'
POLLUTION_URL = 'https://assets.datacamp.com/production/repositories/3841/datasets/a6b11493e11dd47f3e03e0b96e2a2dbc51f03cb2/pollution_wide.csv'
POPULATION_CSV = 'census-state-populations.csv'

# Load each dataset into its own DataFrame.
usfarmers = pd.read_csv(USFARMERS_URL)
pollution = pd.read_csv(POLLUTION_URL)
pop = pd.read_csv(POPULATION_CSV)
Highlighting your data
Here we will learn how to highlight our data in Python using both matplotlib.pyplot and seaborn.
# Quick inspection: first rows of the table and the distinct city names.
# (These are bare expressions, so they only display output in an
# interactive session such as a notebook.)
pollution.head()
pollution.city.unique()

# Keep only the Cincinnati rows.
cinci_pollution = pollution.loc[pollution['city'] == 'Cincinnati']

# Plot NO2 on the x-axis against SO2 on the y-axis for Cincinnati.
sns.scatterplot(x='NO2', y='SO2', data=cinci_pollution)
plt.show()
# Work on the Cincinnati subset of the pollution data.
cinci_pollution = pollution.loc[pollution['city'] == 'Cincinnati']

# One colour name per row: rows whose `day` equals 38 are drawn in
# orangered so they stand out; every other row is steelblue.
cinci_colors = np.where(
    cinci_pollution['day'] == 38, 'orangered', 'steelblue'
).tolist()

sns.set_style('whitegrid')

# fit_reg=False turns off the regression line, leaving only the points;
# the per-point colours are forwarded through scatter_kws.
p = sns.regplot(
    x='NO2',
    y='SO2',
    data=cinci_pollution,
    fit_reg=False,
    scatter_kws={'facecolors': cinci_colors, 'alpha': 0.7},
)
plt.show()
Hardcoding a highlight
You are working with the city of Houston to look at the relationship between sulfur dioxide (SO2) and nitrogen dioxide (NO2) pollution, specifically in the most recent year for which data was collected (2014). You have singled out a particularly bad day, November 26th, when there was a sharp spike in the SO2 levels. To draw the viewer's attention to this bad day, you will highlight it in a bright orangish-red and color the rest of the points gray.
# Restrict the data to the Houston rows.
houston_pollution = pollution[pollution.city == 'Houston']

# Boolean mask that is True only for day 330 of year 2014.
# The comparison is done on whole columns, where `&` is the correct
# element-wise "and"; the previous per-row version applied the bitwise
# operator to scalar comparisons where a logical `and` was intended.
is_highlight = (houston_pollution.day == 330) & (houston_pollution.year == 2014)

# Make array orangered for day 330 of year 2014, otherwise lightgray
# (.tolist() yields a plain list of colour names, one per row).
houston_colors = np.where(is_highlight, 'orangered', 'lightgray').tolist()

sns.regplot(
    x='NO2',
    y='SO2',
    data=houston_pollution,
    # Draw the points only, without a fitted regression line.
    fit_reg=False,
    # Send scatterplot argument to color points
    scatter_kws={'facecolors': houston_colors, 'alpha': 0.7},
)
plt.show()
Programmatically creating a highlight
You are continuing your work for the city of Houston. Now you want to look at the behavior of both NO2 and SO2 when the un-plotted ozone (O3) value was at its highest.
To do this, replace the logic in the current list comprehension with one that compares a row's O3 value with the highest observed O3 in the dataset. Note: use sns.scatterplot() instead of sns.regplot(). This is because sns.scatterplot() can take a non-color vector as its hue argument and colors the points automatically while providing a helpful legend.
# Take an independent copy of the Houston rows so that adding a column
# below modifies this frame rather than a slice of `pollution`.
houston_pollution = pollution[pollution.city == 'Houston'].copy()

# Find the highest observed O3 value
max_O3 = houston_pollution.O3.max()

# Make a column that denotes which day had highest O3.
# The label now says O3 (ozone), matching the column being compared; it
# previously read "O2", which mislabelled the legend.
houston_pollution['point type'] = np.where(
    houston_pollution.O3 == max_O3, 'Highest O3 Day', 'Others'
)

sns.set_style('whitegrid')

# Encode the hue of the points with the O3 generated column
sns.scatterplot(
    x='NO2',
    y='SO2',
    data=houston_pollution,
    hue='point type',
)
plt.show()
Comparing groups
Here we will compare groups using kernel density estimates.
# Select a single month of observations.
# NOTE(review): the variable is named "nov" but the filter is month == 10.
# If `month` is 1-based this selects October - confirm the encoding and
# change the filter (or the name) accordingly.
pollution_nov = pollution[pollution.month == 10]

# Split the month's O3 readings into Denver versus every other city.
denver_o3 = pollution_nov[pollution_nov.city == 'Denver'].O3
other_o3 = pollution_nov[pollution_nov.city != 'Denver'].O3

# Overlay the two kernel density estimates on the same axes, Denver in red.
# sns.kdeplot replaces sns.distplot(..., hist=False), which is deprecated.
sns.kdeplot(denver_o3, color='red')
sns.kdeplot(other_o3)
plt.show()