Beta
Course Notes
Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! The datasets used in this course are available in the datasets
folder.
# Import any packages you want to use here
Take Notes
Add notes here about the concepts you've learned, along with code cells containing any code you want to keep.
Add your notes here
# Add your code snippets here
# Scatter plot of price per square meter against convenience-store count,
# with a fitted linear trend line and no confidence ribbon.
ggplot(taiwan_real_estate, aes(x = n_convenience, y = price_twd_msq)) +
  geom_point(alpha = 0.5) +  # semi-transparent points to reveal overplotting
  geom_smooth(method = "lm", se = FALSE)
# Simple linear regression: price per square meter explained by the
# number of nearby convenience stores.
lm(
  formula = price_twd_msq ~ n_convenience,
  data = taiwan_real_estate
)
# Distribution of price per square meter, one 10-bin histogram panel
# per house age group.
ggplot(taiwan_real_estate, aes(x = price_twd_msq)) +
  geom_histogram(bins = 10) +
  facet_wrap(vars(house_age_years))
# Mean price per square meter computed separately for each house age group.
summary_stats <- taiwan_real_estate %>%
  group_by(house_age_years) %>%
  summarise(mean_by_group = mean(price_twd_msq))
# Print the per-group means
summary_stats
# Regress price on house age group with the intercept removed
# (`0 +` in the formula), so each age group gets its own coefficient
# instead of offsets from a baseline level.
mdl_price_vs_age_no_intercept <- lm(
  price_twd_msq ~ 0 + house_age_years,
  data = taiwan_real_estate
)
# Print the fitted coefficients
mdl_price_vs_age_no_intercept
# Explanatory values to predict at: 0 through 10 convenience stores.
explanatory_data <- tibble(n_convenience = 0:10)
# Attach the model's predicted prices as a new column.
prediction_data <- explanatory_data %>%
  mutate(
    price_twd_msq = predict(mdl_price_vs_conv, explanatory_data)
  )
# Print the predictions
prediction_data
# Scatter plot with the fitted regression line, plus the model's
# predictions overlaid as yellow points.
ggplot(taiwan_real_estate, aes(x = n_convenience, y = price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "yellow")
# Pull the fitted coefficients out of the model.
coeffs <- coef(mdl_price_vs_conv)
# First coefficient is the intercept, second is the slope.
intercept <- coeffs[1]
slope <- coeffs[2]
# Rebuild the predictions by hand: y = intercept + slope * x.
explanatory_data %>%
  mutate(price_twd_msq = intercept + slope * n_convenience)
# The same numbers should come back from predict()
predict(mdl_price_vs_conv, explanatory_data)
# Scatter plot of 2019 returns vs. 2018 returns, with the identity line
# (y = x) for reference and a fitted linear trend line.
ggplot(sp500_yearly_returns, aes(return_2018, return_2019)) +
  geom_point() +
  # y = x reference line in green. `linewidth` is the correct aesthetic
  # for line thickness; `size` on lines is deprecated since ggplot2 3.4.0.
  geom_abline(color = "green", linewidth = 1) +
  # Linear regression trend line, no standard-error ribbon
  geom_smooth(method = "lm", se = FALSE) +
  # Equal axis scaling so the identity line sits at 45 degrees
  coord_fixed()
# Model price per square meter against the square root of distance
# to the nearest MRT station.
mdl_price_vs_dist <- lm(
  price_twd_msq ~ sqrt(dist_to_mrt_m),
  data = taiwan_real_estate
)
# Prediction grid: distances whose square roots run 0, 10, ..., 80.
explanatory_data <- tibble(
  dist_to_mrt_m = seq(0, 80, 10) ^ 2
)
# Predicted prices at each grid distance.
prediction_data <- explanatory_data %>%
  mutate(
    price_twd_msq = predict(mdl_price_vs_dist, explanatory_data)
  )
# Plot on the square-root x scale, with the predictions overlaid
# as large green points.
ggplot(taiwan_real_estate, aes(x = sqrt(dist_to_mrt_m), y = price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green", size = 5)
# Model clicks against impressions with a fourth-root transform on both
# sides; I() keeps ^ as arithmetic inside the formula.
mdl_click_vs_impression <- lm(
  I(n_clicks ^ 0.25) ~ I(n_impressions ^ 0.25),
  data = ad_conversion
)
# Impression counts to predict at: 0 to 3 million in 500k steps.
explanatory_data <- tibble(
  n_impressions = seq(0, 3e6, 5e5)
)
prediction_data <- explanatory_data %>%
  mutate(
    # The model predicts on the fourth-root scale...
    n_clicks_025 = predict(mdl_click_vs_impression, explanatory_data),
    # ...so raise to the fourth power to back-transform to clicks.
    n_clicks = n_clicks_025 ^ 4
  )
# Plot on the transformed scale, with predictions in green.
ggplot(ad_conversion, aes(x = n_impressions ^ 0.25, y = n_clicks ^ 0.25)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green")
# Coefficient of determination (r.squared) from the model-level
# summary of the original (untransformed) model...
pull(glance(mdl_click_vs_impression_orig), r.squared)
# ...and of the fourth-root-transformed model, for comparison.
pull(glance(mdl_click_vs_impression_trans), r.squared)
# Standard regression diagnostics (which = 1:3: residuals vs. fitted,
# Q-Q, scale-location), stacked in a single column.
autoplot(
  mdl_price_vs_conv,
  which = 1:3,
  nrow = 3,
  ncol = 1
)
# Observations with the highest leverage (.hat column from augment()).
mdl_price_vs_dist %>%
  augment() %>%
  arrange(desc(.hat)) %>%
  head()
# Observations with the highest influence (Cook's distance, .cooksd).
mdl_price_vs_dist %>%
  augment() %>%
  arrange(desc(.cooksd)) %>%
  head()
# Outlier diagnostics (which = 4:6: Cook's distance, residuals vs.
# leverage, Cook's distance vs. leverage), one plot per row.
autoplot(
  mdl_price_vs_dist,
  which = 4:6,
  nrow = 3,
  ncol = 1
)
# Histogram of time since last purchase, faceted in a grid with one
# row per churn status.
ggplot(churn, aes(x = time_since_last_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(rows = vars(has_churned))
# Churn status against time since first purchase: compare a plain
# linear trend (red) with a logistic-regression trend line.
ggplot(churn, aes(x = time_since_first_purchase, y = has_churned)) +
  geom_point() +
  # Linear fit shown for contrast, in red
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  # Binomial GLM (logistic regression) fit, no standard-error ribbon
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial)
  )
# Predicted churn probabilities, plus the rounded most-likely outcome
# (0 or 1) for each explanatory value.
prediction_data <- explanatory_data %>%
  mutate(
    # type = "response" returns probabilities rather than log-odds
    has_churned = predict(
      mdl_churn_vs_relationship, explanatory_data, type = "response"
    ),
    most_likely_outcome = round(has_churned)
  )
# Overlay the most likely outcomes on the existing plot as yellow points.
plt_churn_vs_relationship +
  geom_point(
    data = prediction_data,
    aes(y = most_likely_outcome),
    color = "yellow",
    size = 2
  )