Workspace — Moussa Diallo

# Course Notes: Introduction to Regression in R

Beta

Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! The datasets used in this course are available in the `datasets` folder.

``````# Import any packages you want to use here``````

### Take Notes

Add notes here about the concepts you've learned and code cells with code you want to keep.

``````# Code snippets worth keeping
# Scatter of price vs. convenience-store count, with a linear trend
# line and no confidence ribbon
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE)``````
``````# Fit a linear regression of price_twd_msq versus n_convenience
lm(
  price_twd_msq ~ n_convenience,
  data = taiwan_real_estate
)``````
``````# Histogram of price_twd_msq with 10 bins,
# one facet panel per house age group
ggplot(taiwan_real_estate, aes(price_twd_msq)) +
  geom_histogram(bins = 10) +
  facet_wrap(~ house_age_years)``````
``````# Mean house price/area for each house age group
summary_stats <- taiwan_real_estate %>%
  group_by(house_age_years) %>%
  summarize(mean_by_group = mean(price_twd_msq))

# See the result
summary_stats``````
``````# Regress price on house age group with the intercept removed
# (`+ 0` in the formula drops it)
mdl_price_vs_age_no_intercept <-
  lm(price_twd_msq ~ house_age_years + 0,
     data = taiwan_real_estate)

# See the result
mdl_price_vs_age_no_intercept``````
``````# Explanatory values to predict at: zero to ten convenience stores
explanatory_data <- tibble(n_convenience = 0:10)

# Store the model predictions alongside the explanatory values
prediction_data <- explanatory_data %>%
  mutate(price_twd_msq = predict(mdl_price_vs_conv, explanatory_data))

# See the result
prediction_data``````
``````# Overlay the model's predictions on the scatter plot
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Prediction points from prediction_data, colored yellow
  geom_point(data = prediction_data, color = "yellow")``````
``````# Get the coefficients of mdl_price_vs_conv
coeffs <- coefficients(mdl_price_vs_conv)

# Get the intercept (first coefficient)
# FIX: previously the whole coeffs vector was assigned, so the
# manual prediction below recycled a length-2 vector
intercept <- coeffs[1]

# Get the slope (second coefficient)
slope <- coeffs[2]

explanatory_data %>%
  mutate(
    # Manually calculate the predictions: intercept + slope * x
    price_twd_msq = intercept + slope * n_convenience
  )

# Compare to the results from predict()
predict(mdl_price_vs_conv, explanatory_data)``````
``````# Scatter of return_2019 vs. return_2018
ggplot(sp500_yearly_returns, aes(return_2018, return_2019)) +
  geom_point() +
  # Reference line at y = x, colored green, size 1
  geom_abline(color = "green", size = 1) +
  # Linear regression trend line, no std. error ribbon
  geom_smooth(method = "lm", se = FALSE) +
  # Equal scaling on both axes
  coord_fixed()``````
``````# From previous steps: model price against sqrt of distance to MRT
mdl_price_vs_dist <- lm(
  price_twd_msq ~ sqrt(dist_to_mrt_m),
  data = taiwan_real_estate
)
# Squares of 0..80 so sqrt(dist) lands on a regular grid
explanatory_data <- tibble(dist_to_mrt_m = seq(0, 80, 10) ^ 2)
prediction_data <- explanatory_data %>%
  mutate(price_twd_msq = predict(mdl_price_vs_dist, explanatory_data))

ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Prediction points, colored green, size 5
  geom_point(data = prediction_data, color = "green", size = 5)``````
``````# From previous steps: quarter-root transform on both variables
# FIX: the lm() call was missing its data argument (dangling comma),
# which would error — fit against ad_conversion as in the plot below
mdl_click_vs_impression <- lm(
  I(n_clicks ^ 0.25) ~ I(n_impressions ^ 0.25),
  data = ad_conversion
)
explanatory_data <- tibble(
  n_impressions = seq(0, 3e6, 5e5)
)
prediction_data <- explanatory_data %>%
  mutate(
    n_clicks_025 = predict(mdl_click_vs_impression, explanatory_data),
    # Back-transform the prediction: undo the ^ 0.25
    n_clicks = n_clicks_025 ^ 4
  )

ggplot(ad_conversion, aes(n_impressions ^ 0.25, n_clicks ^ 0.25)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Add points from prediction_data, colored green
  geom_point(data = prediction_data, color = "green")
``````
``````# Coefficient of determination for the original model:
# model-level details via glance(), then pull out r.squared
mdl_click_vs_impression_orig %>%
  glance() %>%
  pull(r.squared)

# Same for the transformed model, for comparison
mdl_click_vs_impression_trans %>%
  glance() %>%
  pull(r.squared)``````
``````# The three standard diagnostics for mdl_price_vs_conv,
# stacked in a single column
autoplot(
  mdl_price_vs_conv,
  which = 1:3,
  nrow = 3,
  ncol = 1
)``````
``````mdl_price_vs_dist %>%
  # Augment the model
  augment() %>%
  # Arrange rows by descending leverage
  arrange(desc(.hat)) %>%
  # Get the head of the dataset
  # FIX: the pipe previously dangled with no final call
  head()``````
``````mdl_price_vs_dist %>%
  # Augment the model
  augment() %>%
  # Arrange rows by descending Cook's distance
  arrange(desc(.cooksd)) %>%
  # Get the head of the dataset
  # FIX: the pipe previously dangled with no final call
  head()``````
``````# The three outlier diagnostics (plots 4-6) for mdl_price_vs_dist,
# stacked in a single column
autoplot(
  mdl_price_vs_dist,
  which = 4:6,
  nrow = 3,
  ncol = 1
)``````
``````# Histogram of time_since_last_purchase with binwidth 0.25,
# one facet row per churn status
ggplot(churn, aes(time_since_last_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(has_churned ~ .)``````
``````# Compare a linear trend (red) with a logistic regression trend
ggplot(churn, aes(time_since_first_purchase, has_churned)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  # glm trend line, no std. error ribbon, binomial family
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial)
  )``````
``````# Predicted churn probabilities plus the most likely (rounded) outcome
prediction_data <- explanatory_data %>%
  mutate(
    has_churned = predict(
      mdl_churn_vs_relationship, explanatory_data, type = "response"
    ),
    most_likely_outcome = round(has_churned)
  )

# Update the plot
plt_churn_vs_relationship +
  # Most likely outcome points from prediction_data, yellow, size 2
  geom_point(
    data = prediction_data,
    aes(y = most_likely_outcome),
    color = "yellow",
    size = 2
  )``````