<!-- Scraped workspace navigation chrome: "Workspace / Moussa Diallo" (non-content) -->

# Course Notes: Machine Learning in the Tidyverse

<!-- Scraped workspace UI badges: "0 / Beta" (non-content) -->

Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! For courses that use data, the datasets will be available in the `datasets` folder.

```r
# Install the gapminder package (one-time setup; comment out after first run)
install.packages("gapminder")

# Import any packages you want to use
library(tidyverse)
library(gapminder)
```

### Take Notes

Add notes here about the concepts you've learned and code cells with code you want to keep.

```r
# Add your code snippets here
# Explore gapminder

# Prepare the nested data frame gap_nested:
# one row per country, with that country's records tucked into a list-column
library(tidyverse)
gap_nested <- gapminder %>%
  group_by(country) %>%
  nest()

# Explore gap_nested
head(gap_nested)
```
```r
# Create the unnested data frame called gap_unnested
gap_unnested <- gap_nested %>%
  unnest(cols = data)  # tidyr >= 1.0 requires naming the list-column explicitly

# Confirm that your data was not modified by the nest/unnest round trip
identical(gapminder, gap_unnested)
```
```r
# Extract the data of Algeria (first row of the nested frame)
algeria_df <- gap_nested$data[[1]]

# Calculate the minimum of the population vector
min(algeria_df$population)

# Calculate the maximum of the population vector
max(algeria_df$population)

# Calculate the mean of the population vector
mean(algeria_df$population)
```
```r
# Build a linear model (life expectancy vs. year) for each country
gap_models <- gap_nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ year, data = .x)))

# Extract the model for Algeria
algeria_model <- gap_models$model[[1]]

# View the summary for the Algeria model
summary(algeria_model)
```
```r
# augment() comes from broom, which is not attached by library(tidyverse)
library(broom)

# Build the augmented data frame (observations + fitted values/residuals)
algeria_fitted <- augment(algeria_model)

# Compare the predicted values with the actual values of life expectancy
algeria_fitted %>%
  ggplot(aes(x = year)) +
  geom_point(aes(y = life_expectancy)) +
  geom_line(aes(y = .fitted), color = "red")
```
```r
# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>%
  mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))

fullmodel_perf <- gap_fullmodel %>%
  # Extract the fit statistics of each model into data frames
  mutate(fit = map(model, ~glance(.x))) %>%
  # Simplify the fit data frames for each model
  unnest(fit)

# View the performance for the four countries with the worst fitting
# simple models you looked at before
# NOTE(review): worst_fit is built in an earlier exercise — confirm it exists
fullmodel_perf %>%
  filter(country %in% worst_fit$country) %>%
  select(country, adj.r.squared)
```
```r
# initial_split()/training()/testing() come from the rsample package
library(rsample)

set.seed(42)

# Prepare the initial split object (75% train / 25% test)
gap_split <- initial_split(gapminder, prop = 0.75)

# Extract the training data frame
training_data <- training(gap_split)

# Extract the testing data frame
testing_data <- testing(gap_split)

# Calculate the dimensions of both training_data and testing_data
dim(training_data)
dim(testing_data)
```
```r
set.seed(42)

# Prepare the data frame containing the cross validation partitions
cv_split <- vfold_cv(training_data, v = 5)

cv_data <- cv_split %>%
  mutate(
    # Extract the train data frame for each split
    train = map(splits, ~training(.x)),
    # Extract the validate data frame for each split
    validate = map(splits, ~testing(.x))
  )

# Use head() to preview cv_data
head(cv_data)
```
```r
# Fit one linear model per cross-validation fold,
# regressing life expectancy on every other column in that fold's training set
cv_models_lm <- cv_data %>%
  mutate(model = map(train, ~lm(formula = life_expectancy ~ ., data = .x)))
```
```r
cv_prep_lm <- cv_models_lm %>%
  mutate(
    # Extract the recorded life expectancy for the records in the validate data frames
    validate_actual = map(validate, ~.x$life_expectancy),
    # Predict life expectancy for each validate set using its corresponding model
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
  )
```
```r
library(Metrics)

# Calculate the mean absolute error for each validate fold
cv_eval_lm <- cv_prep_lm %>%
  mutate(validate_mae = map2_dbl(.x = validate_actual, .y = validate_predicted,
                                 ~mae(actual = .x, predicted = .y)))

# Print the validate_mae column
cv_eval_lm$validate_mae

# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
```
```r
library(ranger)

# Build a random forest model for each fold
cv_models_rf <- cv_data %>%
  mutate(model = map(train, ~ranger(formula = life_expectancy ~ ., data = .x,
                                    num.trees = 100, seed = 42)))

# Generate predictions using the random forest model
# (ranger's predict() returns an object; the numbers live in $predictions)
cv_prep_rf <- cv_models_rf %>%
  mutate(validate_predicted = map2(.x = model, .y = validate,
                                   ~predict(.x, .y)$predictions))
```
```r
library(ranger)

# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>%
  mutate(validate_mae = map2_dbl(validate_actual, validate_predicted,
                                 ~mae(actual = .x, predicted = .y)))

# Print the validate_mae column
cv_eval_rf$validate_mae

# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
```
```r
# Cross every cross-validation fold with each candidate mtry value (2 through 5)
cv_tune <- cv_data %>%
  crossing(mtry = 2:5)

# Fit a random forest for every fold/mtry pairing
cv_model_tunerf <- cv_tune %>%
  mutate(model = map2(.x = train, .y = mtry,
                      ~ranger(formula = life_expectancy ~ .,
                              data = .x, mtry = .y,
                              num.trees = 100, seed = 42)))
```
```r
# Generate validate predictions for each model
cv_prep_tunerf <- cv_model_tunerf %>%
  mutate(validate_predicted = map2(.x = model, .y = validate,
                                   ~predict(.x, .y)$predictions))

# Calculate validate MAE for each fold and mtry combination
cv_eval_tunerf <- cv_prep_tunerf %>%
  mutate(validate_mae = map2_dbl(.x = validate_actual, .y = validate_predicted,
                                 ~mae(actual = .x, predicted = .y)))

# Calculate the mean validate_mae for each mtry used
cv_eval_tunerf %>%
  group_by(mtry) %>%
  summarise(mean_mae = mean(validate_mae))
```
```r
# Build the model using all training data and the best performing parameter
best_model <- ranger(formula = life_expectancy ~ ., data = training_data,
                     mtry = 4, num.trees = 100, seed = 42)

# Prepare the test_actual vector
test_actual <- testing_data$life_expectancy

# Predict life_expectancy for the testing_data
test_predicted <- predict(best_model, testing_data)$predictions

# Calculate the test MAE
mae(test_actual, test_predicted)
```
```r
set.seed(42)

# Re-create the 5-fold cross-validation partitions from the training data
cv_split <- vfold_cv(training_data, v = 5)

cv_data <- cv_split %>%
  mutate(
    # Training rows for each fold
    train = map(splits, ~training(.x)),
    # Held-out validation rows for each fold
    validate = map(splits, ~testing(.x))
  )
```
```r
# Fit a logistic regression (binomial GLM) on each fold's training data,
# modelling Attrition from every other column
cv_models_lr <- cv_data %>%
  mutate(model = map(.x = train, ~glm(formula = Attrition ~ .,
                                      data = .x, family = "binomial")))
```
```r
# Extract the first model and validate set
model <- cv_models_lr$model[[1]]
validate <- cv_models_lr$validate[[1]]

# Prepare binary vector of actual Attrition values in validate
validate_actual <- validate$Attrition == "Yes"

# Predict the probabilities for the observations in validate
validate_prob <- predict(model, validate, type = "response")

# Prepare binary vector of predicted Attrition values for validate
# (classify as "Yes" when predicted probability exceeds 0.5)
validate_predicted <- validate_prob > 0.5
```