
Regressions

Part 1: Simple Linear Regression

# Load necessary libraries
library(readxl) # for reading Excel files
library(tidyverse) # for data manipulation and visualization

# Read datasets from files
fish <- read.csv('fish.csv') # Read fish data from CSV
customer_churn <- read.csv('customer_churn.csv') # Read customer churn data from CSV
College_Town <- read_excel('College_Town.xlsx') # Read college town data from Excel

# Filter the dataset for Bream species
bream <- fish %>% 
  filter(Species == "Bream")

# Perform a quick exploratory data analysis with a scatter plot
ggplot(bream, aes(Length, Weight)) + 
  geom_point()

# Build a linear model object for Weight as a function of Length
model1 <- lm(Weight ~ Length, bream)

# Display a summary of the linear model
summary(model1)

# Load broom library for tidying model outputs
library(broom)

# Extract coefficients of the linear model
coefficients(model1)

# Augment data with information from the linear model
augment(model1)

# Get fitted values from the model
fitted(model1)

# Get a one-row, model-level summary of fit statistics with glance()
glance(model1)

# Extract the R-squared value from the model's glance() output
model1 %>% 
  glance() %>%
  pull(r.squared)
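
# The same statistic is also available from base R, without broom:
summary(model1)$r.squared
summary(model1)$adj.r.squared # penalized for the number of predictors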

# Plot the linear model on top of the scatter plot
ggplot(bream, aes(Length, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Step 1: Build a new dataset of explanatory values to predict on
explanatory_data <- tibble(Length = 20:40)

# Step 2: Predict the values using the new dataset
predict(model1, explanatory_data)
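
# predict() can also return uncertainty bounds; interval = "prediction" adds
# lwr/upr columns alongside the point estimate (same model, same new data):
predict(model1, explanatory_data, interval = "prediction")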

# Step 3: Build a new prediction dataset for plotting
prediction_data <- explanatory_data %>%
  mutate(Weight = predict(model1, explanatory_data))

# Step 4: Plot the predicted points over the existing plot
ggplot(bream, aes(Length, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(
    data = prediction_data,
    color = "red"
  )

# Install (if needed) and load ggfortify for enhanced model diagnostic plots
if (!requireNamespace("ggfortify", quietly = TRUE)) install.packages("ggfortify")
library(ggfortify)

# Plot residuals and diagnostics for the linear model
autoplot(model1, which = 1:3, nrow = 3, ncol = 1) # residual diagnostics
autoplot(model1, which = 4:6, nrow = 3, ncol = 1) # influence/outlier diagnostics
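
# The quantities behind these diagnostic panels can also be pulled numerically:
head(cooks.distance(model1)) # influence of each observation on the fit
head(hatvalues(model1))      # leverage
head(rstandard(model1))      # standardized residuals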

Part 2: Multiple Linear Regression

# Display the structure and summary statistics of the College_Town dataframe
str(College_Town)
summary(College_Town)

# Candidate regression specifications (note: b3 multiplies log(x3), not log(b3*x3))
#1 y      = b0 + b1*x1 + b2*x2 + b3*x3        (level-level)
#2 y      = b0 + b1*x1 + b2*x2 + b3*log(x3)   (level-log)
#3 log(y) = b0 + b1*x1 + b2*x2 + b3*x3        (log-level)
#4 log(y) = b0 + b1*x1 + b2*x2 + b3*log(x3)   (log-log)

# Perform exploratory data analysis
# Visualize the relationship between Sqft and Rent
ggplot(College_Town, aes(Sqft, Rent)) +
  geom_point()

# Visualize the relationship between Beds and Rent
ggplot(College_Town, aes(Beds, Rent)) +
  geom_point()

# Visualize the relationship between Baths and Rent
ggplot(College_Town, aes(Baths, Rent)) +
  geom_point()

# Display the arguments of the lm function
args(lm)

# Fit four different linear models with various transformations
lmodel1 <- lm(Rent ~ Beds + Baths + Sqft, College_Town)
lmodel2 <- lm(Rent ~ Beds + Baths + log(Sqft), College_Town)
lmodel3 <- lm(log(Rent) ~ Beds + Baths + Sqft, College_Town)
lmodel4 <- lm(log(Rent) ~ Beds + Baths + log(Sqft), College_Town)

# Display the summary of each model
summary(lmodel1)
summary(lmodel2)
summary(lmodel3)
summary(lmodel4) # identified as the best fit of the four (see the R-squared comparison below)
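
# A quick interpretation aid for the chosen log-log model (a sketch, not part
# of the original workflow): the log(Sqft) coefficient reads as an elasticity,
# i.e. the approximate % change in Rent per 1% change in Sqft, while exp() of
# the Beds/Baths coefficients gives their proportional effect on Rent.
coef(lmodel4)["log(Sqft)"]
exp(coef(lmodel4)[c("Beds", "Baths")]) - 1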

#############################################################################################
# Introduction to FOR loops
for (x in 1:5) {
  print(x)
}

for (x in 1:5) {
  print(x^2)
}

cities <- c("Copenhagen", "Odense", "Aalborg", "Aarhus", "Roskilde")

for(city in cities) {
  print(city)
}

# Break the loop when city name length is 7
for (city in cities) {
  if (nchar(city) == 7) {
    break
  }
  print(city)
}

# Skip printing the city name when its length is 7
for(city in cities) {
  if(nchar(city) == 7) {
    next
  } else {
    print(city)
  }
}
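
# Looping by index with seq_along() gives both the position and the value:
for (i in seq_along(cities)) {
  print(paste(i, cities[i]))
}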
#############################################################################################

# Print the R-squared value for each model
models <- list(lmodel1, lmodel2, lmodel3, lmodel4)
for(model in models) {
  print(glance(model))
}
for(model in models) {
  print(model %>% 
          glance() %>%
          pull(r.squared))
}
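
# A loop-free alternative: map_dfr() (from purrr, attached with the tidyverse)
# stacks each model's glance() row into one tibble for side-by-side comparison.
# Note that lmodel3/lmodel4 measure fit on the log(Rent) scale, so their
# R-squared values are not directly comparable with lmodel1/lmodel2.
map_dfr(models, glance)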

# Step 1: Generate a grid of explanatory values for prediction
explanatory_data_new <- expand_grid(
  Beds=1:5, 
  Baths = 1:4,
  Sqft = unique(College_Town$Sqft)
)
# Note: predict() applies log(Sqft) from the model formula automatically,
# so this explicit log_Sqft column is for inspection only
explanatory_data_new <- explanatory_data_new %>% 
  mutate(log_Sqft = log(Sqft))

# Step 2: Predict log(Rent) values using lmodel4
predict(lmodel4, explanatory_data_new)

# Step 3: Create a new dataframe with predictions back-transformed to the Rent scale
prediction_data_new <- explanatory_data_new %>% 
  mutate(log_Rent_pred = predict(lmodel4, explanatory_data_new), 
         Rent = exp(log_Rent_pred))
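
# Caveat: exp() of a log-scale prediction estimates the conditional median of
# Rent rather than its mean; assuming roughly normal residuals, multiplying by
# exp(sigma^2 / 2) gives the standard mean correction:
head(exp(predict(lmodel4, explanatory_data_new) + sigma(lmodel4)^2 / 2))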

# Step 4: Plot the original data and the log-log model's predictions
ggplot(College_Town, aes(Sqft, Rent)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + # simple linear trend, for reference
  geom_point(
    data = prediction_data_new,
    color = "red",
    alpha = 0.4
  )

Part 3: Multiple Logistic Regression

# Load necessary library for data manipulation
library(dplyr)

# Display a summary of the customer churn dataset
summary(customer_churn)

# Create a subset of the first 300 rows from the customer churn dataset
test <- customer_churn %>%
  slice(1:300) # slice() selects rows by position

# Fit a multiple logistic regression model using selected predictors
modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = test)

# Display a summary of the model to check coefficients and model statistics
summary(modelGLM)
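
# Logistic coefficients are on the log-odds scale; exp() converts them to
# odds ratios for easier reading (an interpretation aid, not a refit):
exp(coef(modelGLM))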

# Extract the actual churn values from the test dataset
actual_churn <- test$Churn

# Predict churn using the model and round the predictions to get binary outcomes
modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold

# Predict probabilities of churn for the test dataset
predicted_probabilities <- predict(modelGLM, type = "response") # Predicted probability of churn for each customer

# Calculate the accuracy of the model
accuracy_model <- mean(actual_churn == modeled_churn)

# Load necessary library for creating confusion matrices
library(yardstick)

# Create a confusion matrix from the model predictions and actual values
outcomes <- table(modeled_churn, actual_churn)
outcomes

# Convert the table to a confusion matrix and plot it
confusion <- conf_mat(outcomes)
autoplot(confusion)
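
# summary() on a yardstick conf_mat derives a whole tibble of metrics
# (accuracy, kappa, sensitivity, specificity, ...) from the same table:
summary(confusion)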

# Fit the model again, this time using the ENTIRE customer churn dataset
modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = customer_churn)

# Display a summary of the model to check coefficients and model statistics
summary(modelGLM)

# Extract the actual churn values from the entire dataset
actual_churn <- customer_churn$Churn

# Predict churn using the model and round the predictions to get binary outcomes
modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold

# Predict probabilities of churn for the entire dataset
predicted_probabilities <- predict(modelGLM, type = "response") # Predicted probability of churn for each customer

# Calculate the accuracy of the model for the entire dataset
accuracy_model <- mean(actual_churn == modeled_churn)

# Create a confusion matrix from the model predictions and actual values for the entire dataset
outcomes <- table(modeled_churn, actual_churn)
outcomes

# Convert the table to a confusion matrix and plot it
confusion <- conf_mat(outcomes)
autoplot(confusion)
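
# The 0.5 cutoff implied by round() is only a default; a quick sweep (a
# minimal sketch) shows how accuracy moves with the classification threshold:
thresholds <- seq(0.1, 0.9, by = 0.1)
sapply(thresholds, function(t) mean(actual_churn == as.integer(predicted_probabilities > t)))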