Exercise 2: Regressions

    Part 1: Simple Linear Regression

    # Load necessary libraries
    library(readxl) # for reading Excel files
    library(tidyverse) # for data manipulation and visualization
    
    # Read datasets from files
    fish <- read.csv('fish.csv') # Read fish data from CSV
    customer_churn <- read.csv('customer_churn.csv') # Read customer churn data from CSV
    College_Town <- read_excel('College_Town.xlsx') # Read college town data from Excel
    
    # Filter the dataset for Bream species
    bream <- fish %>% 
      filter(Species == "Bream")
    
    # Perform a quick exploratory data analysis with a scatter plot
    ggplot(bream, aes(Length, Weight)) + 
      geom_point()
    
    # Build a linear model object for Weight as a function of Length
    model1 <- lm(Weight ~ Length, data = bream)
    
    # Display a summary of the linear model
    summary(model1)
    
    # Load broom library for tidying model outputs
    library(broom)
    
    # Extract coefficients of the linear model
    coefficients(model1)
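
    # For intuition, a hand-computed prediction from these coefficients
    # (Length = 30 is an arbitrary example value, not part of the exercise):
    coef(model1)[1] + coef(model1)[2] * 30 # predicted Weight at Length 30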
    
    # Augment data with information from the linear model
    augment(model1)
    
    # Get fitted values from the model
    fitted(model1)
    
    # Get a glance of the model's statistics
    glance(model1)
    
    # Extract R-squared value from the model's summary
    model1 %>% 
      glance() %>%
      pull(r.squared)
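
    # For intuition, R-squared can also be computed by hand as 1 - SSE/SST;
    # this should match the value pulled from glance() above:
    sse <- sum(residuals(model1)^2)
    sst <- sum((bream$Weight - mean(bream$Weight))^2)
    1 - sse / sst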
    
    # Plot the linear model on top of the scatter plot
    ggplot(bream, aes(Length, Weight)) +
      geom_point() +
      geom_smooth(method = "lm", se=FALSE)
    
    # Step 1: Build a new explanatory dataset of lengths to predict on
    explanatory_data <- tibble(Length = 20:40)
    
    # Step 2: Predict the values using the new dataset
    predict(model1, explanatory_data)
    
    # Step 3: Build a new prediction dataset for plotting
    prediction_data <- explanatory_data %>%
      mutate(Weight = predict(model1, explanatory_data))
    
    # Step 4: Plot the predicted points (red) over the existing scatter plot
    ggplot(bream, aes(Length, Weight)) +
      geom_point() +
      geom_smooth(method = "lm", se=FALSE) +
      geom_point(
        data=prediction_data,
        color="red"
      )
    
    # Load ggfortify for enhanced plotting of model diagnostics
    # install.packages("ggfortify") # run once if the package is not installed
    library(ggfortify)
    
    # Plot residuals and diagnostics for the linear model
    autoplot(model1, which = 1:3, nrow = 3, ncol = 1) # Residuals vs fitted, normal Q-Q, scale-location
    autoplot(model1, which = 4:6, nrow = 3, ncol = 1) # Cook's distance and leverage (outlier diagnostics)
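
    # If ggfortify is unavailable, base R draws the same six diagnostic
    # plots directly from the lm object (an equivalent alternative):
    # plot(model1, which = 1:6)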

    Part 2: Multiple Linear Regression

    # Display the structure and summary statistics of the College_Town dataframe
    str(College_Town)
    summary(College_Town)
    
    # Candidate regression specifications:
    #1 y = b0 + b1*x1 + b2*x2 + b3*x3           (linear)
    #2 y = b0 + b1*x1 + b2*x2 + b3*log(x3)      (linear-log)
    #3 log(y) = b0 + b1*x1 + b2*x2 + b3*x3      (log-linear)
    #4 log(y) = b0 + b1*x1 + b2*x2 + b3*log(x3) (log-log)
    
    # Perform exploratory data analysis
    # Visualize the relationship between Sqft and Rent
    ggplot(College_Town, aes(Sqft, Rent)) +
      geom_point()
    
    # Visualize the relationship between Beds and Rent
    ggplot(College_Town, aes(Beds, Rent)) +
      geom_point()
    
    # Visualize the relationship between Baths and Rent
    ggplot(College_Town, aes(Baths, Rent)) +
      geom_point()
    
    # Display the arguments of the lm function
    args(lm)
    
    # Fit four different linear models with various transformations
    lmodel1 <- lm(Rent ~ Beds + Baths + Sqft, data = College_Town)
    lmodel2 <- lm(Rent ~ Beds + Baths + log(Sqft), data = College_Town)
    lmodel3 <- lm(log(Rent) ~ Beds + Baths + Sqft, data = College_Town)
    lmodel4 <- lm(log(Rent) ~ Beds + Baths + log(Sqft), data = College_Town)
    
    # Display the summary of each model
    summary(lmodel1)
    summary(lmodel2)
    summary(lmodel3)
    summary(lmodel4) # this is identified as the best model
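
    # Interpretation check for the log-log model: the coefficient on
    # log(Sqft) approximates the elasticity of Rent with respect to Sqft,
    # i.e. the % change in Rent for a 1% change in Sqft (for intuition):
    coef(lmodel4)["log(Sqft)"]
    exp(coef(lmodel4)["log(Sqft)"] * log(1.01)) - 1 # exact % change for a 1% Sqft increase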
    
    #############################################################################################
    # Introduction to FOR loops
    for (x in 1:5) {
      print(x)
    }
    
    for (x in 1:5) {
      print(x^2)
    }
    
    cities <- c("Copenhagen", "Odense", "Aalborg", "Aarhus", "Roskilde")
    
    for(city in cities) {
      print(city)
    }
    
    # Break the loop when city name length is 7
    for(city in cities) {
      if(nchar(city) == 7)
        break
      print(city)
    }
    
    # Skip printing the city name when its length is 7
    for(city in cities) {
      if(nchar(city) == 7) {
        next
      } else {
        print(city)
      }
    }
    #############################################################################################
    
    # Print the R-squared value for each model
    models <- list(lmodel1, lmodel2, lmodel3, lmodel4)
    for(model in models) {
      print(glance(model))
    }
    for(model in models) {
      print(model %>% 
              glance() %>%
              pull(r.squared))
    }
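
    # The same extraction can be done without a loop using purrr (attached
    # with the tidyverse); an equivalent one-liner:
    map_dbl(models, ~ glance(.x)$r.squared)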
    
    # Step 1: Generate a grid of explanatory values for prediction;
    # expand_grid() creates every combination of Beds, Baths and Sqft
    explanatory_data_new <- expand_grid(
      Beds = 1:5, 
      Baths = 1:4,
      Sqft = unique(College_Town$Sqft)
    )
    # Note: predict() applies log(Sqft) from the model formula itself, so
    # this log_Sqft column is for reference only
    explanatory_data_new <- explanatory_data_new %>% 
      mutate(log_Sqft = log(Sqft))
    
    # Step 2: Predict log(Rent) using lmodel4 (predictions are on the log scale)
    predict(lmodel4, explanatory_data_new)
    
    # Step 3: Create a new dataframe with predictions; exp() converts the
    # log-scale predictions back to the original Rent units
    prediction_data_new <- explanatory_data_new %>% 
      mutate(log_Rent_pred = predict(lmodel4, explanatory_data_new), 
             Rent = exp(log_Rent_pred))
    
    # Step 4: Plot the original data and the log-log model predictions (red);
    # the blue line is a simple Rent ~ Sqft linear fit for reference
    ggplot(College_Town, aes(Sqft, Rent)) +
      geom_point() + 
      geom_smooth(method = "lm", se=FALSE) +
      geom_point(
        data= prediction_data_new,
        color="red",
        alpha=0.4
      )

    Part 3: Multiple Logistic Regression

    # Load dplyr for data manipulation (already attached via the tidyverse)
    library(dplyr)
    
    # Display a summary of the customer churn dataset
    summary(customer_churn)
    
    # Create a subset of the first 300 rows from the customer churn dataset
    test <- customer_churn %>%
      slice(1:300) # slice() selects rows by position
    
    # Fit a multiple logistic regression model using selected predictors
    modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = test)
    
    # Display a summary of the model to check coefficients and model statistics
    summary(modelGLM)
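
    # GLM coefficients are on the log-odds scale; exponentiating them gives
    # odds ratios, which are easier to interpret (shown for intuition):
    exp(coef(modelGLM))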
    
    # Extract the actual churn values from the test dataset
    actual_churn <- test$Churn
    
    # Predict churn using the model and round the predictions to get binary outcomes
    modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold
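
    # round() acts here as a 0.5 cutoff; an equivalent, more explicit form:
    # modeled_churn <- as.numeric(fitted(modelGLM) > 0.5)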
    
    # Predict probabilities of churn for the test dataset
    predicted_probabilities <- predict(modelGLM, type = "response") # Predicted probability of churn for each customer
    
    # Calculate the accuracy of the model
    accuracy_model <- mean(actual_churn == modeled_churn)
    
    # Load necessary library for creating confusion matrices
    library(yardstick)
    
    # Create a confusion matrix from the model predictions and actual values
    outcomes <- table(modeled_churn, actual_churn)
    outcomes
    
    # Convert the table to a confusion matrix and plot it
    confusion <- conf_mat(outcomes)
    autoplot(confusion)
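
    # yardstick can also summarize the confusion matrix into a table of
    # metrics (accuracy, sensitivity, specificity, etc.):
    summary(confusion)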
    
    # Fit the model again, this time using the ENTIRE customer churn dataset
    modelGLM <- glm(Churn ~ Age + Years + Total_Purchase + Account_Manager, family = binomial, data = customer_churn)
    
    # Display a summary of the model to check coefficients and model statistics
    summary(modelGLM)
    
    # Extract the actual churn values from the entire dataset
    actual_churn <- customer_churn$Churn
    
    # Predict churn using the model and round the predictions to get binary outcomes
    modeled_churn <- round(fitted(modelGLM)) # Use fitted values to classify as 0 or 1 based on a 0.5 threshold
    
    # Predict probabilities of churn for the entire dataset
    predicted_probabilities <- predict(modelGLM, type = "response") # Predicted probability of churn for each customer
    
    # Calculate the accuracy of the model for the entire dataset
    accuracy_model <- mean(actual_churn == modeled_churn)
    
    # Create a confusion matrix from the model predictions and actual values for the entire dataset
    outcomes <- table(modeled_churn, actual_churn)
    outcomes
    
    # Convert the table to a confusion matrix and plot it
    confusion <- conf_mat(outcomes)
    autoplot(confusion)