Competition - Loan Data
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Context

    Data from borrowers that took loans - some paid back and others are still in progress.

    library(baguette)
    library(corrplot)
    library(DataExplorer)
    library(GGally)
    library(ggmosaic)
    library(lubridate)
    library(ranger)
    library(skimr)
    library(themis)
    library(tidymodels)
    library(tidyverse)
    library(vip)
    library(zoo)
    
    t <- theme(axis.text.x = element_text(size = 15),
               axis.text.y = element_text(size = 15),
               title = element_text(size = 16))
    
    options(scipen = 999)
    tidyverse_logo()
    loans <- readr::read_csv('data/loans.csv.gz')
    glimpse(loans)
    # skimr::skim(loans)
    
    print(loans, n = Inf)
    

    Data Definition

    Variableclassdescription
    not_fully_paidfctloan is not fully paid = 1; otherwise = 0
    credit_policyfctcustomer meets credit underwriting criteria = 1; otherwise = 0
    pub_recnumborrower's number of derogatory public records (5) class
    installmentnummonthly payment (if funded)
    int_ratenuminterest rate (6% - 22%)
    log_annual_incnumborrower (self reported) annual income (natural log)
    ficonumborrower FICO credit score
    days_with_cr_linenumborrower # days has had credit line
    revol_balnumborrower revolving balance (amount unpaid/end of credit card billing cycle)
    inq_last_6mthsnumborrower number of inquiries by creditors in the last 6 months
    delinq_2yrsnumborrower times had been 30+ days past due on a payment in the past 2 years
    revol_utilnumborrower revolving line utilization (0-100%)
    dtinumborrower debt-to-income ratio (0-30%)
    purposechrloan purpose

    Extract useful insights and visualize them in the most interesting way possible

    Proportions of being paid versus meeting underwriting policy

    Proprotion Paid 84%, Not Paid 16%
    There is a large imbalance in the categories
    loans %>% count(not_fully_paid) %>% mutate(proportion = n / sum(n))
    
    cp <- loans %>%
      mutate(credit = if_else(credit_policy  == 1, "Meets underwriting policy", "Does not meet policy"),
             paid   = if_else(not_fully_paid == 1, "Not Fully Paid ", " Paid")) %>%
      select(credit, paid)
    # Contingency table
    table(cp$credit, cp$paid)
    
    cp %>%
      ggplot() +
      geom_mosaic(aes(x = product(paid, credit), fill = credit)) +
      theme(legend.position = 'none') +
      labs(x = 'Underwriting Policy', y = 'Loans which are paid') +
      ggtitle("Borrowers meeting policy vs. having the loan paid") + t
      
    Users who meet underwriting policy typically have their loans paid more than users who do not meet policy.
    There are many more users overall who meet policy than not.
    What type of borrowers do not meet our underwriting policy?
    • high interest rates
    • short installment loans
    • lower income
    • high debt to income ratio
    • low FICO scores
    • short credit history
    • tend to not have loans fully paid
    The opposite is true about users who do meet our underwriting policy.
    suppressWarnings(print(loans %>% select(credit_policy, int_rate, installment, log_annual_inc, 
                     dti, fico, days_with_cr_line, not_fully_paid) %>%
      GGally::ggpairs(mapping = aes(color = factor(credit_policy)))))

    Correlations

    corrplot(cor(loans %>% select_if(is.numeric)))

    Log Regression Elimination: Most important factors in having loans that are paid

    summary(glm(not_fully_paid ~ credit_policy + pub_rec + installment + log_annual_inc + fico +
                  revol_bal + inq_last_6mths + int_rate, loans, family = 'binomial'))
    Best predictors of loans that are currently paid:
    • low interest rates
    • low number of public records
    • low number of inquiries in the past 6 months
    • high income
    • high FICO scores
    loans %>%
        mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes', 
                                          not_fully_paid == 1 ~ 'No'))) %>%
        ggplot(aes(int_rate, fill = paid)) +
        geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
        scale_fill_manual(values = c("black", "chartreuse")) +
        scale_y_continuous(breaks = seq(0, 850,50)) +
        scale_x_continuous(labels = scales::percent_format()) +
        labs(y = '', x = "Interest Rate", fill = "Fully Paid") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank()) +
        ggtitle("Distribution of paid loans by interest rate") + t
    
    loans %>%
        mutate(paid = as.factor(case_when(not_fully_paid == 0 ~ 'Yes', 
                                          not_fully_paid == 1 ~ 'No'))) %>%
        ggplot(aes(log_annual_inc, fill = paid)) +
        geom_histogram(bins = 42, position = 'identity', alpha = 0.42, color = 'black') +
        scale_fill_manual(values = c("black", "chartreuse")) +
        scale_y_continuous(breaks = seq(0, 20000,1000)) +
        scale_x_continuous(breaks = seq(0, 13, 1)) +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank()) +
        labs(x = "Log Scale Annual Income", y = "", fill = "Fully Paid") +
        ggtitle("Distribution of paid loans by income") + t
    fit <- glm(not_fully_paid ~ int_rate + pub_rec + log_annual_inc, loans, family = 'binomial')
    
    newdata <- with(loans, expand.grid(int_rate = seq(min(int_rate), max(int_rate), length = 22),
                                       pub_rec = unique(pub_rec),
                                       log_annual_inc = quantile(log_annual_inc)))
    
    newdata$probability <- predict(fit, newdata = newdata, type = 'response')
    
    suppressWarnings(print(newdata %>%
        ggplot(aes(int_rate, probability, color = factor(pub_rec))) +
        geom_point() +
        geom_line() +
        geom_hline(yintercept = 0.1, size = 1.5)+
        geom_hline(yintercept = 0.5, size = 1.5)+
        facet_grid(~ log_annual_inc) +
        geom_smooth(method = 'glm', method.args = list(family = 'binomial'), 
                    formula = y ~ x, se = FALSE) +
        scale_x_continuous(labels = scales::percent_format()) +
        labs(x = "Interest Rate", y = 'Probability of Payment', color = "# of derogatory marks") +
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              axis.text.x = element_text(angle = 45, hjust = 1)) +
        ggtitle("Probability of payment with Interest Rate & Public Records") + t ))