Beta
LAB 7: PARTE CALIFICADA
MARKO ANDRE TALLEDO HERRERA (20162271)
# Leemos las librerias necesarias para el estudio
library(tidyverse)
library(dplyr)
library(ggplot2)
library(stats)
library(e1071)
# Count the missing values in each column and show the dataset dimensions.
led_data %>%
  is.na() %>%
  colSums()
dim(led_data)
# Author's note: roughly 18% of the rows contain missing values.
# Discard every row that has at least one missing value.
led_data <- na.omit(led_data)
# Derive the categorical variables (continent and year range) that are later
# turned into dummy variables.
# Install countrycode only when it is missing, instead of reinstalling it on
# every run of the script.
if (!requireNamespace("countrycode", quietly = TRUE)) {
  install.packages("countrycode")
}
library(countrycode)
led_data <- led_data %>%
  mutate(
    # Map each country name to its continent. Names that cannot be matched
    # presumably come back as NA (a Continent_NA dummy is dropped later).
    Continent = countrycode(Country, "country.name", "continent"),
    # Bucket Year into labelled ranges, working on the column vector rather
    # than on a one-column data frame. The first matching condition wins, so
    # 2010 falls in "2010-2015" and 2005 in "2005-2010". Years before 2000 get
    # an empty label, and a missing Year stays NA.
    # The oldest bucket was previously mislabelled "200-2005".
    discret_year = case_when(
      Year >= 2010 ~ "2010-2015",
      Year >= 2005 ~ "2005-2010",
      Year >= 2000 ~ "2000-2005",
      Year < 2000 ~ ""
    )
  )
# Turn the categorical columns into dummy (one-hot) variables.
# Install fastDummies only when it is missing, instead of reinstalling it on
# every run of the script.
if (!requireNamespace("fastDummies", quietly = TRUE)) {
  install.packages("fastDummies")
}
library(fastDummies)
# One call encodes the three columns, in the same order as before.
led_data <- dummy_cols(
  led_data,
  select_columns = c("Continent", "discret_year", "Status")
)
# Drop the original categorical columns, the Status_Developing dummy and the
# dummy created for rows without a continent.
cols_to_drop <- c(
  "Country", "Year", "Status", "Continent", "discret_year",
  "Status_Developing", "Continent_NA"
)
# A logical mask is used instead of -which(...): when no name matches,
# -which() evaluates to integer(0) and would silently drop every column.
led_data <- led_data[!(names(led_data) %in% cols_to_drop)]
# NOTE(review): every discret_year (and Continent) level keeps its dummy, which
# may be collinear with the intercept in the regression below - confirm whether
# a reference level should be dropped as was done for Status.
head(led_data)
# PCA of the dataset.
# Fit the scaled PCA once and reuse it. Previously `scale. = TRUE` was handed
# to summary() and biplot() instead of prcomp(), so the summary and the biplot
# described an unscaled PCA that differed from the printed one.
led_pca <- prcomp(led_data, scale. = TRUE)
# Standard deviations and loadings of the principal components.
led_pca
# Proportion of variance explained by each component.
summary(led_pca)
# Observations and variables on the first two components.
biplot(led_pca)
# Train/test split.
# Fix the RNG seed so the split, and every result derived from it, is
# reproducible between runs.
set.seed(123)
# Number of rows used for training.
# NOTE(review): presumably about 70% of the cleaned dataset - confirm.
train_size <- 1154
# Fail early if the data cannot provide the training rows and still leave a
# non-empty test set.
stopifnot(nrow(led_data) > train_size)
# seq_len() is safe even for a zero-row data frame, unlike 1:nrow().
x <- sample(seq_len(nrow(led_data)), train_size)
led_train <- led_data[x, ]
led_test <- led_data[-x, ]
# Linear regression: model Lifeexpectancy on every other column of the
# training set.
fit <- lm(formula = Lifeexpectancy ~ ., data = led_train)
fit
# Coefficient table and goodness-of-fit statistics.
summary(fit)
# Standard diagnostic plots for the fitted model.
plot(fit)
# Predict Lifeexpectancy for the held-out test rows using the fitted model.
pred <- predict(object = fit, newdata = led_test)
# Twelve 1s followed by four 2s.
# NOTE(review): presumably style codes for a later plot - confirm against use.
style <- rep(c(1, 2), times = c(12, 4))