# Beta
# HR Analytics: Exploring Employee Data with R
# Import libraries needed
library(dplyr)
library(tidyr)
library(readr)
library(broom)
library(ggplot2)
# Load the three source CSVs from the DataCamp assets host. Each URL is kept
# in its own variable and read right away; show_col_types = FALSE suppresses
# readr's column-specification message.
hr_data_url <- "https://assets.datacamp.com/production/course_5977/datasets/hr_data.csv"
hr_data <- read_csv(file = hr_data_url, show_col_types = FALSE)

fair_pay_data_url <- "https://assets.datacamp.com/production/course_5977/datasets/fair_pay_data.csv"
fair_pay_data <- read_csv(file = fair_pay_data_url, show_col_types = FALSE)

survey_data_url <- "https://assets.datacamp.com/production/course_5977/datasets/survey_data.csv"
survey_data <- read_csv(file = survey_data_url, show_col_types = FALSE)
# Preview every raw table: the first six rows, followed by a per-column
# overview of names, types and leading values.
head(hr_data, n = 6L)
glimpse(hr_data)

head(fair_pay_data, n = 6L)
glimpse(fair_pay_data)

head(survey_data, n = 6L)
glimpse(survey_data)
# Join fair_pay_data onto the survey responses. all.x = TRUE keeps every
# survey row even when fair_pay_data has no row with the same key values.
merged <- merge(
  x = survey_data,
  y = fair_pay_data,
  by = c("employee_id", "department", "salary"),
  all.x = TRUE
)
head(merged, n = 6L)
glimpse(merged)

# Join the result with hr_data. No all.* argument is given, so only rows
# whose key values occur in both tables are kept.
data <- merge(
  x = merged,
  y = hr_data,
  by = c("employee_id", "department", "job_level")
)
head(data, n = 6L)
glimpse(data)
# Total number of NA cells across all columns of the combined table
data %>%
  is.na() %>%
  sum()
# Average salary per department, drawn as one bar per department.
mean_salary_for_dep <- data %>%
  group_by(department) %>%
  summarize(mean_salary = mean(salary))

ggplot(
  data = mean_salary_for_dep,
  mapping = aes(x = department, y = mean_salary, fill = department)
) +
  geom_col() +
  # Manual palette: three fill colours are supplied
  scale_fill_manual(values = c("darkred", "darkgreen", "darkblue")) +
  # The x axis already names each department, so the legend is hidden
  theme(legend.position = "none")
# Are employees in finance earning the same as others?
# Add a 0/1 indicator column: 1 for rows in the Finance department, 0 for
# every other department.
data <- data %>%
  mutate(is_finance = ifelse(test = department == "Finance", yes = 1, no = 0))

# Two-sample t-test of salary between the two indicator groups, converted to
# a one-row data frame of test results by broom::tidy().
test <- tidy(t.test(salary ~ is_finance, data = data))

# Show the p-value; pull()'s second argument is `name`, so the printed value
# is labelled with the test statistic.
test %>%
  pull(var = p.value, name = statistic)

# Report the decision at the 0.05 significance threshold
print(
  if (test$p.value < 0.05) {
    "Null hypothesis rejected at 95% CL"
  } else {
    "Failed to reject null hypothesis at 95% CL"
  }
)
# Average salary for every department/gender combination, plotted as one
# panel per gender with one bar per department.
# .groups = "drop" makes summarize() return an ungrouped tibble; without it
# the result stays grouped by `department` and dplyr emits a message about
# the leftover grouping.
mean_salary_for_dep_and_gender <- data %>%
  group_by(department, gender) %>%
  summarize(mean_salary = mean(salary), .groups = "drop")

ggplot(
  data = mean_salary_for_dep_and_gender,
  mapping = aes(x = department, y = mean_salary, fill = department)
) +
  geom_col() +
  facet_wrap(~gender) +
  # Manual palette: three fill colours are supplied
  scale_fill_manual(values = c("darkred", "darkgreen", "darkblue")) +
  # The x axis already names each department, so the legend is hidden
  theme(legend.position = "none")
# Are gender and department independent?
# Chi-squared test on the department-by-gender cross-tabulation, converted to
# a one-row data frame of test results by broom::tidy().
test <- tidy(chisq.test(x = data$department, y = data$gender))

# Show the p-value; pull()'s second argument is `name`, so the printed value
# is labelled with the test statistic.
test %>%
  pull(var = p.value, name = statistic)

# Report the decision at the 0.05 significance threshold
print(
  if (test$p.value < 0.05) {
    "Null hypothesis rejected at 95% CL"
  } else {
    "Failed to reject null hypothesis at 95% CL"
  }
)
# Average salary for every department/job-level combination, plotted as one
# panel per department with one bar per job level.
# .groups = "drop" makes summarize() return an ungrouped tibble; without it
# the result stays grouped by `department` and dplyr emits a message about
# the leftover grouping.
mean_salary_for_dep_and_level <- data %>%
  group_by(department, job_level) %>%
  summarize(mean_salary = mean(salary), .groups = "drop")

ggplot(
  data = mean_salary_for_dep_and_level,
  mapping = aes(x = job_level, y = mean_salary, fill = job_level)
) +
  geom_col() +
  facet_wrap(~department) +
  # Manual palette: three fill colours are supplied
  scale_fill_manual(values = c("lightblue", "darkolivegreen", "darkcyan")) +
  # The x axis already names each job level, so the legend is hidden
  theme(legend.position = "none")
# Are salaried employees earning the same as hourly ones?
# Drop the "Manager" rows before comparing the remaining job levels.
# NOTE(review): t.test()'s formula interface needs exactly two groups, so
# this assumes only two job levels are left after the filter - confirm.
salaried_hourly <- data %>%
  filter(job_level != "Manager")

# Two-sample t-test of salary by job level, converted to a one-row data frame
# of test results by broom::tidy().
test <- tidy(t.test(salary ~ job_level, data = salaried_hourly))

# Show the p-value; pull()'s second argument is `name`, so the printed value
# is labelled with the test statistic.
test %>%
  pull(var = p.value, name = statistic)

# Report the decision at the 0.05 significance threshold
print(
  if (test$p.value < 0.05) {
    "Null hypothesis rejected at 95% CL"
  } else {
    "Failed to reject null hypothesis at 95% CL"
  }
)
# Row counts per gender, with each bar split (stacked) by job level.
ggplot(data = data, mapping = aes(x = gender, fill = job_level)) +
  geom_bar() +
  # Manual palette: three fill colours are supplied
  scale_fill_manual(values = c("lightblue", "darkolivegreen", "darkcyan"))
# Are gender and job level independent?
# Chi-squared test on the gender-by-job-level cross-tabulation, converted to
# a one-row data frame of test results by broom::tidy().
test <- tidy(chisq.test(x = data$gender, y = data$job_level))

# Show the p-value; pull()'s second argument is `name`, so the printed value
# is labelled with the test statistic.
test %>%
  pull(var = p.value, name = statistic)

# Report the decision at the 0.05 significance threshold
print(
  if (test$p.value < 0.05) {
    "Null hypothesis rejected at 95% CL"
  } else {
    "Failed to reject null hypothesis at 95% CL"
  }
)