Beta
Data Manipulation with dplyr
Run the hidden code cell below to import the data used in this course.
# Load the Tidyverse
library(tidyverse)
# Load the course datasets
babynames <- read_rds("datasets/babynames.rds")
counties <- read_rds("datasets/counties.rds")
### RENAME a variable ##
counties %>%
# Count the number of counties in each state
count(state) %>%
# Rename the n column to num_counties
rename(num_counties = n)
## COUNT and SORT the observation
# Use count to find the number of counties in each region (and sort from the region with the highest number of counties to the loowest)
counties_selected %>%
count(region, sort = T)
### Mutating and counting
counties_selected %>%
# Add population_walk containing the total number of people who walk to work
mutate(population_walk = population*walk/100) %>%
# Count weighted by the new column, sort in descending order
count(state, wt = population_walk, sort = T)
### SLICE_MIN, SLICE_MAX extract an also draw extreme observations
counties_selected %>%
# Find the total population for each combination of state and metro
group_by(state, metro) %>%
summarize(total_pop = sum(population)) %>%
# Extract the most populated row for each state
slice_max(total_pop, n = 1) %>%
# Count the states with more people in Metro or Nonmetro areas
ungroup() %>%
count(metro)
########### SELECT ##############
counties %>%
# Select the state, county, population, and those ending with "work"
select(state, county, population, ends_with("work")) %>%
# Filter for counties that have at least 50% of people engaged in public work
filter(public_work > 50)
############### SELECT MULTIPLE categories from a single categorical variable #################
selected_names <- babynames %>%
# Filter for the names Steven, Thomas, and Matthew
filter(name %in% c("Steven", "Thomas", "Matthew"))
################# FILTER NA IN a column ##################### this is from "merging datasets in diplyr" work package
# Join versions to sets
sets %>%
left_join (inventory_version_1, by = c("set_num")) %>%
# Filter for where version is na
filter(is.na(version))
########### replace NA with zeros
parts %>%
count(part_cat_id) %>%
right_join(part_categories, by = c("part_cat_id" = "id")) %>%
# Use replace_na to replace missing values in the n column
replace_na(list(n=0))
# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
geom_line()
### FILTER and plot selected categories from a variable
selected_names <- babynames %>%
# Filter for the names Steven, Thomas, and Matthew
filter(name %in% c("Steven", "Thomas", "Matthew"))
# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
geom_line()
### LAG ###
babynames_fraction %>%
# Arrange the data in order of name, then year
arrange(name, year) %>%
# Group the data by name
group_by(name) %>%
# Add a ratio column that contains the ratio of fraction between each year
mutate (ratio = fraction/lag(fraction))
### OR more useful, the difference between each year
# mutate (ratio = fraction-lag(fraction))
# the pivot_longer() function is used to convert data from a wider format to a longer format, often called "melting" or "unpivoting" the data
# below i convert data structure from 3x3 to 9*2 (9 rows 2 columns)
library(tidyr)
movie_reviews %>%
(everything(),
pivot_longer(everything(),
names_to = "movie",
values_to = "rating")
# the str_trim() function is used to remove leading and trailing whitespace (spaces, tabs, and newlines) from a character string
library(stringr)
library(dplyr)
plants %>%
mutate(water_requirement = str_trim(water_requirement)) # this removes spaces from the water_requirement variable
### import columns as text
library(readxl)
read_xlsx("photo.xlsx", col_types = "text")
# merge two (categeorical) columns into one:
library(tidyr)
unite(df, country_iso, countries, iso, sep = "/")
# The str_pad() function is used to pad character strings with a specific character or characters to a specified width or length
library(stringr)
x <- c("A", "BC", "DEF")
str_pad(x, width = 4,
side = "left", pad = "_")
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Add your notes here
# Add your code snippets here