Workspace
Stefan Stojkovic/

Data Manipulation with dplyr

0
Beta
Spinner

Data Manipulation with dplyr

Run the hidden code cell below to import the data used in this course.

# Load the Tidyverse
library(tidyverse)

# Load the course datasets
babynames <- read_rds("datasets/babynames.rds")
counties <- read_rds("datasets/counties.rds")

### RENAME a variable ##
counties %>%
  # Count the number of counties in each state
  count(state) %>%
  # Rename the n column to num_counties
  rename(num_counties = n)

## COUNT and SORT the observation 
# Use count to find the number of counties in each region (and sort from the region with the highest number of counties to the loowest)
counties_selected %>%
  count(region, sort = T)

### Mutating and counting
counties_selected %>%
  # Add population_walk containing the total number of people who walk to work 
  mutate(population_walk = population*walk/100) %>%
  # Count weighted by the new column, sort in descending order
  count(state, wt = population_walk, sort = T)

### SLICE_MIN, SLICE_MAX extract an also draw extreme observations
counties_selected %>%
  # Find the total population for each combination of state and metro
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  # Extract the most populated row for each state
  slice_max(total_pop, n = 1) %>%
  # Count the states with more people in Metro or Nonmetro areas
  ungroup() %>% 
  count(metro)

########### SELECT ##############
counties %>%
  # Select the state, county, population, and those ending with "work"
  select(state, county, population, ends_with("work")) %>%
  # Filter for counties that have at least 50% of people engaged in public work
  filter(public_work > 50)

############### SELECT MULTIPLE categories from a single categorical variable #################
selected_names <- babynames %>%
  # Filter for the names Steven, Thomas, and Matthew 
  filter(name %in% c("Steven", "Thomas", "Matthew"))

################# FILTER NA IN a column ##################### this is from "merging datasets in diplyr" work package
# Join versions to sets
sets %>%
   left_join (inventory_version_1, by = c("set_num")) %>%
  # Filter for where version is na
  filter(is.na(version))

########### replace NA with zeros
parts %>%
  count(part_cat_id) %>%
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%
  # Use replace_na to replace missing values in the n column
  replace_na(list(n=0))

# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()

### FILTER and plot selected categories from a variable
selected_names <- babynames %>%
  # Filter for the names Steven, Thomas, and Matthew 
  filter(name %in% c("Steven", "Thomas", "Matthew"))

# Plot the names using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()

### LAG ###
babynames_fraction %>%
  # Arrange the data in order of name, then year 
  arrange(name, year) %>%
  # Group the data by name
group_by(name) %>%
  # Add a ratio column that contains the ratio of fraction between each year 
  mutate (ratio = fraction/lag(fraction))
### OR more useful, the difference between each year
# mutate (ratio = fraction-lag(fraction))


# the pivot_longer() function is used to convert data from a wider format to a longer format, often called "melting" or "unpivoting" the data
# below i convert data structure from 3x3 to 9*2 (9 rows 2 columns)
library(tidyr)

movie_reviews %>% 
	(everything(), 
	pivot_longer(everything(), 
               names_to = "movie", 
               values_to = "rating")
	 
# the str_trim() function is used to remove leading and trailing whitespace (spaces, tabs, and newlines) from a character string
library(stringr)
library(dplyr)

plants %>% 
  mutate(water_requirement = str_trim(water_requirement)) # this removes spaces from the water_requirement variable
	 
### import columns as text
library(readxl)
read_xlsx("photo.xlsx", col_types = "text")
	 
# merge two (categeorical) columns into one:
library(tidyr)
unite(df, country_iso, countries, iso, sep = "/")
	 
# The str_pad() function is used to pad character strings with a specific character or characters to a specified width or length
library(stringr)
x <- c("A", "BC", "DEF")
	 
str_pad(x, width = 4, 
    side = "left", pad = "_")

Take Notes

Add notes about the concepts you've learned and code cells with code you want to keep.

Add your notes here

# Add your code snippets here