this is the nav!
Workspace
Stefan Stojkovic/

# Data Manipulation with dplyr

0
Beta

## Data Manipulation with dplyr

Run the hidden code cell below to import the data used in this course.

# Load the Tidyverse
library(tidyverse)

### RENAME a variable ###
# Tally the counties in each state, then give the default `n` column
# produced by count() the clearer name num_counties
counties %>%
  count(state) %>%
  rename(num_counties = n)

## COUNT and SORT the observations
# Count the counties in each region, ordered from the region with the most
# counties to the region with the fewest
counties_selected %>%
  # Spell out TRUE: the alias T is an ordinary variable that can be reassigned
  count(region, sort = TRUE)

### Mutating and counting
counties_selected %>%
  # Add population_walk, the total number of people who walk to work
  # (walk is divided by 100, i.e. treated as a percentage of population)
  mutate(population_walk = population * walk / 100) %>%
  # Sum population_walk per state (weighted count), largest total first;
  # TRUE is spelled out because the alias T can be reassigned
  count(state, wt = population_walk, sort = TRUE)

### SLICE_MIN, SLICE_MAX: extract the extreme (smallest / largest) observations
counties_selected %>%
  # Find the total population for each combination of state and metro
  group_by(state, metro) %>%
  # Drop only the last grouping level (metro) so the result stays grouped by
  # state. This is summarize()'s default; stating it explicitly silences the
  # regrouping message and shows that slice_max() below depends on it.
  summarize(total_pop = sum(population), .groups = "drop_last") %>%
  # Extract the most populated row for each state
  slice_max(total_pop, n = 1) %>%
  # Count the states with more people in Metro or Nonmetro areas
  ungroup() %>%
  count(metro)

########### SELECT ##############
counties %>%
  # Keep state, county, population, and every column whose name ends in "work"
  select(state, county, population, ends_with("work")) %>%
  # Keep the counties where public_work is above 50
  filter(public_work > 50)

############### SELECT MULTIPLE categories from a single categorical variable #################
# Keep only the rows whose name is Steven, Thomas, or Matthew
selected_names <- babynames %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))

################# FILTER NA in a column ##################### (from the "merging datasets in dplyr" work package)
sets %>%
  # Join versions to sets, matching rows on set_num
  left_join(inventory_version_1, by = "set_num") %>%
  # Keep only the rows where version is missing
  filter(is.na(version))

########### Replace NA with zeros ###########
parts %>%
  # Number of parts per category id
  count(part_cat_id) %>%
  # Match the counts to part_categories (part_cat_id against id)
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%
  # Turn the missing values in the n column into 0
  replace_na(list(n = 0))

# Line chart of number by year, one color per name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()

### FILTER and plot selected categories from a variable
# Restrict babynames to the names Steven, Thomas, and Matthew
selected_names <- babynames %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))

# Plot number against year, with a different color for each name
selected_names %>%
  ggplot(aes(x = year, y = number, color = name)) +
  geom_line()

### LAG ###
babynames_fraction %>%
  # Sort by name, then year, so each row follows the earlier year of its name
  arrange(name, year) %>%
  # Group by name so lag() never reaches across two different names
  group_by(name) %>%
  # ratio = this row's fraction divided by the previous row's fraction
  mutate(ratio = fraction / lag(fraction))
### OR, often more useful, the difference between each year:
# mutate(ratio = fraction - lag(fraction))

# The pivot_longer() function converts data from a wider format to a longer format, often called "melting" or "unpivoting" the data.
# Below, I convert a 3x3 data structure to 9x2 (9 rows, 2 columns).
library(tidyr)

# Reshape every column into name/value pairs: the former column names go to
# "movie" and the cell values go to "rating"
movie_reviews %>%
  pivot_longer(
    everything(),
    names_to = "movie",
    values_to = "rating"
  )

# the str_trim() function is used to remove leading and trailing whitespace (spaces, tabs, and newlines) from a character string
library(stringr)
library(dplyr)

# Strip leading and trailing whitespace from the water_requirement variable
plants %>%
  mutate(water_requirement = str_trim(water_requirement))

### import columns as text

# Merge two (categorical) columns into one: countries and iso are pasted
# together with "/" into a new country_iso column
library(tidyr)
df %>%
  unite(country_iso, countries, iso, sep = "/")

# The str_pad() function is used to pad character strings with a specific character or characters to a specified width or length
library(stringr)
x <- c("A", "BC", "DEF")
# Left-pad every element with spaces to a width of 5 (side and pad shown
# explicitly; these are also str_pad()'s defaults)
str_pad(x, width = 5, side = "left", pad = " ")

# Add your code snippets here