Beta
Exploring World Cup Data in R
This dataset (source) includes 44,066 results of international football matches starting from the very first official match in 1872 up to 2022. The matches range from FIFA World Cup to FIFI Wild Cup to regular friendly matches. The matches are strictly men's full internationals and the data does not include Olympic Games or matches where at least one of the teams was the nation's B-team, U-23 or a league select team.
Import and prepare the datasets
# Import tidyverse package
library(tidyverse)
Hidden output
# Load the match results from the working directory
results <- read.csv(file = "results.csv")

# Parse the date column into Date objects, then derive a character
# year column (four-digit year) from the parsed dates
results[["date"]] <- as.Date(results[["date"]])
results[["year"]] <- format(results[["date"]], "%Y")

# Display the prepared results
results

# Load the winners table from the working directory
winners <- read.csv(file = "winners.csv")

# Display the winners
winners
Get the FIFA World Cup data
# Keep only the rows whose tournament is exactly "FIFA World Cup"
world_cup <- filter(results, tournament == "FIFA World Cup")

# Display the World Cup matches
world_cup
How many matches and teams were there in each World Cup?
# Tally World Cup matches for each year (one output row per year)
matches_per_year <- summarise(
  group_by(world_cup, year),
  matches = n()
)

# Display the per-year match counts
matches_per_year
# For each year, list every distinct team that appears as either the
# home or the away side (one row per year/team pair)
unique_teams_per_year <- reframe(
  group_by(world_cup, year),
  team = unique(c(home_team, away_team))
)

# Count how many distinct teams took part in each year
number_of_teams_per_year <- summarise(
  group_by(unique_teams_per_year, year),
  teams = n()
)

number_of_teams_per_year
# Combine match counts and team counts into one table, matched on year
merged_data <- merge(
  x = matches_per_year,
  y = number_of_teams_per_year,
  by = "year"
)

# Display the combined table
merged_data
# Visualize matches and teams per World Cup.
# Each geom_col layer holds a single bar per year, so the teams bars are
# drawn over the matches bars. The constant strings mapped to `fill` inside
# aes() exist only to create the two legend entries.
ggplot(merged_data, aes(x = year, y = matches, fill = "Matches")) +
  geom_col(position = "dodge") +
  geom_col(aes(y = teams, fill = "Teams"), position = "dodge") +
  geom_hline(yintercept = 0, color = "black", linetype = "solid") +
  # Name the colours so the mapping does not depend on level ordering
  scale_fill_manual(values = c("Matches" = "grey90", "Teams" = "#377771")) +
  labs(
    title = "Matches and Teams in Each World Cup",
    subtitle = "There has been a gradual increase in the number of\nmatches played as the competition adds more teams to compete.",
    caption = "Data: Github @martj42 | Viz: Evan Gower",
    x = "",
    y = "Number of"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, vjust = 6, face = "bold"),
    plot.subtitle = element_text(size = 12, vjust = 7.5),
    plot.caption = element_text(size = 7, vjust = 6),
    plot.margin = margin(40, 20, 0, 20),
    axis.title.y = element_text(size = 10, vjust = 3),
    axis.text.x = element_text(size = 7, vjust = 6),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    legend.margin = margin(-15, 218, -15, -100)
  ) +
  guides(
    fill = guide_legend(
      keywidth = 1,
      keyheight = 0.4,
      # Was misspelled `defualt.unit`, so the key sizes above were not
      # interpreted in centimetres
      default.unit = "cm",
      label.position = "right",
      nrow = 1
    )
  )
Which teams have the most appearances?
# Number of World Cups each team took part in (one row per team),
# ordered from most to fewest appearances, keeping the first ten rows
appearance_by_country <- unique_teams_per_year %>%
  count(team, name = "appearances", sort = TRUE) %>%
  head(10)

# Display the ten teams
appearance_by_country
# Flag the Brazil row; the comparison already yields a logical vector,
# so no ifelse() wrapper is needed
appearance_by_country$is_brazil <- appearance_by_country$team == "Brazil"
# Visualize appearances by country, with bars ordered by appearance count
# and the Brazil bar highlighted through the logical `is_brazil` fill
ggplot(
  appearance_by_country,
  aes(reorder(team, appearances), appearances, fill = is_brazil)
) +
  geom_col(width = 0.8) +
  geom_hline(yintercept = 0, color = "black", linetype = "solid") +
  # Key the colours by level so the highlight stays on TRUE even if only
  # one of the two levels is present in the data
  scale_fill_manual(values = c("FALSE" = "grey90", "TRUE" = "#377771")) +
  labs(
    title = "Teams with the Most Appearances in the World Cup",
    subtitle = "There have been 22 World Cups and Brazil has appeared in every single one.",
    caption = "Data: Github @martj42 | Viz: Evan Gower",
    x = "",
    y = "Number of Appearances"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, vjust = 6, face = "bold"),
    plot.subtitle = element_text(size = 12, vjust = 7.5),
    plot.caption = element_text(size = 7, vjust = 6),
    plot.margin = margin(40, 20, 0, 20),
    axis.title.y = element_text(size = 10, vjust = 3),
    axis.text.x = element_text(size = 8, vjust = 6),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = "none"
  )