Exploring World Cup Data in R
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Exploring World Cup Data in R

    This dataset (source) includes 44,066 results of international football matches starting from the very first official match in 1872 up to 2022. The matches range from FIFA World Cup to FIFI Wild Cup to regular friendly matches. The matches are strictly men's full internationals and the data does not include Olympic Games or matches where at least one of the teams was the nation's B-team, U-23 or a league select team.

    Import and prepare the datasets

    # Import tidyverse package
    library(tidyverse)
    Hidden output
    # Load the match results, parse `date` into Date objects and derive a
    # character `year` column (four-digit year) from the parsed date
    results <- read.csv("results.csv") %>%
      mutate(
        date = as.Date(date),
        year = format(date, "%Y")
      )
    
    # Display the prepared data frame
    results
    # Load winners.csv into a data frame
    winners <- read.csv(file = "winners.csv")
    
    # Display the loaded table
    winners

    Get the FIFA World Cup data

    # Keep only the rows whose tournament is exactly "FIFA World Cup"
    world_cup <- filter(results, tournament == "FIFA World Cup")
    
    # Display the filtered matches
    world_cup

    How many matches and teams in every world cup?

    # One row per year, with `matches` holding the number of rows (matches)
    # recorded for that year
    matches_per_year <- summarise(group_by(world_cup, year), matches = n())
    
    # Display the per-year match counts
    matches_per_year
    # For each year, list every distinct team that appears as either the
    # home or the away side (one row per year/team pair)
    unique_teams_per_year <- reframe(
      group_by(world_cup, year),
      team = unique(c(home_team, away_team))
    )
    
    # Number of distinct teams in each year
    number_of_teams_per_year <- summarize(
      group_by(unique_teams_per_year, year),
      teams = n()
    )
    
    number_of_teams_per_year
    # Combine the match counts and team counts into one table keyed on year
    merged_data <- merge(
      x = matches_per_year,
      y = number_of_teams_per_year,
      by = "year"
    )
    
    # Display the combined table
    merged_data
    # Plot matches and teams per World Cup year as two bar layers sharing the
    # same x position: the "Teams" layer is added after the "Matches" layer.
    # The constant fill labels ("Matches" / "Teams") exist only to generate
    # the legend entries; scale_fill_manual() assigns the two colours to them.
    ggplot(merged_data, aes(x = year, y = matches, fill = "Matches")) +
      geom_col(position = "dodge") +
      geom_col(aes(y = teams, fill = "Teams"), position = "dodge") +
      # Solid baseline at y = 0
      geom_hline(yintercept = 0, color = "black", linetype = "solid") +
      scale_fill_manual(values = c("grey90", "#377771")) +
      labs(
        title = "Matches and Teams in Each World Cup",
        subtitle = "There has been a gradual increase in the number of\nmatches played as the competition adds more teams to compete.",
        caption = "Data: Github @martj42 | Viz: Evan Gower",
        x = "", y = "Number of"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 16, vjust = 6, face = "bold"),
        plot.subtitle = element_text(size = 12, vjust = 7.5),
        plot.caption = element_text(size = 7, vjust = 6),
        plot.margin = margin(40, 20, 0, 20),
        axis.title.y = element_text(size = 10, vjust = 3),
        axis.text.x = element_text(size = 7, vjust = 6),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 12),
        legend.margin = margin(-15, 218, -15, -100)
      ) +
      # `default.unit` was previously misspelled (`defualt.unit`), so it was
      # swallowed by `...` and the key width/height were not interpreted in cm.
      guides(fill = guide_legend(
        keywidth = 1, keyheight = 0.4, default.unit = "cm",
        label.position = "right", nrow = 1
      ))

    Which teams have the most appearances?

    # Number of year/team rows per team, ordered from most to fewest, keeping
    # only the first ten rows
    appearance_by_country <- unique_teams_per_year %>%
      group_by(team) %>%
      tally(name = "appearances") %>%
      arrange(desc(appearances)) %>%
      slice_head(n = 10)
    
    # Display the top-ten table
    appearance_by_country
    # Flag the Brazil row so it can be highlighted in the chart. The comparison
    # already yields a logical vector, so no ifelse() wrapper is needed.
    appearance_by_country$is_brazil <- appearance_by_country$team == "Brazil"
    
    # Bar chart of appearances per team, bars ordered by appearance count,
    # with the Brazil bar filled in the accent colour
    ggplot(
      appearance_by_country,
      aes(reorder(team, appearances), appearances, fill = is_brazil)
    ) +
      geom_col(width = 0.8) +
      # Solid baseline at y = 0
      geom_hline(yintercept = 0, color = "black", linetype = "solid") +
      # Name the colours by flag value so the mapping does not depend on the
      # order of the fill levels
      scale_fill_manual(values = c("FALSE" = "grey90", "TRUE" = "#377771")) +
      labs(
        title = "Teams with the Most Appearances in the World Cup",
        subtitle = "There have been 22 World Cups and Brazil has appeared in every single one.",
        caption = "Data: Github @martj42 | Viz: Evan Gower",
        x = "", y = "Number of Appearances"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 16, vjust = 6, face = "bold"),
        plot.subtitle = element_text(size = 12, vjust = 7.5),
        plot.caption = element_text(size = 7, vjust = 6),
        plot.margin = margin(40, 20, 0, 20),
        axis.title.y = element_text(size = 10, vjust = 3),
        axis.text.x = element_text(size = 8, vjust = 6),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        legend.position = "none"
      )