Querying and Wrangling Data

Megha Joshi

Introduction to the tidyverse

Hex stickers for all the packages that make up the tidyverse

Source: https://www.tidyverse.org/
  • Set of packages for
    • wrangling
    • cleaning
    • organizing
    • summarizing
    • visualizing data

Example Dataset: Men’s World Cup Matches

Source: tidytuesdayR

library(tidyverse)
library(tidyverse)
library(tidytuesdayR)


tt <- tt_load("2022-11-29")
tt

wc_matches <- tt$wcmatches

Example Dataset: Men’s World Cup Matches

glimpse(wc_matches)
Rows: 900
Columns: 15
$ year           <dbl> 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1930, 1…
$ country        <chr> "Uruguay", "Uruguay", "Uruguay", "Uruguay", "Uruguay", …
$ city           <chr> "Montevideo", "Montevideo", "Montevideo", "Montevideo",…
$ stage          <chr> "Group 1", "Group 4", "Group 2", "Group 3", "Group 1", …
$ home_team      <chr> "France", "Belgium", "Brazil", "Peru", "Argentina", "Ch…
$ away_team      <chr> "Mexico", "United States", "Yugoslavia", "Romania", "Fr…
$ home_score     <dbl> 4, 0, 1, 1, 1, 3, 0, 0, 1, 6, 1, 0, 0, 4, 3, 6, 6, 4, 2…
$ away_score     <dbl> 1, 3, 2, 3, 0, 0, 4, 3, 0, 3, 0, 1, 4, 0, 1, 1, 1, 2, 3…
$ outcome        <chr> "H", "A", "A", "A", "H", "H", "A", "A", "H", "H", "H", …
$ win_conditions <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ winning_team   <chr> "France", "United States", "Yugoslavia", "Romania", "Ar…
$ losing_team    <chr> "Mexico", "Belgium", "Brazil", "Peru", "France", "Mexic…
$ date           <date> 1930-07-13, 1930-07-13, 1930-07-14, 1930-07-14, 1930-0…
$ month          <chr> "Jul", "Jul", "Jul", "Jul", "Jul", "Jul", "Jul", "Jul",…
$ dayofweek      <chr> "Sunday", "Sunday", "Monday", "Monday", "Tuesday", "Wed…

dplyr

Verbs

Verb Definition
select() Retrieve particular variables by their name or position
filter() Retrieve rows meeting condition(s)
mutate() Add a variable or transform existing ones
group_by() Groups rows that have same value on one or more variables
summarize() Run functions on aggregated data
arrange() Sort based on a variable in ascending or descending order
slice() Retrieve rows of the data based on their location
case_when() Vectorize multiple ifelse() statements

select()

wc_matches %>%
  select(year, country, stage, home_team, away_team)
# A tibble: 900 × 5
    year country stage   home_team away_team    
   <dbl> <chr>   <chr>   <chr>     <chr>        
 1  1930 Uruguay Group 1 France    Mexico       
 2  1930 Uruguay Group 4 Belgium   United States
 3  1930 Uruguay Group 2 Brazil    Yugoslavia   
 4  1930 Uruguay Group 3 Peru      Romania      
 5  1930 Uruguay Group 1 Argentina France       
 6  1930 Uruguay Group 1 Chile     Mexico       
 7  1930 Uruguay Group 2 Bolivia   Yugoslavia   
 8  1930 Uruguay Group 4 Paraguay  United States
 9  1930 Uruguay Group 3 Uruguay   Peru         
10  1930 Uruguay Group 1 Argentina Mexico       
# ℹ 890 more rows

filter()

wc_matches %>%
  filter(stage == "Final")
# A tibble: 20 × 15
    year country   city  stage home_team away_team home_score away_score outcome
   <dbl> <chr>     <chr> <chr> <chr>     <chr>          <dbl>      <dbl> <chr>  
 1  1930 Uruguay   Mont… Final Uruguay   Argentina          4          2 H      
 2  1934 Italy     Rome  Final Italy     Czechosl…          2          1 H      
 3  1938 France    Colo… Final Hungary   Italy              2          4 A      
 4  1954 Switzerl… Berne Final West Ger… Hungary            3          2 H      
 5  1958 Sweden    Solna Final Sweden    Brazil             2          5 A      
 6  1962 Chile     Sant… Final Brazil    Czechosl…          3          1 H      
 7  1966 England   Lond… Final England   West Ger…          4          2 H      
 8  1970 Mexico    Mexi… Final Brazil    Italy              4          1 H      
 9  1974 Germany   Muni… Final West Ger… Netherla…          2          1 H      
10  1978 Argentina Buen… Final Argentina Netherla…          3          1 H      
11  1982 Spain     Madr… Final Italy     West Ger…          3          1 H      
12  1986 Mexico    Mexi… Final Argentina West Ger…          3          2 H      
13  1990 Italy     Rome  Final West Ger… Argentina          1          0 H      
14  1994 United S… Pasa… Final Brazil    Italy              0          0 H      
15  1998 France    Sain… Final France    Brazil             3          0 H      
16  2002 Japan     Yoko… Final Germany   Brazil             0          2 A      
17  2006 Germany   Berl… Final Italy     France             1          1 H      
18  2010 South Af… Joha… Final Netherla… Spain              0          1 A      
19  2014 Brazil    Rio … Final Germany   Argentina          1          0 H      
20  2018 Russia    Mosc… Final France    Croatia            4          2 H      
# ℹ 6 more variables: win_conditions <chr>, winning_team <chr>,
#   losing_team <chr>, date <date>, month <chr>, dayofweek <chr>

mutate()

wc_matches <- 
  wc_matches %>%
  mutate(stage = ifelse(stage == "Final Round", 
                        "Final", stage),
         score_diff = abs(home_score - away_score)) 

wc_matches %>%
  select(year, stage, home_team, away_team, score_diff)
# A tibble: 900 × 5
    year stage   home_team away_team     score_diff
   <dbl> <chr>   <chr>     <chr>              <dbl>
 1  1930 Group 1 France    Mexico                 3
 2  1930 Group 4 Belgium   United States          3
 3  1930 Group 2 Brazil    Yugoslavia             1
 4  1930 Group 3 Peru      Romania                2
 5  1930 Group 1 Argentina France                 1
 6  1930 Group 1 Chile     Mexico                 3
 7  1930 Group 2 Bolivia   Yugoslavia             4
 8  1930 Group 4 Paraguay  United States          3
 9  1930 Group 3 Uruguay   Peru                   1
10  1930 Group 1 Argentina Mexico                 3
# ℹ 890 more rows

group_by() and summarize()

avg_score_diff <- 
  wc_matches %>%
  filter(stage %in% c("Final", 
                      "Semifinals", 
                      "Quarterfinals", 
                      "Round of 16")) %>%
  group_by(stage) %>%
  summarize(n_matches = n(),
            mean_diff = mean(score_diff),
            sd_diff = sd(score_diff))

avg_score_diff
# A tibble: 4 × 4
  stage         n_matches mean_diff sd_diff
  <chr>             <int>     <dbl>   <dbl>
1 Final                26      1.81    1.39
2 Quarterfinals        66      1.30    1.29
3 Round of 16          89      1.47    1.18
4 Semifinals           36      1.75    1.52

arrange() and case_when()

avg_score_diff <- 
  avg_score_diff %>%
  ungroup() %>%
  mutate(order = case_when(stage == "Round of 16" ~ 1,
                           stage == "Quarterfinals" ~ 2, 
                           stage == "Semifinals" ~ 3, 
                           stage == "Final" ~ 4)) %>%
  arrange(order)

avg_score_diff
# A tibble: 4 × 5
  stage         n_matches mean_diff sd_diff order
  <chr>             <int>     <dbl>   <dbl> <dbl>
1 Round of 16          89      1.47    1.18     1
2 Quarterfinals        66      1.30    1.29     2
3 Semifinals           36      1.75    1.52     3
4 Final                26      1.81    1.39     4

slice()

avg_score_diff %>%
  arrange(desc(mean_diff)) %>%
  slice(1)
# A tibble: 1 × 5
  stage n_matches mean_diff sd_diff order
  <chr>     <int>     <dbl>   <dbl> <dbl>
1 Final        26      1.81    1.39     4

tidyr

Verbs

Verb Definition
pivot_longer() Transform data to long format
pivot_wider() Transform data to wide format
separate() Separate a column into multiple columns
unite() Unite multiple columns into one column

pivot_longer()

avg_score_diff
# A tibble: 4 × 5
  stage         n_matches mean_diff sd_diff order
  <chr>             <int>     <dbl>   <dbl> <dbl>
1 Round of 16          89      1.47    1.18     1
2 Quarterfinals        66      1.30    1.29     2
3 Semifinals           36      1.75    1.52     3
4 Final                26      1.81    1.39     4

pivot_longer()

long_dat <- 
  avg_score_diff %>%
  select(-order) %>%
  pivot_longer(n_matches:sd_diff,
               names_to = 'stat', 
               values_to = 'val')

long_dat
# A tibble: 12 × 3
   stage         stat        val
   <chr>         <chr>     <dbl>
 1 Round of 16   n_matches 89   
 2 Round of 16   mean_diff  1.47
 3 Round of 16   sd_diff    1.18
 4 Quarterfinals n_matches 66   
 5 Quarterfinals mean_diff  1.30
 6 Quarterfinals sd_diff    1.29
 7 Semifinals    n_matches 36   
 8 Semifinals    mean_diff  1.75
 9 Semifinals    sd_diff    1.52
10 Final         n_matches 26   
11 Final         mean_diff  1.81
12 Final         sd_diff    1.39

separate()

long_dat_clean <- 
  long_dat %>%
  separate(stat, into = c("stat", "var"), sep = "_") 

long_dat_clean
# A tibble: 12 × 4
   stage         stat  var       val
   <chr>         <chr> <chr>   <dbl>
 1 Round of 16   n     matches 89   
 2 Round of 16   mean  diff     1.47
 3 Round of 16   sd    diff     1.18
 4 Quarterfinals n     matches 66   
 5 Quarterfinals mean  diff     1.30
 6 Quarterfinals sd    diff     1.29
 7 Semifinals    n     matches 36   
 8 Semifinals    mean  diff     1.75
 9 Semifinals    sd    diff     1.52
10 Final         n     matches 26   
11 Final         mean  diff     1.81
12 Final         sd    diff     1.39

pivot_wider()

long_dat_clean %>%
  select(-var) %>%
  pivot_wider(names_from = stat, values_from = val) %>%
  rename(Stage = stage, 
         `Number of Matches` = n,
         `Average Difference in Goals` = mean,
         `SD of Difference in Goals` = sd)
# A tibble: 4 × 4
  Stage        `Number of Matches` Average Difference i…¹ SD of Difference in …²
  <chr>                      <dbl>                  <dbl>                  <dbl>
1 Round of 16                   89                   1.47                   1.18
2 Quarterfina…                  66                   1.30                   1.29
3 Semifinals                    36                   1.75                   1.52
4 Final                         26                   1.81                   1.39
# ℹ abbreviated names: ¹​`Average Difference in Goals`,
#   ²​`SD of Difference in Goals`

unite()

wc_matches %>%
  unite(world_cup, year, country, sep =  ": ") %>%
  filter(stage == "Final")
# A tibble: 26 × 15
   world_cup       city  stage home_team away_team home_score away_score outcome
   <chr>           <chr> <chr> <chr>     <chr>          <dbl>      <dbl> <chr>  
 1 1930: Uruguay   Mont… Final Uruguay   Argentina          4          2 H      
 2 1934: Italy     Rome  Final Italy     Czechosl…          2          1 H      
 3 1938: France    Colo… Final Hungary   Italy              2          4 A      
 4 1950: Brazil    Rio … Final Brazil    Sweden             7          1 H      
 5 1950: Brazil    São … Final Spain     Uruguay            2          2 D      
 6 1950: Brazil    Rio … Final Brazil    Spain              6          1 H      
 7 1950: Brazil    São … Final Sweden    Uruguay            2          3 A      
 8 1950: Brazil    Rio … Final Brazil    Uruguay            1          2 A      
 9 1950: Brazil    São … Final Spain     Sweden             1          3 A      
10 1954: Switzerl… Berne Final West Ger… Hungary            3          2 H      
# ℹ 16 more rows
# ℹ 7 more variables: win_conditions <chr>, winning_team <chr>,
#   losing_team <chr>, date <date>, month <chr>, dayofweek <chr>,
#   score_diff <dbl>

Thank you!