Strings and Factors

Megha Joshi

Strings

Introduction to stringr

Hexagon logo of the stringr package with a violin in the middle

Source: https://stringr.tidyverse.org/
  • What are strings?

    • One or more characters enclosed in single ('...') or double ("...") quotes
  • stringr is a package in tidyverse for manipulating and working with strings

    # Create a string by wrapping text in double quotes, then print it
    x <- "This is a string."
    x
    [1] "This is a string."

Verbs

Verb Definition
str_detect() Returns TRUE if a pattern is present in a string
str_split() Split a string into two or more pieces at a pattern
str_remove() Remove a pattern from a string
str_sub() Extract parts of strings given locations
str_wrap() Wrap a long string into lines of a given width

These are just a handful of available functions. Please check out https://stringr.tidyverse.org/reference/index.html for more.

Example Dataset

library(tidytuesdayR)
library(tidyverse)

# Load the TidyTuesday data for 2025-02-11
tuesdata <- tidytuesdayR::tt_load('2025-02-11')
# Pull the cdc_datasets element out of the loaded object
cdc_data <- tuesdata$cdc_datasets

# Preview every column with its type and first few values
glimpse(cdc_data)
Rows: 1,257
Columns: 27
$ dataset_url                  <chr> "https://archive.org/download/20250128-cd…
$ contact_name                 <chr> "Physical Effects Research Branch (PERB),…
$ contact_email                <chr> "sa-cin-webteam@cdc.gov", "sa-cin-webteam…
$ bureau_code                  <chr> "009:20", "009:20", "009:20", "009:20", "…
$ program_code                 <chr> "009:034", "009:034", "009:034", "009:034…
$ category                     <chr> "National Institute for Occupational Safe…
$ tags                         <chr> "This dataset does not have any tags", "T…
$ publisher                    <chr> NA, NA, NA, NA, "Active Bacterial Core Su…
$ public_access_level          <chr> NA, NA, NA, NA, "public", NA, "public", "…
$ footnotes                    <chr> NA, NA, NA, NA, "*ABCs IPD Isolates were …
$ license                      <chr> NA, NA, NA, NA, NA, "The license for this…
$ source_link                  <chr> NA, NA, NA, NA, NA, "http://www.cdc.gov/p…
$ issued                       <chr> NA, NA, NA, NA, NA, NA, "2021-10-01", "20…
$ geographic_coverage          <chr> NA, NA, NA, NA, NA, NA, "US", "US", NA, N…
$ temporal_applicability       <chr> NA, NA, NA, NA, NA, NA, "2020-07-01/2021-…
$ update_frequency             <chr> NA, NA, NA, NA, NA, NA, "monthly", "month…
$ described_by                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://…
$ homepage                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://…
$ geographic_unit_of_analysis  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ suggested_citation           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ geospatial_resolution        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ references                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ glossary_methodology         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ access_level_comment         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ analytical_methods_reference <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ language                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ collection                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

str_detect()

# One TRUE/FALSE per row of cdc_data: does the tags value contain "covid"?
str_detect(cdc_data$tags, "covid")
   [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
  [13] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
  [37] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
  [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
  [73]  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
  [97]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
 [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
 [193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
 [205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
 [217] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
 [229]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [289] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
 [301]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [337] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [349] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [361] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE
 [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
 [397] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [445] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
 [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [649] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [661] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [673] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [685] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [697] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [709] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [721] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [733] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [745] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [757] FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [769] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [781] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [793] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [805] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [817] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [829] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [841] FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE
 [853] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [865] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE
 [877] FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [889]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [901]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [913] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [925] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
 [937] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
 [949] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [961] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [973] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [985] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [997] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1009] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
[1021]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
[1033]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
[1045]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1057]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE
[1069] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
[1081] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[1093] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
[1105]  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
[1117]  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
[1129]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[1141]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1153] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1165] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1189] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1213] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1225] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1237] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[1249] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

str_detect() with filter()

# Keep only the rows whose tags value contains "covid"
cdc_data %>%
  filter(str_detect(tags, "covid"))
# A tibble: 209 × 27
   dataset_url      contact_name contact_email bureau_code program_code category
   <chr>            <chr>        <chr>         <chr>       <chr>        <chr>   
 1 https://archive… CDC-INFO     cdcinfo@cdci… 009:20      009:020      Laborat…
 2 https://archive… CDC-INFO     cdcinfo@cdci… 009:20      009:020      Laborat…
 3 https://archive… CDC-INFO     cdcinfo@cdci… 009:20      009:020      Laborat…
 4 https://archive… National Sy… nssp@cdc.gov  009:20      009:037      Public …
 5 https://archive… National Sy… nssp@cdc.gov  009:20      009:026      Public …
 6 https://archive… National Ce… cdcinfo@cdc.… 009:20      009:020      NCHS    
 7 https://archive… National Ce… cdcinfo@cdc.… 009:20      009:020      NCHS    
 8 https://archive… National Ce… cdcinfo@cdc.… 009:20      009:020      NCHS    
 9 https://archive… National Ce… cdcinfo@cdc.… 009:20      009:020      NCHS    
10 https://archive… National Ce… cdcinfo@cdc.… 009:20      009:020      NCHS    
# ℹ 199 more rows
# ℹ 21 more variables: tags <chr>, publisher <chr>, public_access_level <chr>,
#   footnotes <chr>, license <chr>, source_link <chr>, issued <chr>,
#   geographic_coverage <chr>, temporal_applicability <chr>,
#   update_frequency <chr>, described_by <chr>, homepage <chr>,
#   geographic_unit_of_analysis <chr>, suggested_citation <chr>,
#   geospatial_resolution <chr>, references <chr>, …

str_split()

# Work with the contact e-mail column as a plain character vector
emails <- cdc_data$contact_email

# Split each address at "@" and show the first six results;
# each result is a character vector of the pieces
head(str_split(emails, "@"))
[[1]]
[1] "sa-cin-webteam" "cdc.gov"       

[[2]]
[1] "sa-cin-webteam" "cdc.gov"       

[[3]]
[1] "sa-cin-webteam" "cdc.gov"       

[[4]]
[1] "HHERequestHelp" "cdc.gov"       

[[5]]
[1] "abcs"    "cdc.gov"

[[6]]
[1] "cdcinfo" "cdc.gov"

str_remove()

# Add bureau_code_new: bureau_code with the ":" removed
# (e.g. "009:20" becomes "00920"), then show old and new side by side
cdc_data %>%
  mutate(bureau_code_new = str_remove(bureau_code, ":")) %>%
  select(bureau_code, bureau_code_new)
# A tibble: 1,257 × 2
   bureau_code bureau_code_new
   <chr>       <chr>          
 1 009:20      00920          
 2 009:20      00920          
 3 009:20      00920          
 4 009:20      00920          
 5 009:20      00920          
 6 009:00      00900          
 7 009:20      00920          
 8 009:20      00920          
 9 009:20      00920          
10 009:20      00920          
# ℹ 1,247 more rows

str_sub()

# Add program_three: characters 5 through 7 of program_code
# (e.g. "034" from "009:034"), then show old and new side by side
cdc_data %>%
  mutate(program_three = str_sub(program_code, 5, 7)) %>%
  select(program_code, program_three)
# A tibble: 1,257 × 2
   program_code program_three
   <chr>        <chr>        
 1 009:034      034          
 2 009:034      034          
 3 009:034      034          
 4 009:034      034          
 5 009:020      020          
 6 009:020      020          
 7 009:020      020          
 8 009:020      020          
 9 009:020      020          
10 009:020      020          
# ℹ 1,247 more rows

str_wrap()

# Five categories with the most datasets, largest first.
# count(sort = TRUE) tallies rows per category and orders them by n in
# descending order, returning an ungrouped tibble, so the separate
# group_by() / ungroup() / arrange() steps are not needed.
highest_cat <-
  cdc_data %>%
  count(category, sort = TRUE) %>%
  slice_head(n = 5)

# Bar chart of the dataset count (n) for each of the top categories.
# geom_col() draws bars whose heights are the values already stored in n.
# Category names are wrapped at a width of 20 for the x-axis labels.
p <-
  ggplot(highest_cat, aes(x = category, y = n)) +
  geom_col() +
  scale_x_discrete(labels = function(lbl) str_wrap(lbl, width = 20)) +
  theme_bw()

The Plot

# Display the bar chart stored in p
p

Factors

Introduction to forcats

Hexagon logo of the forcats package with image of a box with four cats in it

Source: https://forcats.tidyverse.org/
  • What are factors?

    • Categorical variables with a fixed set of levels (possible values)
  • forcats is a package in tidyverse for working with factors

    # Build a factor from the first five capital letters
    factor(LETTERS[1:5])
    [1] A B C D E
    Levels: A B C D E

Verbs

Verb Definition
fct_relevel() Manually reorder the levels of a factor
fct_lump() Lump uncommon levels of a factor together into "Other"
fct_infreq() Order the levels of a factor by frequency

These are again just a handful of available functions. Please check out https://forcats.tidyverse.org/reference/index.html for more.

Categories from CDC Data

# Copy of the data with category converted from character to factor
cdc_cat <- 
  cdc_data %>%
  mutate(category = factor(category))

# First six levels of the new factor
head(levels(cdc_cat$category))
[1] "500 Cities & Places"                   
[2] "Administrative"                        
[3] "Assisted Reproductive Technology (ART)"
[4] "Behavioral Risk Factors"               
[5] "Cancer Research Citation Search"       
[6] "Case Surveillance"                     

fct_relevel()

# Move "500 Cities & Places" so that it sits after the first two levels
# (third position); the other levels keep their relative order.
cdc_data <- cdc_data %>%
  mutate(
    new_cat = fct_relevel(category, "500 Cities & Places", after = 2)
  )

# Inspect the first six levels to confirm the move
cdc_data$new_cat %>%
  levels() %>%
  head()
[1] "Administrative"                        
[2] "Assisted Reproductive Technology (ART)"
[3] "500 Cities & Places"                   
[4] "Behavioral Risk Factors"               
[5] "Cancer Research Citation Search"       
[6] "Case Surveillance"                     

fct_lump()

# Keep three categories as their own levels and collapse all remaining
# categories into "Other" (fct_lump_n() keeps the n most frequent levels)
cdc_data <- 
  cdc_data %>%
  mutate(cat_short = fct_lump_n(category, n = 3))

# Number of datasets in each lumped level
table(cdc_data$cat_short)

National Institute for Occupational Safety and Health 
                                                  108 
                                                 NCHS 
                                                  184 
                                                NNDSS 
                                                  293 
                                                Other 
                                                  672 

fct_infreq()

# Reorder the levels of cat_short from most to least frequent
cdc_data <- 
  cdc_data %>%
  mutate(cat_short = fct_infreq(cat_short))

# The table now lists the levels in descending order of count
table(cdc_data$cat_short)

                                                Other 
                                                  672 
                                                NNDSS 
                                                  293 
                                                 NCHS 
                                                  184 
National Institute for Occupational Safety and Health 
                                                  108 

Thank you!