R4DS2e, Chapter 4

Andrew Pua

2023-09-04

Learning objectives

  • Get a sense of how to transform data frames when:

    • Cleaning a dataset
    • Calculating subgroup summaries
  • Get a sense of the motivation and core ideas behind dplyr verbs and the pipe |>

Jumping in: data cleaning

flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))
  • Articulate what operations were applied to the data.
  • What questions were asked to get to this code?

Output: data cleaning

library(dplyr)
library(nycflights13)
flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))
# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

Output: no pipes

library(dplyr)
library(nycflights13)
flightsIAH <- filter(flights, dest == "IAH")  
flightsIAH <- mutate(flightsIAH, speed = distance / air_time * 60)
flightsIAH <- select(flightsIAH, year:day, dep_time, carrier, flight, speed)
flightsIAH <- arrange(flightsIAH, desc(speed))
flightsIAH
# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

Jumping in: subgroup summaries

library(dplyr)
library(nycflights13)
flights |> 
  group_by(month) |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n()
  )
  • Articulate what operations were applied to the data.
  • What questions were asked to get to this code?

Output: subgroup summaries

library(dplyr)
library(nycflights13)
flights |> 
  group_by(month) |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n()
  )
# A tibble: 12 × 3
   month delay     n
   <int> <dbl> <int>
 1     1 10.0  27004
 2     2 10.8  24951
 3     3 13.2  28834
 4     4 13.9  28330
 5     5 13.0  28796
 6     6 20.8  28243
 7     7 21.7  29425
 8     8 12.6  29327
 9     9  6.72 27574
10    10  6.24 28889
11    11  5.44 27268
12    12 16.6  28135

Output: no pipes

library(dplyr)
library(nycflights13)
flightsGrouped <- group_by(flights, month)   
flightsAvg <- summarize(flightsGrouped, delay = mean(dep_delay, na.rm = TRUE), n = n())
flightsAvg
# A tibble: 12 × 3
   month delay     n
   <int> <dbl> <int>
 1     1 10.0  27004
 2     2 10.8  24951
 3     3 13.2  28834
 4     4 13.9  28330
 5     5 13.0  28796
 6     6 20.8  28243
 7     7 21.7  29425
 8     8 12.6  29327
 9     9  6.72 27574
10    10  6.24 28889
11    11  5.44 27268
12    12 16.6  28135

Why do the previous exercises?

  • Inspired by Exercise 4.5.7.6, also try introducing typos

  • Contrast with + in ggplot(): More about “layering”

  • Introduce you to dplyr verbs right away:

    • Applied to rows: filter(), mutate()
    • Applied to columns: select(), arrange()
    • Applied to groups: group_by(), summarize()
  • Later, I will contrast with some base R.

Recurring operations

  • Get variable names
  • Forming conditions for filtering and matching
  • Writing out formulas for variable creation
  • Quick interactive exploration
  • Answering a question through a series of operations and report a final answer

Get variable names

library(dplyr)
library(nycflights13)
glimpse(flights)
Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
colnames(flights) # a base R comparison
 [1] "year"           "month"          "day"            "dep_time"      
 [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
 [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
[13] "origin"         "dest"           "air_time"       "distance"      
[17] "hour"           "minute"         "time_hour"     

Forming conditions: Exercise 4.2.5.1

To answer the exercise, it helps to use glimpse first.

  • Had an arrival delay of two or more hours: arr_delay >= 2
  • Flew to Houston (IAH or HOU): dest == "IAH" | dest == "HOU" or dest %in% c("IAH", "HOU")
  • Arrived more than two hours late, but didn’t leave late: arr_delay >= 120 & dep_delay <= 0

A harder question was:

  • Were delayed by at least an hour, but made up over 30 minutes in flight:

    • Suggested solution here was: dep_delay >= 60 & dep_delay - arr_delay > 30
    • But somehow this felt more complicated to me.
    • So we will do an interactive exploration.

Quick interactive exploration

  • Verbs like select(), rename(), relocate() are useful for “quick peeks” into the dataset, while on the command line.
  • Options for select(), relocate(), mutate()
  • slice_head() and related commands also belong to this category.
  • They can be useful to get a sense of what the data looks like when you “zoom in” a bit.
library(dplyr)
library(nycflights13)
flights |> 
  select(starts_with(c("dep", "arr")), ends_with("time")) |>
  rename(t_dep = dep_time, t_arr = arr_time, d_arr = arr_delay, 
         d_dep = dep_delay, s_arr = sched_arr_time, s_dep = sched_dep_time) |>
  select(s_dep, t_dep, d_dep, s_arr, t_arr, d_arr, air_time) |>
  # will not work for flights arriving after 000 hours
  mutate(f_time = s_arr - s_dep, ac_time = t_arr - t_dep) |> 
  slice_head(n = 5)
# A tibble: 5 × 9
  s_dep t_dep d_dep s_arr t_arr d_arr air_time f_time ac_time
  <int> <int> <dbl> <int> <int> <dbl>    <dbl>  <int>   <int>
1   515   517     2   819   830    11      227    304     313
2   529   533     4   830   850    20      227    301     317
3   540   542     2   850   923    33      160    310     381
4   545   544    -1  1022  1004   -18      183    477     460
5   600   554    -6   837   812   -25      116    237     258

Exercise 4.2.5.5

Problem: Which flights traveled the farthest distance?

library(dplyr)
library(nycflights13)
flights |> 
  arrange(desc(distance)) |>
  select(distance, origin, dest, carrier, flight)
# A tibble: 336,776 × 5
   distance origin dest  carrier flight
      <dbl> <chr>  <chr> <chr>    <int>
 1     4983 JFK    HNL   HA          51
 2     4983 JFK    HNL   HA          51
 3     4983 JFK    HNL   HA          51
 4     4983 JFK    HNL   HA          51
 5     4983 JFK    HNL   HA          51
 6     4983 JFK    HNL   HA          51
 7     4983 JFK    HNL   HA          51
 8     4983 JFK    HNL   HA          51
 9     4983 JFK    HNL   HA          51
10     4983 JFK    HNL   HA          51
# ℹ 336,766 more rows
library(dplyr)
library(nycflights13)
flights |> 
  slice_max(order_by = distance) |>
  select(distance, origin, dest, carrier, flight)
# A tibble: 342 × 5
   distance origin dest  carrier flight
      <dbl> <chr>  <chr> <chr>    <int>
 1     4983 JFK    HNL   HA          51
 2     4983 JFK    HNL   HA          51
 3     4983 JFK    HNL   HA          51
 4     4983 JFK    HNL   HA          51
 5     4983 JFK    HNL   HA          51
 6     4983 JFK    HNL   HA          51
 7     4983 JFK    HNL   HA          51
 8     4983 JFK    HNL   HA          51
 9     4983 JFK    HNL   HA          51
10     4983 JFK    HNL   HA          51
# ℹ 332 more rows

What if you want the five flights with the farthest distance?

library(dplyr)
library(nycflights13)
flights |> 
  select(distance, origin, dest) |>
  group_by(origin, dest) |>
  distinct() |>
  ungroup() |>
  slice_max(order_by = distance, n = 5)
# A tibble: 5 × 3
  distance origin dest 
     <dbl> <chr>  <chr>
1     4983 JFK    HNL  
2     4963 EWR    HNL  
3     3370 EWR    ANC  
4     2586 JFK    SFO  
5     2576 JFK    OAK  

Mix cleaning, subgroup summaries, and visualization

Problem: Which plane traveled the most times to LAX in 2013? Plot the distribution of its delay times.

library(dplyr)
library(nycflights13)
plane <- flights |> 
  filter(dest == "LAX") |>
  group_by(tailnum) |>
  summarize(timesToLAX = n()) |>
  slice_max(order_by = timesToLAX) |>
  select(tailnum)
plane
# A tibble: 1 × 1
  tailnum
  <chr>  
1 N328AA 
library(dplyr)
library(nycflights13)
library(ggplot2)
flights |>
  filter(tailnum == as.character(plane)) |>
  mutate(totalDelay = dep_delay + arr_delay) |>
  ggplot(aes(x = totalDelay)) +
      geom_histogram()

Contrast: cleaning with base R

library(nycflights13)
flightsIAH <- flights[flights$dest == "IAH", ]
flightsIAH$speed <- flightsIAH$distance / flightsIAH$air_time * 60
# flightsIAH$speed <- with(flightsIAH, distance / air_time * 60)
flightsIAH <- flightsIAH[, c("year", "month", "day", "carrier", "flight", "speed")]
flightsIAH <- flightsIAH[order(flightsIAH$speed, decreasing = TRUE), ]
# flightsIAH$speed <- sort(flightsIAH$speed, decreasing = TRUE, na.last = TRUE)
flightsIAH
# A tibble: 7,198 × 6
    year month   day carrier flight speed
   <int> <int> <int> <chr>    <int> <dbl>
 1  2013     7     9 UA         226  522.
 2  2013     8    27 UA        1128  521.
 3  2013     8    28 UA        1711  519.
 4  2013     8    28 UA        1022  519.
 5  2013     6    11 UA        1178  515.
 6  2013     8    27 UA         333  515.
 7  2013     8    27 UA        1421  515.
 8  2013     8    27 UA         302  515.
 9  2013     9    27 UA         252  515.
10  2013     8    28 UA         559  515.
# ℹ 7,188 more rows

Contrast: group summaries with base R

library(nycflights13)
flightsAvg <- data.frame(month = 1:12)
flightsAvg$delay <- tapply(flights$dep_delay, flights$month, mean, na.rm = TRUE)
flightsAvg$n <- tapply(flights$dep_delay, flights$month, length)
flightsAvg
   month     delay     n
1      1 10.036665 27004
2      2 10.816843 24951
3      3 13.227076 28834
4      4 13.938038 28330
5      5 12.986859 28796
6      6 20.846332 28243
7      7 21.727787 29425
8      8 12.611040 29327
9      9  6.722476 27574
10    10  6.243988 28889
11    11  5.435362 27268
12    12 16.576688 28135