R4DS2e, Chapter 4

Andrew Pua

2023-09-04

Learning objectives

Get a sense of how to transform data frames when:
- Cleaning a dataset
- Calculating subgroup summaries
Get a sense of the motivation and core ideas behind dplyr verbs and the pipe |>

Jumping in: data cleaning

flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))

Articulate what operations were applied to the data.
What questions were asked to get to this code?

Output: data cleaning

library(dplyr)
library(nycflights13)
flights |> 
  filter(dest == "IAH") |> 
  mutate(speed = distance / air_time * 60) |> 
  select(year:day, dep_time, carrier, flight, speed) |> 
  arrange(desc(speed))

# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

Output: no pipes

library(dplyr)
library(nycflights13)
flightsIAH <- filter(flights, dest == "IAH")  
flightsIAH <- mutate(flightsIAH, speed = distance / air_time * 60)
flightsIAH <- select(flightsIAH, year:day, dep_time, carrier, flight, speed)
flightsIAH <- arrange(flightsIAH, desc(speed))
flightsIAH

# A tibble: 7,198 × 7
    year month   day dep_time carrier flight speed
   <int> <int> <int>    <int> <chr>    <int> <dbl>
 1  2013     7     9      707 UA         226  522.
 2  2013     8    27     1850 UA        1128  521.
 3  2013     8    28      902 UA        1711  519.
 4  2013     8    28     2122 UA        1022  519.
 5  2013     6    11     1628 UA        1178  515.
 6  2013     8    27     1017 UA         333  515.
 7  2013     8    27     1205 UA        1421  515.
 8  2013     8    27     1758 UA         302  515.
 9  2013     9    27      521 UA         252  515.
10  2013     8    28      625 UA         559  515.
# ℹ 7,188 more rows

Jumping in: subgroup summaries

library(dplyr)
library(nycflights13)
flights |> 
  group_by(month) |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n()
  )

Articulate what operations were applied to the data.
What questions were asked to get to this code?

Output: subgroup summaries

library(dplyr)
library(nycflights13)
flights |> 
  group_by(month) |> 
  summarize(
    delay = mean(dep_delay, na.rm = TRUE), 
    n = n()
  )

# A tibble: 12 × 3
   month delay     n
   <int> <dbl> <int>
 1     1 10.0  27004
 2     2 10.8  24951
 3     3 13.2  28834
 4     4 13.9  28330
 5     5 13.0  28796
 6     6 20.8  28243
 7     7 21.7  29425
 8     8 12.6  29327
 9     9  6.72 27574
10    10  6.24 28889
11    11  5.44 27268
12    12 16.6  28135

Output: no pipes

library(dplyr)
library(nycflights13)
flightsGrouped <- group_by(flights, month)   
flightsAvg <- summarize(flightsGrouped, delay = mean(dep_delay, na.rm = TRUE), n = n())
flightsAvg

# A tibble: 12 × 3
   month delay     n
   <int> <dbl> <int>
 1     1 10.0  27004
 2     2 10.8  24951
 3     3 13.2  28834
 4     4 13.9  28330
 5     5 13.0  28796
 6     6 20.8  28243
 7     7 21.7  29425
 8     8 12.6  29327
 9     9  6.72 27574
10    10  6.24 28889
11    11  5.44 27268
12    12 16.6  28135

Why do the previous exercises?

Inspired by Exercise 4.5.7.6, also try introducing typos
Contrast with + in ggplot(): More about “layering”
Introduce you to dplyr verbs right away:
- Applied to rows: filter(), mutate()
- Applied to columns: select(), arrange()
- Applied to groups: group_by(), summarize()
Later, I will contrast with some base R.

Recurring operations

Get variable names
Forming conditions for filtering and matching
Writing out formulas for variable creation
Quick interactive exploration
Answering a question through a series of operations and report a final answer

Get variable names

library(dplyr)
library(nycflights13)
glimpse(flights)

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

colnames(flights) # a base R comparison

 [1] "year"           "month"          "day"            "dep_time"      
 [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
 [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
[13] "origin"         "dest"           "air_time"       "distance"      
[17] "hour"           "minute"         "time_hour"

Forming conditions: Exercise 4.2.5.1

To answer the exercise, it helps to use glimpse first.

Had an arrival delay of two or more hours: arr_delay >= 2
Flew to Houston (IAH or HOU): dest == "IAH" | dest == "HOU" or dest %in% c("IAH", "HOU")
Arrived more than two hours late, but didn’t leave late: arr_delay >= 120 & dep_delay <= 0

A harder question was:

Were delayed by at least an hour, but made up over 30 minutes in flight:
- Suggested solution here was: dep_delay >= 60 & dep_delay - arr_delay > 30
- But somehow this felt more complicated to me.
- So we will do an interactive exploration.

Quick interactive exploration

Verbs like select(), rename(), relocate() are useful for “quick peeks” into the dataset, while on the command line.
Options for select(), relocate(), mutate()
slice_head() and related commands also belong to this category.
They can be useful to get a sense of what the data looks like when you “zoom in” a bit.

library(dplyr)
library(nycflights13)
flights |> 
  select(starts_with(c("dep", "arr")), ends_with("time")) |>
  rename(t_dep = dep_time, t_arr = arr_time, d_arr = arr_delay, 
         d_dep = dep_delay, s_arr = sched_arr_time, s_dep = sched_dep_time) |>
  select(s_dep, t_dep, d_dep, s_arr, t_arr, d_arr, air_time) |>
  # will not work for flights arriving after 000 hours
  mutate(f_time = s_arr - s_dep, ac_time = t_arr - t_dep) |> 
  slice_head(n = 5)

# A tibble: 5 × 9
  s_dep t_dep d_dep s_arr t_arr d_arr air_time f_time ac_time
  <int> <int> <dbl> <int> <int> <dbl>    <dbl>  <int>   <int>
1   515   517     2   819   830    11      227    304     313
2   529   533     4   830   850    20      227    301     317
3   540   542     2   850   923    33      160    310     381
4   545   544    -1  1022  1004   -18      183    477     460
5   600   554    -6   837   812   -25      116    237     258

Exercise 4.2.5.5

Problem: Which flights traveled the farthest distance?

library(dplyr)
library(nycflights13)
flights |> 
  arrange(desc(distance)) |>
  select(distance, origin, dest, carrier, flight)

# A tibble: 336,776 × 5
   distance origin dest  carrier flight
      <dbl> <chr>  <chr> <chr>    <int>
 1     4983 JFK    HNL   HA          51
 2     4983 JFK    HNL   HA          51
 3     4983 JFK    HNL   HA          51
 4     4983 JFK    HNL   HA          51
 5     4983 JFK    HNL   HA          51
 6     4983 JFK    HNL   HA          51
 7     4983 JFK    HNL   HA          51
 8     4983 JFK    HNL   HA          51
 9     4983 JFK    HNL   HA          51
10     4983 JFK    HNL   HA          51
# ℹ 336,766 more rows

library(dplyr)
library(nycflights13)
flights |> 
  slice_max(order_by = distance) |>
  select(distance, origin, dest, carrier, flight)

# A tibble: 342 × 5
   distance origin dest  carrier flight
      <dbl> <chr>  <chr> <chr>    <int>
 1     4983 JFK    HNL   HA          51
 2     4983 JFK    HNL   HA          51
 3     4983 JFK    HNL   HA          51
 4     4983 JFK    HNL   HA          51
 5     4983 JFK    HNL   HA          51
 6     4983 JFK    HNL   HA          51
 7     4983 JFK    HNL   HA          51
 8     4983 JFK    HNL   HA          51
 9     4983 JFK    HNL   HA          51
10     4983 JFK    HNL   HA          51
# ℹ 332 more rows

What if you want the five flights with the farthest distance?

library(dplyr)
library(nycflights13)
flights |> 
  select(distance, origin, dest) |>
  group_by(origin, dest) |>
  distinct() |>
  ungroup() |>
  slice_max(order_by = distance, n = 5)

# A tibble: 5 × 3
  distance origin dest 
     <dbl> <chr>  <chr>
1     4983 JFK    HNL  
2     4963 EWR    HNL  
3     3370 EWR    ANC  
4     2586 JFK    SFO  
5     2576 JFK    OAK

Mix cleaning, subgroup summaries, and visualization

Problem: Which plane traveled the most times to LAX in 2013? Plot the distribution of its delay times.

library(dplyr)
library(nycflights13)
plane <- flights |> 
  filter(dest == "LAX") |>
  group_by(tailnum) |>
  summarize(timesToLAX = n()) |>
  slice_max(order_by = timesToLAX) |>
  select(tailnum)
plane

# A tibble: 1 × 1
  tailnum
  <chr>  
1 N328AA

library(dplyr)
library(nycflights13)
library(ggplot2)
flights |>
  filter(tailnum == as.character(plane)) |>
  mutate(totalDelay = dep_delay + arr_delay) |>
  ggplot(aes(x = totalDelay)) +
      geom_histogram()

Contrast: cleaning with base R

library(nycflights13)
flightsIAH <- flights[flights$dest == "IAH", ]
flightsIAH$speed <- flightsIAH$distance / flightsIAH$air_time * 60
# flightsIAH$speed <- with(flightsIAH, distance / air_time * 60)
flightsIAH <- flightsIAH[, c("year", "month", "day", "carrier", "flight", "speed")]
flightsIAH <- flightsIAH[order(flightsIAH$speed, decreasing = TRUE), ]
# flightsIAH$speed <- sort(flightsIAH$speed, decreasing = TRUE, na.last = TRUE)
flightsIAH

# A tibble: 7,198 × 6
    year month   day carrier flight speed
   <int> <int> <int> <chr>    <int> <dbl>
 1  2013     7     9 UA         226  522.
 2  2013     8    27 UA        1128  521.
 3  2013     8    28 UA        1711  519.
 4  2013     8    28 UA        1022  519.
 5  2013     6    11 UA        1178  515.
 6  2013     8    27 UA         333  515.
 7  2013     8    27 UA        1421  515.
 8  2013     8    27 UA         302  515.
 9  2013     9    27 UA         252  515.
10  2013     8    28 UA         559  515.
# ℹ 7,188 more rows

Contrast: group summaries with base R

library(nycflights13)
flightsAvg <- data.frame(month = 1:12)
flightsAvg$delay <- tapply(flights$dep_delay, flights$month, mean, na.rm = TRUE)
flightsAvg$n <- tapply(flights$dep_delay, flights$month, length)
flightsAvg

   month     delay     n
1      1 10.036665 27004
2      2 10.816843 24951
3      3 13.227076 28834
4      4 13.938038 28330
5      5 12.986859 28796
6      6 20.846332 28243
7      7 21.727787 29425
8      8 12.611040 29327
9      9  6.722476 27574
10    10  6.243988 28889
11    11  5.435362 27268
12    12 16.576688 28135