2023-09-04
Get a sense of how to transform data frames when:
Get a sense of the motivation and core ideas behind dplyr
verbs and the pipe |>
library(dplyr)
library(nycflights13)
flights |>
filter(dest == "IAH") |>
mutate(speed = distance / air_time * 60) |>
select(year:day, dep_time, carrier, flight, speed) |>
arrange(desc(speed))
# A tibble: 7,198 × 7
year month day dep_time carrier flight speed
<int> <int> <int> <int> <chr> <int> <dbl>
1 2013 7 9 707 UA 226 522.
2 2013 8 27 1850 UA 1128 521.
3 2013 8 28 902 UA 1711 519.
4 2013 8 28 2122 UA 1022 519.
5 2013 6 11 1628 UA 1178 515.
6 2013 8 27 1017 UA 333 515.
7 2013 8 27 1205 UA 1421 515.
8 2013 8 27 1758 UA 302 515.
9 2013 9 27 521 UA 252 515.
10 2013 8 28 625 UA 559 515.
# ℹ 7,188 more rows
library(dplyr)
library(nycflights13)
flightsIAH <- filter(flights, dest == "IAH")
flightsIAH <- mutate(flightsIAH, speed = distance / air_time * 60)
flightsIAH <- select(flightsIAH, year:day, dep_time, carrier, flight, speed)
flightsIAH <- arrange(flightsIAH, desc(speed))
flightsIAH
# A tibble: 7,198 × 7
year month day dep_time carrier flight speed
<int> <int> <int> <int> <chr> <int> <dbl>
1 2013 7 9 707 UA 226 522.
2 2013 8 27 1850 UA 1128 521.
3 2013 8 28 902 UA 1711 519.
4 2013 8 28 2122 UA 1022 519.
5 2013 6 11 1628 UA 1178 515.
6 2013 8 27 1017 UA 333 515.
7 2013 8 27 1205 UA 1421 515.
8 2013 8 27 1758 UA 302 515.
9 2013 9 27 521 UA 252 515.
10 2013 8 28 625 UA 559 515.
# ℹ 7,188 more rows
library(dplyr)
library(nycflights13)
flights |>
group_by(month) |>
summarize(
delay = mean(dep_delay, na.rm = TRUE),
n = n()
)
# A tibble: 12 × 3
month delay n
<int> <dbl> <int>
1 1 10.0 27004
2 2 10.8 24951
3 3 13.2 28834
4 4 13.9 28330
5 5 13.0 28796
6 6 20.8 28243
7 7 21.7 29425
8 8 12.6 29327
9 9 6.72 27574
10 10 6.24 28889
11 11 5.44 27268
12 12 16.6 28135
library(dplyr)
library(nycflights13)
flightsGrouped <- group_by(flights, month)
flightsAvg <- summarize(flightsGrouped, delay = mean(dep_delay, na.rm = TRUE), n = n())
flightsAvg
# A tibble: 12 × 3
month delay n
<int> <dbl> <int>
1 1 10.0 27004
2 2 10.8 24951
3 3 13.2 28834
4 4 13.9 28330
5 5 13.0 28796
6 6 20.8 28243
7 7 21.7 29425
8 8 12.6 29327
9 9 6.72 27574
10 10 6.24 28889
11 11 5.44 27268
12 12 16.6 28135
Inspired by Exercise 4.5.7.6, also try introducing typos
Contrast with +
in ggplot()
: More about “layering”
Introduce you to dplyr
verbs right away:
filter()
, mutate()
select()
, arrange()
group_by()
, summarize()
Later, I will contrast with some base R.
Rows: 336,776
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
To answer the exercise, it helps to use glimpse
first.
arr_delay >= 2
dest == "IAH" | dest == "HOU"
or dest %in% c("IAH", "HOU")
arr_delay >= 120 & dep_delay <= 0
A harder question was:
Were delayed by at least an hour, but made up over 30 minutes in flight:
dep_delay >= 60 & dep_delay - arr_delay > 30
select()
, rename()
, relocate()
are useful for “quick peeks” into the dataset, while on the command line.select()
, relocate()
, mutate()
slice_head()
and related commands also belong to this category.library(dplyr)
library(nycflights13)
flights |>
select(starts_with(c("dep", "arr")), ends_with("time")) |>
rename(t_dep = dep_time, t_arr = arr_time, d_arr = arr_delay,
d_dep = dep_delay, s_arr = sched_arr_time, s_dep = sched_dep_time) |>
select(s_dep, t_dep, d_dep, s_arr, t_arr, d_arr, air_time) |>
# will not work for flights arriving after 000 hours
mutate(f_time = s_arr - s_dep, ac_time = t_arr - t_dep) |>
slice_head(n = 5)
# A tibble: 5 × 9
s_dep t_dep d_dep s_arr t_arr d_arr air_time f_time ac_time
<int> <int> <dbl> <int> <int> <dbl> <dbl> <int> <int>
1 515 517 2 819 830 11 227 304 313
2 529 533 4 830 850 20 227 301 317
3 540 542 2 850 923 33 160 310 381
4 545 544 -1 1022 1004 -18 183 477 460
5 600 554 -6 837 812 -25 116 237 258
Problem: Which flights traveled the farthest distance?
library(dplyr)
library(nycflights13)
flights |>
arrange(desc(distance)) |>
select(distance, origin, dest, carrier, flight)
# A tibble: 336,776 × 5
distance origin dest carrier flight
<dbl> <chr> <chr> <chr> <int>
1 4983 JFK HNL HA 51
2 4983 JFK HNL HA 51
3 4983 JFK HNL HA 51
4 4983 JFK HNL HA 51
5 4983 JFK HNL HA 51
6 4983 JFK HNL HA 51
7 4983 JFK HNL HA 51
8 4983 JFK HNL HA 51
9 4983 JFK HNL HA 51
10 4983 JFK HNL HA 51
# ℹ 336,766 more rows
library(dplyr)
library(nycflights13)
flights |>
slice_max(order_by = distance) |>
select(distance, origin, dest, carrier, flight)
# A tibble: 342 × 5
distance origin dest carrier flight
<dbl> <chr> <chr> <chr> <int>
1 4983 JFK HNL HA 51
2 4983 JFK HNL HA 51
3 4983 JFK HNL HA 51
4 4983 JFK HNL HA 51
5 4983 JFK HNL HA 51
6 4983 JFK HNL HA 51
7 4983 JFK HNL HA 51
8 4983 JFK HNL HA 51
9 4983 JFK HNL HA 51
10 4983 JFK HNL HA 51
# ℹ 332 more rows
What if you want the five flights with the farthest distance?
library(dplyr)
library(nycflights13)
flights |>
select(distance, origin, dest) |>
group_by(origin, dest) |>
distinct() |>
ungroup() |>
slice_max(order_by = distance, n = 5)
# A tibble: 5 × 3
distance origin dest
<dbl> <chr> <chr>
1 4983 JFK HNL
2 4963 EWR HNL
3 3370 EWR ANC
4 2586 JFK SFO
5 2576 JFK OAK
Problem: Which plane traveled the most times to LAX in 2013? Plot the distribution of its delay times.
library(nycflights13)
flightsIAH <- flights[flights$dest == "IAH", ]
flightsIAH$speed <- flightsIAH$distance / flightsIAH$air_time * 60
# flightsIAH$speed <- with(flightsIAH, distance / air_time * 60)
flightsIAH <- flightsIAH[, c("year", "month", "day", "carrier", "flight", "speed")]
flightsIAH <- flightsIAH[order(flightsIAH$speed, decreasing = TRUE), ]
# flightsIAH$speed <- sort(flightsIAH$speed, decreasing = TRUE, na.last = TRUE)
flightsIAH
# A tibble: 7,198 × 6
year month day carrier flight speed
<int> <int> <int> <chr> <int> <dbl>
1 2013 7 9 UA 226 522.
2 2013 8 27 UA 1128 521.
3 2013 8 28 UA 1711 519.
4 2013 8 28 UA 1022 519.
5 2013 6 11 UA 1178 515.
6 2013 8 27 UA 333 515.
7 2013 8 27 UA 1421 515.
8 2013 8 27 UA 302 515.
9 2013 9 27 UA 252 515.
10 2013 8 28 UA 559 515.
# ℹ 7,188 more rows
library(nycflights13)
flightsAvg <- data.frame(month = 1:12)
flightsAvg$delay <- tapply(flights$dep_delay, flights$month, mean, na.rm = TRUE)
flightsAvg$n <- tapply(flights$dep_delay, flights$month, length)
flightsAvg
month delay n
1 1 10.036665 27004
2 2 10.816843 24951
3 3 13.227076 28834
4 4 13.938038 28330
5 5 12.986859 28796
6 6 20.846332 28243
7 7 21.727787 29425
8 8 12.611040 29327
9 9 6.722476 27574
10 10 6.243988 28889
11 11 5.435362 27268
12 12 16.576688 28135