Welcome to the hands-on workshop “Tidy Work in Tidyverse”. Most of the things necessary to complete the tutorials and challenges were covered in the lecture. However, sometimes the tasks require that you check the docs or search online. Not all our solutions are optimal. Let us know if you can do better or solve things in a different way. If stuck, look at hints, next google and if still stuck, turn to TA. It is a lot of material, do not fee bad if you do not solve all tasks. Good luck!
magrittr
):my_cars <- mtcars[, c(1:4, 7)]
my_cars <- my_cars[my_cars$disp > mean(my_cars$disp), ]
print(my_cars)
my_cars <- colMeans(my_cars)
my_cars <- mtcars %>%
select(c(1:4, 7)) %>%
filter(disp > mean(disp)) %T>%
print() %>%
colMeans()
cor(mtcars)
mtcars %>% cor()
cor(mtcars$gear, mtcars$mpg)
mtcars %$% cor(gear, mpg)
mtcars
dataset to a tibble vehicles
.
vehicles <- mtcars %>% as_tibble()
cyl
) variable using:
[[index]]
accessor,[[string]]
accessor,$
accessor.
vehicles[['cyl']]
vehicles[[2]]
vehicles$cyl
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
vehicles %T>%
{print(.[['cyl']])} %T>%
{print(.[[2]])} %>%
.$cyl
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
vehicles
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # … with 22 more rows
vehicles %>% head(n = 30)
## # A tibble: 30 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # … with 20 more rows
options(tibble.print_min = 15, tibble.print_max = 30)
vehicles
back to a data.frame
called automobiles
.
automobiles <- as.data.frame(vehicles)
Do you think tibbles are lazy? Try to create a tibble that tests whether lazy evaluation applies to tibbles too.
tibble(x = sample(1:10, size = 10, replace = T), y = log10(x))
The nycflights13
package contains information about all flights that departed from NYC (i.e., EWR, JFK and LGA) in 2013: 336,776 flights with 16 variables. To help understand what causes delays, it also includes a number of other useful datasets: weather, planes, airports, airlines. We will use it to train working with tibbles and dplyr
.
nycflights13
package (install if necessary)
install.packages('nycflights13')
library(nycflights13)
?nycflights13
## No documentation for 'nycflights13' in specified packages and libraries:
## you could try '??nycflights13'
flights
tibble.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## 11 2013 1 1 558 600 -2 849 851
## 12 2013 1 1 558 600 -2 853 856
## 13 2013 1 1 558 600 -2 924 917
## 14 2013 1 1 558 600 -2 923 937
## 15 2013 1 1 559 600 -1 941 910
## # … with 336,761 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
carrier
and arr_time
flights %>% select(-carrier, -arr_time)
## # A tibble: 336,776 x 17
## year month day dep_time sched_dep_time dep_delay sched_arr_time arr_delay
## <int> <int> <int> <int> <int> <dbl> <int> <dbl>
## 1 2013 1 1 517 515 2 819 11
## 2 2013 1 1 533 529 4 830 20
## 3 2013 1 1 542 540 2 850 33
## 4 2013 1 1 544 545 -1 1022 -18
## 5 2013 1 1 554 600 -6 837 -25
## 6 2013 1 1 554 558 -4 728 12
## 7 2013 1 1 555 600 -5 854 19
## 8 2013 1 1 557 600 -3 723 -14
## 9 2013 1 1 557 600 -3 846 -8
## 10 2013 1 1 558 600 -2 745 8
## 11 2013 1 1 558 600 -2 851 -2
## 12 2013 1 1 558 600 -2 856 -3
## 13 2013 1 1 558 600 -2 917 7
## 14 2013 1 1 558 600 -2 937 -14
## 15 2013 1 1 559 600 -1 910 31
## # … with 336,761 more rows, and 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
carrier
, tailnum
and origin
flights %>% select(carrier, tailnum, origin)
## # A tibble: 336,776 x 3
## carrier tailnum origin
## <chr> <chr> <chr>
## 1 UA N14228 EWR
## 2 UA N24211 LGA
## 3 AA N619AA JFK
## 4 B6 N804JB JFK
## 5 DL N668DN LGA
## 6 UA N39463 EWR
## 7 B6 N516JB EWR
## 8 EV N829AS LGA
## 9 B6 N593JB JFK
## 10 AA N3ALAA LGA
## 11 B6 N793JB JFK
## 12 B6 N657JB JFK
## 13 UA N29129 JFK
## 14 UA N53441 EWR
## 15 AA N3DUAA LGA
## # … with 336,761 more rows
day
through carrier
flights %>% select(-(day:carrier))
## # A tibble: 336,776 x 11
## year month flight tailnum origin dest air_time distance hour minute
## <int> <int> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1545 N14228 EWR IAH 227 1400 5 15
## 2 2013 1 1714 N24211 LGA IAH 227 1416 5 29
## 3 2013 1 1141 N619AA JFK MIA 160 1089 5 40
## 4 2013 1 725 N804JB JFK BQN 183 1576 5 45
## 5 2013 1 461 N668DN LGA ATL 116 762 6 0
## 6 2013 1 1696 N39463 EWR ORD 150 719 5 58
## 7 2013 1 507 N516JB EWR FLL 158 1065 6 0
## 8 2013 1 5708 N829AS LGA IAD 53 229 6 0
## 9 2013 1 79 N593JB JFK MCO 140 944 6 0
## 10 2013 1 301 N3ALAA LGA ORD 138 733 6 0
## 11 2013 1 49 N793JB JFK PBI 149 1028 6 0
## 12 2013 1 71 N657JB JFK TPA 158 1005 6 0
## 13 2013 1 194 N29129 JFK LAX 345 2475 6 0
## 14 2013 1 1124 N53441 EWR SFO 361 2565 6 0
## 15 2013 1 707 N3DUAA LGA DFW 257 1389 6 0
## # … with 336,761 more rows, and 1 more variable: time_hour <dttm>
arr
ival (hint: ?tidyselect
)
flights %>% select(contains('arr_'))
## # A tibble: 336,776 x 3
## arr_time sched_arr_time arr_delay
## <int> <int> <dbl>
## 1 830 819 11
## 2 850 830 20
## 3 923 850 33
## 4 1004 1022 -18
## 5 812 837 -25
## 6 740 728 12
## 7 913 854 19
## 8 709 723 -14
## 9 838 846 -8
## 10 753 745 8
## 11 849 851 -2
## 12 853 856 -3
## 13 924 917 7
## 14 923 937 -14
## 15 941 910 31
## # … with 336,761 more rows
v <- c("arr_time", "sched_arr_time", "arr_delay")
v <- c("arr_time", "sched_arr_time", "arr_delay")
flights %>% select(v) # or
flights %>% select(one_of(v))
## # A tibble: 336,776 x 3
## arr_time sched_arr_time arr_delay
## <int> <int> <dbl>
## 1 830 819 11
## 2 850 830 20
## 3 923 850 33
## 4 1004 1022 -18
## 5 812 837 -25
## 6 740 728 12
## 7 913 854 19
## 8 709 723 -14
## 9 838 846 -8
## 10 753 745 8
## 11 849 851 -2
## 12 853 856 -3
## 13 924 917 7
## 14 923 937 -14
## 15 941 910 31
## # … with 336,761 more rows
## # A tibble: 336,776 x 3
## arr_time sched_arr_time arr_delay
## <int> <int> <dbl>
## 1 830 819 11
## 2 850 830 20
## 3 923 850 33
## 4 1004 1022 -18
## 5 812 837 -25
## 6 740 728 12
## 7 913 854 19
## 8 709 723 -14
## 9 838 846 -8
## 10 753 745 8
## 11 849 851 -2
## 12 853 856 -3
## 13 924 917 7
## 14 923 937 -14
## 15 941 910 31
## # … with 336,761 more rows
dest
to destination
using select()
and rename()
. What is the difference between the two approaches?
flights %>% select(destination = dest) %>% head()
flights %>% rename(destination = dest) %>% head()
# select keeps only the renamed column while rename returns the whole dataset
# with the column renamed
## # A tibble: 6 x 1
## destination
## <chr>
## 1 IAH
## 2 IAH
## 3 MIA
## 4 BQN
## 5 ATL
## 6 ORD
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, destination <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% filter(arr_delay < 0)
## # A tibble: 188,933 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 849 851
## 6 2013 1 1 558 600 -2 853 856
## 7 2013 1 1 558 600 -2 923 937
## 8 2013 1 1 559 559 0 702 706
## 9 2013 1 1 559 600 -1 854 902
## 10 2013 1 1 600 600 0 851 858
## 11 2013 1 1 601 600 1 844 850
## 12 2013 1 1 602 610 -8 812 820
## 13 2013 1 1 606 610 -4 858 910
## 14 2013 1 1 606 610 -4 837 845
## 15 2013 1 1 607 607 0 858 915
## # … with 188,918 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% filter(dep_delay >= 10, dep_delay <= 33) # or
flights %>% filter(between(dep_delay, 10, 33))
## # A tibble: 40,558 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 611 600 11 945 931
## 2 2013 1 1 623 610 13 920 915
## 3 2013 1 1 632 608 24 740 728
## 4 2013 1 1 743 730 13 1107 1100
## 5 2013 1 1 743 730 13 1059 1056
## 6 2013 1 1 851 840 11 1215 1206
## 7 2013 1 1 906 843 23 1134 1125
## 8 2013 1 1 912 900 12 1241 1220
## 9 2013 1 1 914 900 14 1058 1043
## 10 2013 1 1 920 905 15 1039 1025
## 11 2013 1 1 921 900 21 1237 1227
## 12 2013 1 1 930 905 25 1218 1209
## 13 2013 1 1 933 904 29 1252 1210
## 14 2013 1 1 953 921 32 1320 1241
## 15 2013 1 1 1011 1001 10 1133 1128
## # … with 40,543 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## # A tibble: 40,558 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 611 600 11 945 931
## 2 2013 1 1 623 610 13 920 915
## 3 2013 1 1 632 608 24 740 728
## 4 2013 1 1 743 730 13 1107 1100
## 5 2013 1 1 743 730 13 1059 1056
## 6 2013 1 1 851 840 11 1215 1206
## 7 2013 1 1 906 843 23 1134 1125
## 8 2013 1 1 912 900 12 1241 1220
## 9 2013 1 1 914 900 14 1058 1043
## 10 2013 1 1 920 905 15 1039 1025
## 11 2013 1 1 921 900 21 1237 1227
## 12 2013 1 1 930 905 25 1218 1209
## 13 2013 1 1 933 904 29 1252 1210
## 14 2013 1 1 953 921 32 1320 1241
## 15 2013 1 1 1011 1001 10 1133 1128
## # … with 40,543 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% filter(is.na(arr_time))
## # A tibble: 8,713 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 2016 1930 46 NA 2220
## 2 2013 1 1 NA 1630 NA NA 1815
## 3 2013 1 1 NA 1935 NA NA 2240
## 4 2013 1 1 NA 1500 NA NA 1825
## 5 2013 1 1 NA 600 NA NA 901
## 6 2013 1 2 2041 2045 -4 NA 2359
## 7 2013 1 2 2145 2129 16 NA 33
## 8 2013 1 2 NA 1540 NA NA 1747
## 9 2013 1 2 NA 1620 NA NA 1746
## 10 2013 1 2 NA 1355 NA NA 1459
## 11 2013 1 2 NA 1420 NA NA 1644
## 12 2013 1 2 NA 1321 NA NA 1536
## 13 2013 1 2 NA 1545 NA NA 1910
## 14 2013 1 2 NA 1330 NA NA 1640
## 15 2013 1 2 NA 1601 NA NA 1735
## # … with 8,698 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
?slice
)
flights %>% slice(1234:1258)
## # A tibble: 25 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 1236 1238 -2 1446 1446
## 2 2013 1 2 1236 1240 -4 1403 1405
## 3 2013 1 2 1236 1200 36 1417 1330
## 4 2013 1 2 1239 1200 39 1429 1342
## 5 2013 1 2 1239 1220 19 1328 1320
## 6 2013 1 2 1240 1245 -5 1538 1600
## 7 2013 1 2 1240 1240 0 1556 1540
## 8 2013 1 2 1240 1245 -5 1521 1616
## 9 2013 1 2 1243 1229 14 1453 1428
## 10 2013 1 2 1244 900 224 1431 1104
## 11 2013 1 2 1245 1249 -4 1718 1800
## 12 2013 1 2 1249 1245 4 1555 1550
## 13 2013 1 2 1249 1258 -9 1535 1610
## 14 2013 1 2 1250 1245 5 1349 1350
## 15 2013 1 2 1251 1250 1 1552 1603
## 16 2013 1 2 1253 1300 -7 1505 1508
## 17 2013 1 2 1254 1300 -6 1545 1537
## 18 2013 1 2 1255 1259 -4 1501 1502
## 19 2013 1 2 1256 1252 4 1600 1555
## 20 2013 1 2 1256 1300 -4 1526 1518
## 21 2013 1 2 1256 1225 31 1409 1345
## 22 2013 1 2 1257 1303 -6 1356 1411
## 23 2013 1 2 1257 1300 -3 1529 1527
## 24 2013 1 2 1257 1257 0 1613 1607
## 25 2013 1 2 1258 1255 3 1537 1535
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
?sample_n()
) 3 random flights per day in March
flights %>% filter(month == 3) %>%
group_by(day) %>%
sample_n(3)
## # A tibble: 93 x 19
## # Groups: day [31]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 1 1458 1420 38 1647 1550
## 2 2013 3 1 1714 1645 29 1920 1906
## 3 2013 3 1 758 800 -2 1106 1125
## 4 2013 3 2 554 600 -6 904 901
## 5 2013 3 2 926 935 -9 1307 1247
## 6 2013 3 2 1446 1450 -4 1752 1755
## 7 2013 3 3 721 729 -8 839 912
## 8 2013 3 3 744 745 -1 1007 1011
## 9 2013 3 3 1154 1200 -6 1301 1330
## 10 2013 3 4 2126 2128 -2 2336 2355
## 11 2013 3 4 1629 1630 -1 1937 2015
## 12 2013 3 4 1808 1815 -7 2022 2045
## 13 2013 3 5 1710 1700 10 1941 1933
## 14 2013 3 5 927 900 27 1231 1220
## 15 2013 3 5 1618 1543 35 1817 1745
## # … with 78 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(month == 1) %>%
group_by(carrier) %>%
top_n(5, dep_delay)
## # A tibble: 77 x 19
## # Groups: carrier [16]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 1835 853 1001 1950
## 2 2013 1 1 2205 1720 285 46 2040
## 3 2013 1 1 2343 1724 379 314 1938
## 4 2013 1 2 1412 838 334 1710 1147
## 5 2013 1 2 1607 1030 337 2003 1355
## 6 2013 1 2 2131 1512 379 2340 1741
## 7 2013 1 3 1933 1730 123 2131 1953
## 8 2013 1 3 2056 1605 291 2239 1754
## 9 2013 1 4 1731 1602 89 1837 1722
## 10 2013 1 4 1831 1730 61 2029 1953
## 11 2013 1 5 1344 817 327 1635 1127
## 12 2013 1 6 1019 900 79 1558 1530
## 13 2013 1 7 1042 900 102 1620 1530
## 14 2013 1 7 2021 1415 366 2332 1724
## 15 2013 1 9 641 900 1301 1242 1530
## # … with 62 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
mutate(route=paste(origin,"-",dest)) %>%
distinct(route,.keep_all=T) %>%
nrow()
## [1] 224
# JFK - LAX
flights %>%
mutate(route=paste(origin,"-",dest)) %>%
group_by(route) %>%
count() %>%
arrange(-n)
## # A tibble: 224 x 2
## # Groups: route [224]
## route n
## <chr> <int>
## 1 JFK - LAX 11262
## 2 LGA - ATL 10263
## 3 LGA - ORD 8857
## 4 JFK - SFO 8204
## 5 LGA - CLT 6168
## 6 EWR - ORD 6100
## 7 JFK - BOS 5898
## 8 LGA - MIA 5781
## 9 JFK - MCO 5464
## 10 EWR - BOS 5327
## 11 EWR - SFO 5127
## 12 LGA - DTW 5040
## 13 EWR - CLT 5026
## 14 EWR - ATL 5022
## 15 EWR - MCO 4941
## # … with 209 more rows
air_time
is the amount of time in minutes spent in the air. Add a new column air_spd
that will contain aircraft’s airspeed in mph
flights %>% mutate(air_spd = distance/(air_time / 60))
## # A tibble: 336,776 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## 11 2013 1 1 558 600 -2 849 851
## 12 2013 1 1 558 600 -2 853 856
## 13 2013 1 1 558 600 -2 924 917
## 14 2013 1 1 558 600 -2 923 937
## 15 2013 1 1 559 600 -1 941 910
## # … with 336,761 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # air_spd <dbl>
air_spd
variable
flights %>% transmute(air_spd = distance/(air_time / 60))
## # A tibble: 336,776 x 1
## air_spd
## <dbl>
## 1 370.
## 2 374.
## 3 408.
## 4 517.
## 5 394.
## 6 288.
## 7 404.
## 8 259.
## 9 405.
## 10 319.
## 11 414.
## 12 382.
## 13 430.
## 14 426.
## 15 324.
## # … with 336,761 more rows
group_by()
, summarise()
and n()
to see how many planes were delayed (departure) every month
flights %>%
filter(dep_delay > 0) %>%
group_by(month) %>%
summarise(num_dep_delayed = n())
## # A tibble: 12 x 2
## month num_dep_delayed
## <int> <int>
## 1 1 9662
## 2 2 9124
## 3 3 11209
## 4 4 10543
## 5 5 11291
## 6 6 12655
## 7 7 13909
## 8 8 11713
## 9 9 7815
## 10 10 8722
## 11 11 8239
## 12 12 13550
dep_delay
per month?
flights %>%
group_by(month) %>%
summarise(mean_dep_delay = mean(dep_delay, na.rm = T))
## # A tibble: 12 x 2
## month mean_dep_delay
## <int> <dbl>
## 1 1 10.0
## 2 2 10.8
## 3 3 13.2
## 4 4 13.9
## 5 5 13.0
## 6 6 20.8
## 7 7 21.7
## 8 8 12.6
## 9 9 6.72
## 10 10 6.24
## 11 11 5.44
## 12 12 16.6
flights %>%
filter(arr_delay > 0) %>%
group_by(origin) %>%
summarise(cnt = n()) %>%
arrange(desc(cnt))
## # A tibble: 3 x 2
## origin cnt
## <chr> <int>
## 1 EWR 50099
## 2 JFK 42885
## 3 LGA 40020
summarise()
to sum total dep_delay
per month in hours
flights %>%
group_by(month) %>%
summarize(tot_dep_delay = sum(dep_delay/60, na.rm = T))
## # A tibble: 12 x 2
## month tot_dep_delay
## <int> <dbl>
## 1 1 4430.
## 2 2 4271.
## 3 3 6167.
## 4 4 6426.
## 5 5 6111.
## 6 6 9462.
## 7 7 10315.
## 8 8 6062.
## 9 9 3039.
## 10 10 2982.
## 11 11 2449.
## 12 12 7490.
group_size()
on carrier
what does it return?
flights %>%
group_by(carrier) %>%
group_size()
## [1] 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665
## [13] 20536 5162 12275 601
n_groups()
to check the number of unique origin-carrier pairs,
flights %>%
group_by(carrier) %>%
n_groups()
## [1] 16
Note on ungroup
Depending on the version of dplyr
you may or may need to use the ungroup()
if you want to group your data on some other variables. In the newer versions, summarise
and mutate
drop one aggregation level.