Manipulate Data Frame in R

In this post, I use nycflights13 dataset for demonstration

library(nycflights13)

This package contains information about all flights that departed from NYC (e.g. EWR, JFK and LGA) in 2013: 336,776 flights in total.

head(flights)
# Source: local data frame [6 x 16]
#
#   year month day dep_time dep_delay arr_time
# 1 2013     1   1      517         2      830
# 2 2013     1   1      533         4      850
# 3 2013     1   1      542         2      923
# 4 2013     1   1      544        -1     1004
# 5 2013     1   1      554        -6      812
# 6 2013     1   1      554        -4      740
# Variables not shown: arr_delay (dbl), carrier
#  (chr), tailnum (chr), flight (int), origin
#  (chr), dest (chr), air_time (dbl), distance
#  (dbl), hour (dbl), minute (dbl)

Load dplyr library

library(dplyr)

Subsetting

Select columns by name

select(flights, year, month, day)
#> Source: local data frame [336,776 x 3]
#>
#> year month day
#> 1 2013 1 1
#> 2 2013 1 1
#> 3 2013 1 1
#> 4 2013 1 1
#> .. ... ... ...

Select all columns between year and day (inclusive)

select(flights, -(year:day))
#> Source: local data frame [336,776 x 13]
#>
#> dep_time dep_delay arr_time arr_delay carrier tailnum flight origin
#> 1 517 2 830 11 UA N14228 1545 EWR
#> 2 533 4 850 20 UA N24211 1714 LGA
#> 3 542 2 923 33 AA N619AA 1141 JFK
#> 4 544 -1 1004 -18 B6 N804JB 725 JFK
#> .. ... ... ... ... ... ... ... ...
#> dest
#> 1 IAH
#> 2 IAH
#> 3 MIA
#> 4 BQN
#> .. ...
#> Variables not shown: air_time (dbl), distance (dbl), hour (dbl), minute
#> (dbl)

Select all columns except those from year to day (inclusive)

select(flights, -(year:day))
#> Source: local data frame [336,776 x 13]
#>
#> dep_time dep_delay arr_time arr_delay carrier tailnum flight origin
#> 1 517 2 830 11 UA N14228 1545 EWR
#> 2 533 4 850 20 UA N24211 1714 LGA
#> 3 542 2 923 33 AA N619AA 1141 JFK
#> 4 544 -1 1004 -18 B6 N804JB 725 JFK
#> .. ... ... ... ... ... ... ... ...
#> dest
#> 1 IAH
#> 2 IAH
#> 3 MIA
#> 4 BQN
#> .. ...
#> Variables not shown: air_time (dbl), distance (dbl), hour (dbl), minute
#> (dbl)

Filtering

filter(flights, month == 1, day == 1)
#> Source: local data frame [842 x 16]
#>
#>    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
#> 1  2013     1   1      517         2      830        11      UA  N14228
#> 2  2013     1   1      533         4      850        20      UA  N24211
#> 3  2013     1   1      542         2      923        33      AA  N619AA
#> 4  2013     1   1      544        -1     1004       -18      B6  N804JB
#> ..  ...   ... ...      ...       ...      ...       ...     ...     ...
#> Variables not shown: flight (int), origin (chr), dest (chr), air_time
#>   (dbl), distance (dbl), hour (dbl), minute (dbl)

Arrange (Sorting)

arrange(flights, year, month, day)
#> Source: local data frame [336,776 x 16]
#>
#>    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
#> 1  2013     1   1      517         2      830        11      UA  N14228
#> 2  2013     1   1      533         4      850        20      UA  N24211
#> 3  2013     1   1      542         2      923        33      AA  N619AA
#> 4  2013     1   1      544        -1     1004       -18      B6  N804JB
#> ..  ...   ... ...      ...       ...      ...       ...     ...     ...
#> Variables not shown: flight (int), origin (chr), dest (chr), air_time
#>   (dbl), distance (dbl), hour (dbl), minute (dbl)

Add new column

mutate(flights,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60)
#> Source: local data frame [336,776 x 18]
#>
#>    year month day dep_time dep_delay arr_time arr_delay carrier tailnum
#> 1  2013     1   1      517         2      830        11      UA  N14228
#> 2  2013     1   1      533         4      850        20      UA  N24211
#> 3  2013     1   1      542         2      923        33      AA  N619AA
#> 4  2013     1   1      544        -1     1004       -18      B6  N804JB
#> ..  ...   ... ...      ...       ...      ...       ...     ...     ...
#> Variables not shown: flight (int), origin (chr), dest (chr), air_time
#>   (dbl), distance (dbl), hour (dbl), minute (dbl), gain (dbl), speed (dbl)

Summary

You can use summary() and group_by function for this task

flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(arr > 30 | dep > 30)
#> Source: local data frame [49 x 5]
#> Groups: year, month
#>
#>   year month day      arr      dep
#> 1  2013     1  16 34.24736 24.61287
#> 2  2013     1  31 32.60285 28.65836
#> 3  2013     2  11 36.29009 39.07360
#> 4  2013     2  27 31.25249 37.76327
#> 5  2013     3   8 85.86216 83.53692
#> 6  2013     3  18 41.29189 30.11796
#> 7  2013     4  10 38.41231 33.02368
#> 8  2013     4  12 36.04814 34.83843
#> 9  2013     4  18 36.02848 34.91536
#> 10 2013     4  19 47.91170 46.12783
..  ...   ... ...      ...      ...

References

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s