The basic introduce of R package dplyr!   

The basic introduce of R package dplyr!

大数据处理的R包dplyr的简介!

20 Aug 2016

Go back sutdyDplyr_bigDataManipulate

参考自http://blog.163.com/zzz216@yeah/blog/static/16255468420147179438149/

这个包是关于R的大数据操纵的。

我在操作数据的时候,经常会遇到的问题就是:数据操纵的困难性。

如果你用excel的话就会比较难受,如果你全部都用perl或者python来做的话它不是可视化的,看起来很不爽。

下面的这个R包就是用来处理这个问题的。

1. data e.g.

> library(Lahman)    #contain the data Batting
> library(hflights)  #hflights
> library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
> head(Batting)
##    playerID yearID stint teamID lgID  G  AB  R  H X2B X3B HR RBI SB CS BB
## 1 abercda01   1871     1    TRO   NA  1   4  0  0   0   0  0   0  0  0  0
## 2  addybo01   1871     1    RC1   NA 25 118 30 32   6   0  0  13  8  1  4
## 3 allisar01   1871     1    CL1   NA 29 137 28 40   4   5  0  19  3  1  2
## 4 allisdo01   1871     1    WS3   NA 27 133 28 44  10   2  2  27  1  1  0
## 5 ansonca01   1871     1    RC1   NA 25 120 29 39  11   3  0  16  6  2  2
## 6 armstbo01   1871     1    FW1   NA 12  49  9 11   2   1  0   5  0  1  0
##   SO IBB HBP SH SF GIDP
## 1  0  NA  NA NA NA   NA
## 2  0  NA  NA NA NA   NA
## 3  5  NA  NA NA NA   NA
## 4  2  NA  NA NA NA   NA
## 5  1  NA  NA NA NA   NA
## 6  1  NA  NA NA NA   NA
> dim(Batting)
## [1] 99846    22
> head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
> dim(hflights)
## [1] 227496     21

2. get the pretty data_frame

> hflights_df <- tbl_df(hflights) 
> #main function: print a few rows and all the columns that fit on one screen
> head(hflights_df)
## Source: local data frame [6 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   (int) (int)      (int)     (int)   (int)   (int)         (chr)     (int)
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
> dim(hflights_df) #the real_content is not change
## [1] 227496     21

3. all.equal() if DFs are equal

3.1 Usage:

all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)

3.2 Arguments:

target,current two data frames to compare ignore_col_order should order of columns be ignored? ignore_row_order should order of rows be ignored? convert Should similar classes be converted? Currently this will convert factor to character and integer to double. … Ignored. Needed for compatibility with the generic.

3.3 Value:

TRUE if equal, otherwise a character vector describing the first reason why they’re not equal. Use isTRUE if using the result in an if expression.

3.4 Example:

scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
# By default, ordering of rows and columns ignored
#scramble: this function is used to create the random rows and cols
all.equal(hflights_df, scramble(hflights_df))
## [1] TRUE
# But those can be overriden if desired
all.equal(hflights_df, scramble(hflights_df), ignore_col_order = FALSE)
## [1] "Same column names, but different order"
all.equal(hflights_df, scramble(hflights_df), ignore_row_order = FALSE)
## [1] "Same row values, but different order"

4. between()

x <- rnorm(1e2)#create 100 random numbers, x is a vector
x[between(x, -1, 1)]#the between function will return the rows that match the condition
##  [1]  0.01159025 -0.76418635 -0.60378139 -0.91729323 -0.09186508
##  [6]  0.16995918  0.43951431  0.70526777 -0.44850924 -0.31262047
## [11]  0.79378814  0.94118552  0.83372821  0.77183898  0.74612961
## [16] -0.89819918 -0.23378251  0.93948791  0.12784919 -0.27063089
## [21]  0.97049925  0.16930058 -0.13013862  0.91423381  0.95958486
## [26] -0.64351737  0.49971289 -0.10727338 -0.19121181  0.28548158
## [31]  0.47254886  0.61120044  0.77038482 -0.31773573 -0.86995467
## [36]  0.28239279  0.12880472  0.48147443  0.59262837  0.83965585
## [41] -0.30273771 -0.66584216 -0.79693787 -0.40362007 -0.89121606
## [46]  0.52518125  0.07783058 -0.95155915 -0.59636589  0.77272185
## [51]  0.13838240 -0.40675328  0.28416826  0.65017261  0.01681285
## [56]  0.63855545 -0.05124218 -0.32015164 -0.83101910  0.57370937
## [61] -0.25491281  0.40285921 -0.39210839 -0.49727192 -0.99756794
## [66]  0.66930703  0.01183750 -0.42695589  0.82150957

5. bind()

one <- mtcars[1:4, ]
two <- mtcars[11:14, ]
# You can either supply data frames as arguments
q1<-bind_rows(one, two)#you can also add more argument
q1<-bind_rows(one, two,one,two)
# Or a single argument containing a list of data frames
q2<-bind_rows(list(one, two))
bind_rows(split(mtcars, mtcars$cyl))
# When you supply a column name with the `.id` argument, a new
# column is created to link each row to its original data frame
q3<-bind_rows(list(one, two), .id = "id")
q4<-bind_rows(list(a = one, b = two), .id = "id")
q5<-bind_rows("group 1" = one, "group 2" = two, .id = "groups")
# Columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
## Not run:
# Rows do need to match when column-binding
bind_cols(data.frame(x = 1), data.frame(y = 1:2))
## End(Not run)
bind_cols(one, two)
bind_cols(list(one, two))

6. filter()

> #also if you want to select the object that meet other requirements
> #e.g. you want to choose the variable Month both equal to 1 and 2
> #you can write in this *filter(hflights_df, Month == 1 | Month == 2)* or &
>   filter<-filter(hflights_df, Month == 1, DayofMonth == 1)
>   head(filter)
## Source: local data frame [6 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   (int) (int)      (int)     (int)   (int)   (int)         (chr)     (int)
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          1         6     728     840            AA       460
## 3  2011     1          1         6    1631    1736            AA      1121
## 4  2011     1          1         6    1756    2112            AA      1294
## 5  2011     1          1         6    1012    1347            AA      1700
## 6  2011     1          1         6    1211    1325            AA      1820
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
>   dim(filter)
## [1] 552  21
> #use base *hflights[hflights$Month == 1 & hflights$DayofMonth == 1, ]*

7. arrange() like the function sort()

> #if you need the reverse order you can try *arrange(hflights_df, desc(ArrDelay))*
> arrange<-arrange(hflights_df, DayofMonth, Month, Year)
> head(arrange)
## Source: local data frame [6 x 21]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   (int) (int)      (int)     (int)   (int)   (int)         (chr)     (int)
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          1         6     728     840            AA       460
## 3  2011     1          1         6    1631    1736            AA      1121
## 4  2011     1          1         6    1756    2112            AA      1294
## 5  2011     1          1         6    1012    1347            AA      1700
## 6  2011     1          1         6    1211    1325            AA      1820
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
> dim(arrange)
## [1] 227496     21
> #use base
> #hflights[order(hflights$DayofMonth, hflights$Month, hflights$Year), ]
> #hflights[order(desc(hflights$ArrDelay)), ]

8. select()

> #also you can use this way *select(hflights_df, Year:DayOfWeek)*
> #if you don't want to choose some variables
> #use *select(hflights_df, -(Year:DayOfWeek))*
> select<-select(hflights_df, Year, Month, DayOfWeek)
> head(select)
## Source: local data frame [6 x 3]
## 
##    Year Month DayOfWeek
##   (int) (int)     (int)
## 1  2011     1         6
## 2  2011     1         7
## 3  2011     1         1
## 4  2011     1         2
## 5  2011     1         3
## 6  2011     1         4
> dim(select)
## [1] 227496      3

9. mutate()

> #similat to base::transform() ,but it can use the new create varible gain
> mutate<-mutate(hflights_df, 
+   gain = ArrDelay - DepDelay, 
+   gain_per_hour = gain / (AirTime / 60)
+ )
> head(mutate)
## Source: local data frame [6 x 23]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   (int) (int)      (int)     (int)   (int)   (int)         (chr)     (int)
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int), gain (int), gain_per_hour (dbl)
> dim(mutate)
## [1] 227496     23

10. summarise()

> summarise(hflights_df, 
+   delay = mean(DepDelay, na.rm = TRUE))
## Source: local data frame [1 x 1]
## 
##      delay
##      (dbl)
## 1 9.444951

11. group_by()

> #it wil be much stronger if you use this combine with the operation mentioned above
> #e.g. you want to see the count/average fly distance/delay of each flight
> planes <- group_by(hflights_df, TailNum)
> head(planes)
## Source: local data frame [6 x 21]
## Groups: TailNum [6]
## 
##    Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
##   (int) (int)      (int)     (int)   (int)   (int)         (chr)     (int)
## 1  2011     1          1         6    1400    1500            AA       428
## 2  2011     1          2         7    1401    1501            AA       428
## 3  2011     1          3         1    1352    1502            AA       428
## 4  2011     1          4         2    1403    1513            AA       428
## 5  2011     1          5         3    1405    1507            AA       428
## 6  2011     1          6         4    1359    1503            AA       428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
##   (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
##   Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
##   CancellationCode (chr), Diverted (int)
> dim(planes)
## [1] 227496     21
> delay <- summarise(planes, 
+   count = n(), 
+   dist = mean(Distance, na.rm = TRUE), 
+   delay = mean(ArrDelay, na.rm = TRUE))
> head(delay)
## Source: local data frame [6 x 4]
## 
##   TailNum count      dist     delay
##     (chr) (int)     (dbl)     (dbl)
## 1           795  938.7157       NaN
## 2  N0EGMQ    40 1095.2500  1.918919
## 3  N10156   317  801.7192  8.199357
## 4  N10575    94  631.5319 18.148936
## 5  N11106   308  774.9805 10.101639
## 6  N11107   345  768.1130  8.052786
> dim(delay)
## [1] 3320    4
> delay <- filter(delay, count > 20, dist < 2000)
> head(delay)
## Source: local data frame [6 x 4]
## 
##   TailNum count      dist     delay
##     (chr) (int)     (dbl)     (dbl)
## 1           795  938.7157       NaN
## 2  N0EGMQ    40 1095.2500  1.918919
## 3  N10156   317  801.7192  8.199357
## 4  N10575    94  631.5319 18.148936
## 5  N11106   308  774.9805 10.101639
## 6  N11107   345  768.1130  8.052786
> dim(delay)
## [1] 1526    4
> ##also you can use ggplot2 to give a picture
> library(ggplot2)
> ggplot(delay, aes(dist, delay)) + 
+   geom_point(aes(size = count), alpha = 1/2) + 
+   geom_smooth() + 
+   scale_size_area()
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

12. the symbol %>%

> Batting %>%
+     group_by(playerID) %>%
+     summarise(total = sum(G)) %>%
+     arrange(desc(total)) %>%
+     head(5)
## Source: local data frame [5 x 2]
## 
##    playerID total
##       (chr) (int)
## 1  rosepe01  3562
## 2 yastrca01  3308
## 3 aaronha01  3298
## 4 henderi01  3081
## 5  cobbty01  3035

13. the official documentation

you can search everything use keyword dplyr