大数据处理的R包dplyr的简介!
20 Aug 2016
Go back参考自http://blog.163.com/zzz216@yeah/blog/static/16255468420147179438149/
这个包是关于R的大数据操纵的。
我在操作数据的时候,经常会遇到的问题就是:数据操纵的困难性。
如果你用excel的话就会比较难受,如果你全部都用perl或者python来做的话它不是可视化的,看起来很不爽。
下面的这个R包就是用来处理这个问题的。
> library(Lahman) #contain the data Batting
> library(hflights) #hflights
> library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
> head(Batting)
## playerID yearID stint teamID lgID G AB R H X2B X3B HR RBI SB CS BB
## 1 abercda01 1871 1 TRO NA 1 4 0 0 0 0 0 0 0 0 0
## 2 addybo01 1871 1 RC1 NA 25 118 30 32 6 0 0 13 8 1 4
## 3 allisar01 1871 1 CL1 NA 29 137 28 40 4 5 0 19 3 1 2
## 4 allisdo01 1871 1 WS3 NA 27 133 28 44 10 2 2 27 1 1 0
## 5 ansonca01 1871 1 RC1 NA 25 120 29 39 11 3 0 16 6 2 2
## 6 armstbo01 1871 1 FW1 NA 12 49 9 11 2 1 0 5 0 1 0
## SO IBB HBP SH SF GIDP
## 1 0 NA NA NA NA NA
## 2 0 NA NA NA NA NA
## 3 5 NA NA NA NA NA
## 4 2 NA NA NA NA NA
## 5 1 NA NA NA NA NA
## 6 1 NA NA NA NA NA
> dim(Batting)
## [1] 99846 22
> head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
> dim(hflights)
## [1] 227496 21
> hflights_df <- tbl_df(hflights)
> #main function: print a few rows and all the columns that fit on one screen
> head(hflights_df)
## Source: local data frame [6 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
> dim(hflights_df) #the real_content is not change
## [1] 227496 21
all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)
target,current two data frames to compare ignore_col_order should order of columns be ignored? ignore_row_order should order of rows be ignored? convert Should similar classes be converted? Currently this will convert factor to character and integer to double. … Ignored. Needed for compatibility with the generic.
TRUE if equal, otherwise a character vector describing the first reason why they’re not equal. Use isTRUE if using the result in an if expression.
scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
# By default, ordering of rows and columns ignored
#scramble: this function is used to create the random rows and cols
all.equal(hflights_df, scramble(hflights_df))
## [1] TRUE
# But those can be overriden if desired
all.equal(hflights_df, scramble(hflights_df), ignore_col_order = FALSE)
## [1] "Same column names, but different order"
all.equal(hflights_df, scramble(hflights_df), ignore_row_order = FALSE)
## [1] "Same row values, but different order"
x <- rnorm(1e2)#create 100 random numbers, x is a vector
x[between(x, -1, 1)]#the between function will return the rows that match the condition
## [1] 0.01159025 -0.76418635 -0.60378139 -0.91729323 -0.09186508
## [6] 0.16995918 0.43951431 0.70526777 -0.44850924 -0.31262047
## [11] 0.79378814 0.94118552 0.83372821 0.77183898 0.74612961
## [16] -0.89819918 -0.23378251 0.93948791 0.12784919 -0.27063089
## [21] 0.97049925 0.16930058 -0.13013862 0.91423381 0.95958486
## [26] -0.64351737 0.49971289 -0.10727338 -0.19121181 0.28548158
## [31] 0.47254886 0.61120044 0.77038482 -0.31773573 -0.86995467
## [36] 0.28239279 0.12880472 0.48147443 0.59262837 0.83965585
## [41] -0.30273771 -0.66584216 -0.79693787 -0.40362007 -0.89121606
## [46] 0.52518125 0.07783058 -0.95155915 -0.59636589 0.77272185
## [51] 0.13838240 -0.40675328 0.28416826 0.65017261 0.01681285
## [56] 0.63855545 -0.05124218 -0.32015164 -0.83101910 0.57370937
## [61] -0.25491281 0.40285921 -0.39210839 -0.49727192 -0.99756794
## [66] 0.66930703 0.01183750 -0.42695589 0.82150957
one <- mtcars[1:4, ]
two <- mtcars[11:14, ]
# You can either supply data frames as arguments
q1<-bind_rows(one, two)#you can also add more argument
q1<-bind_rows(one, two,one,two)
# Or a single argument containing a list of data frames
q2<-bind_rows(list(one, two))
bind_rows(split(mtcars, mtcars$cyl))
# When you supply a column name with the `.id` argument, a new
# column is created to link each row to its original data frame
q3<-bind_rows(list(one, two), .id = "id")
q4<-bind_rows(list(a = one, b = two), .id = "id")
q5<-bind_rows("group 1" = one, "group 2" = two, .id = "groups")
# Columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
## Not run:
# Rows do need to match when column-binding
bind_cols(data.frame(x = 1), data.frame(y = 1:2))
## End(Not run)
bind_cols(one, two)
bind_cols(list(one, two))
> #also if you want to select the object that meet other requirements
> #e.g. you want to choose the variable Month both equal to 1 and 2
> #you can write in this *filter(hflights_df, Month == 1 | Month == 2)* or &
> filter<-filter(hflights_df, Month == 1, DayofMonth == 1)
> head(filter)
## Source: local data frame [6 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
> dim(filter)
## [1] 552 21
> #use base *hflights[hflights$Month == 1 & hflights$DayofMonth == 1, ]*
> #if you need the reverse order you can try *arrange(hflights_df, desc(ArrDelay))*
> arrange<-arrange(hflights_df, DayofMonth, Month, Year)
> head(arrange)
## Source: local data frame [6 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
> dim(arrange)
## [1] 227496 21
> #use base
> #hflights[order(hflights$DayofMonth, hflights$Month, hflights$Year), ]
> #hflights[order(desc(hflights$ArrDelay)), ]
> #also you can use this way *select(hflights_df, Year:DayOfWeek)*
> #if you don't want to choose some variables
> #use *select(hflights_df, -(Year:DayOfWeek))*
> select<-select(hflights_df, Year, Month, DayOfWeek)
> head(select)
## Source: local data frame [6 x 3]
##
## Year Month DayOfWeek
## (int) (int) (int)
## 1 2011 1 6
## 2 2011 1 7
## 3 2011 1 1
## 4 2011 1 2
## 5 2011 1 3
## 6 2011 1 4
> dim(select)
## [1] 227496 3
> #similat to base::transform() ,but it can use the new create varible gain
> mutate<-mutate(hflights_df,
+ gain = ArrDelay - DepDelay,
+ gain_per_hour = gain / (AirTime / 60)
+ )
> head(mutate)
## Source: local data frame [6 x 23]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), gain (int), gain_per_hour (dbl)
> dim(mutate)
## [1] 227496 23
> summarise(hflights_df,
+ delay = mean(DepDelay, na.rm = TRUE))
## Source: local data frame [1 x 1]
##
## delay
## (dbl)
## 1 9.444951
> #it wil be much stronger if you use this combine with the operation mentioned above
> #e.g. you want to see the count/average fly distance/delay of each flight
> planes <- group_by(hflights_df, TailNum)
> head(planes)
## Source: local data frame [6 x 21]
## Groups: TailNum [6]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
> dim(planes)
## [1] 227496 21
> delay <- summarise(planes,
+ count = n(),
+ dist = mean(Distance, na.rm = TRUE),
+ delay = mean(ArrDelay, na.rm = TRUE))
> head(delay)
## Source: local data frame [6 x 4]
##
## TailNum count dist delay
## (chr) (int) (dbl) (dbl)
## 1 795 938.7157 NaN
## 2 N0EGMQ 40 1095.2500 1.918919
## 3 N10156 317 801.7192 8.199357
## 4 N10575 94 631.5319 18.148936
## 5 N11106 308 774.9805 10.101639
## 6 N11107 345 768.1130 8.052786
> dim(delay)
## [1] 3320 4
> delay <- filter(delay, count > 20, dist < 2000)
> head(delay)
## Source: local data frame [6 x 4]
##
## TailNum count dist delay
## (chr) (int) (dbl) (dbl)
## 1 795 938.7157 NaN
## 2 N0EGMQ 40 1095.2500 1.918919
## 3 N10156 317 801.7192 8.199357
## 4 N10575 94 631.5319 18.148936
## 5 N11106 308 774.9805 10.101639
## 6 N11107 345 768.1130 8.052786
> dim(delay)
## [1] 1526 4
> ##also you can use ggplot2 to give a picture
> library(ggplot2)
> ggplot(delay, aes(dist, delay)) +
+ geom_point(aes(size = count), alpha = 1/2) +
+ geom_smooth() +
+ scale_size_area()
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
> Batting %>%
+ group_by(playerID) %>%
+ summarise(total = sum(G)) %>%
+ arrange(desc(total)) %>%
+ head(5)
## Source: local data frame [5 x 2]
##
## playerID total
## (chr) (int)
## 1 rosepe01 3562
## 2 yastrca01 3308
## 3 aaronha01 3298
## 4 henderi01 3081
## 5 cobbty01 3035
you can search everything use keyword dplyr