Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Five-row example; row 3 is the only one with NA in both x and y.
- (df <- data.frame(
-   x = c(1L, 2L, NA, 3L, NA),
-   y = c(4L, 5L, NA, 6L, 7L),
-   z = c(8L, 9L, 10L, 11L, NA)
- ))
- x y z
- 1 4 8
- 2 5 9
- NA NA 10
- 3 6 11
- NA 7 NA
- x y z
- 1 4 8
- 2 5 9
- 3 6 11
- NA 7 NA
- # Keep every row unless x and y are both NA.
- df[!(is.na(df$x) & is.na(df$y)), ]
- # x y z
- #1 1 4 8
- #2 2 5 9
- #4 3 6 11
- #5 NA 7 NA
- # Benchmark data: one million rows; NA is one of the values sampled per column.
- set.seed(237)
- df <- data.frame(
-   x = sample(c(NA, 1:20), 1e6, replace = TRUE),
-   y = sample(c(NA, 1:10), 1e6, replace = TRUE),
-   z = sample(c(NA, 5:15), 1e6, replace = TRUE)
- )
- f1 <- function() df[!with(df,is.na(x)& is.na(y)),]  # keeps every row unless x and y are both NA
- f2 <- function() df[rowSums(is.na(df[c("x", "y")])) != 2, ]  # keeps rows whose NA count over x and y is not 2
- # apply()-based candidate: drop the rows in which both x and y are NA.
- # The mask is negated and limited to x and y so the result matches f1/f2;
- # counting NAs over every column would also discard rows such as (NA, 7, NA).
- f3 <- function() {
-   df[!apply(df[c("x", "y")], 1, function(row) all(is.na(row))), ]
- }
- library(microbenchmark)
- microbenchmark(f1(), f2(), f3(), unit="relative")
- Unit: relative
- #expr min lq median uq max neval
- # f1() 1.000000 1.000000 1.000000 1.000000 1.000000 100
- # f2() 1.044812 1.068189 1.138323 1.129611 0.856396 100
- # f3() 26.205272 25.848441 24.357665 21.799930 22.881378 100
- > df[rowSums(is.na(df[c("x", "y")])) != 2, ]
- x y z
- 1 1 4 8
- 2 2 5 9
- 4 3 6 11
- 5 NA 7 NA
- ## Sample data with 10 columns and 1 million rows
- set.seed(123)
- # Values each cell is drawn from; NA is one of the candidates.
- na_pool <- c(NA, 1:20)
- df <- data.frame(
-   replicate(10, sample(na_pool, size = 1e6, replace = TRUE))
- )
- f1 <- function() {  # elementwise test on the two named columns
- df[!with(df, is.na(X1) & is.na(X2)), ]  # keeps every row unless X1 and X2 are both NA
- }
- f2 <- function() {  # per-row NA count over the first two columns
- df[rowSums(is.na(df[1:2])) != 2, ]  # a count of 2 means both of those columns are NA
- }
- library(microbenchmark)
- microbenchmark(f1(), f2(), times = 20)
- # Unit: milliseconds
- # expr min lq median uq max neval
- # f1() 745.8378 1100.764 1128.047 1199.607 1310.236 20
- # f2() 784.2132 1101.695 1125.380 1163.675 1303.161 20
- f1_5 <- function() {  # elementwise test spelled out across five columns
- df[!with(df, is.na(X1) & is.na(X2) & is.na(X3) &
- is.na(X4) & is.na(X5)), ]  # drops only the rows where X1..X5 are all NA
- }
- f2_5 <- function() {  # per-row NA count over the first five columns
- df[rowSums(is.na(df[1:5])) != 5, ]  # a count of 5 means all five of those columns are NA
- }
- microbenchmark(f1_5(), f2_5(), times = 20)
- # Unit: seconds
- # expr min lq median uq max neval
- # f1_5() 1.275032 1.294777 1.325957 1.368315 1.572772 20
- # f2_5() 1.088564 1.169976 1.193282 1.225772 1.275915 20
- sel <- apply( df, 1, function(x) sum(is.na(x))>1 )  # TRUE for rows holding more than one NA, counted over all columns
- df[ sel, ]  # returns the flagged rows themselves; df[!sel, ] would drop them instead
- sel <- apply( df[,c("x","y")], 1, function(x) sum(is.na(x))>1 )  # same count, restricted to columns x and y
- sel <- apply( df[,c("x","y")], 1, function(x) all(is.na(x)) )  # TRUE only when every selected column is NA
- > microbenchmark( df[!with(df,is.na(x)& is.na(y)),], df[rowSums(is.na(df[c("x", "y")])) != 2, ], df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ] )
- Unit: microseconds
- expr min lq median uq max neval
- df[!with(df, is.na(x) & is.na(y)), ] 67.148 71.5150 76.0340 86.0155 1049.576 100
- df[rowSums(is.na(df[c("x", "y")])) != 2, ] 132.064 139.8760 145.5605 166.6945 498.934 100
- df[apply(df, 1, function(x) sum(is.na(x)) > 1), ] 175.372 184.4305 201.6360 218.7150 321.583 100
- library(data.table)
- # Convert the data.frame to a data.table (setDT works by reference).
- setDT(df)
- # Keep every row unless x and y are both NA.
- # na.omit(df, cols = c("x", "y")) is stricter: it drops a row as soon as
- # either column is NA, which would also remove rows such as (NA, 7, NA).
- df[!(is.na(x) & is.na(y))]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement