SHARE
TWEET

Untitled

a guest Jul 17th, 2017 48 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # Toy data: three integer columns (x, y, z) with NAs scattered through them.
  2. (df <- data.frame(
  3.   x = c(1L, 2L, NA, 3L, NA),
  4.   y = c(4L, 5L, NA, 6L, 7L),
  5.   z = c(8L, 9L, 10L, 11L, NA)
  6. ))  # outer parentheses print the data frame after assigning it
  7. x   y   z
  8. 1   4   8
  9. 2   5   9
  10. NA  NA  10
  11. 3   6   11
  12. NA  7   NA
  13.    
  14. x   y   z
  15. 1   4   8
  16. 2   5   9
  17. 3   6   11
  18. NA  7   NA
  19.    
  20. df[with(df, !is.na(x) | !is.na(y)), ]  # keep a row when at least one of x, y is present
  21. #      x y  z
  22. #1  1 4  8
  23. #2  2 5  9
  24. #4  3 6 11
  25. #5 NA 7 NA
  26.    
  27. set.seed(237)  # fixed seed so the benchmark data are reproducible; TRUE is spelled out below because T can be reassigned
  28. df <- data.frame(x = sample(c(NA, 1:20), 1e6, replace = TRUE), y = sample(c(NA, 1:10), 1e6, replace = TRUE), z = sample(c(NA, 5:15), 1e6, replace = TRUE))
  29.  
  30. f1 <- function() df[with(df, !is.na(x) | !is.na(y)), ]  # logical-mask approach: keep rows where x or y is present
  31. f2 <- function() df[rowSums(!is.na(df[c("x", "y")])) > 0, ]  # rowSums approach: keep rows with at least one non-NA among x, y
  32. f3 <- function() df[!apply(df[c("x", "y")], 1, function(r) all(is.na(r))), ]  # row-wise apply approach: drop rows where both x and y are NA (same rows as f1/f2)
  33.  
  34. library(microbenchmark)
  35.  
  36. microbenchmark(f1(), f2(), f3(), unit="relative")
  37. # Unit: relative
  38. #expr       min        lq    median        uq       max neval
  39. # f1()  1.000000  1.000000  1.000000  1.000000  1.000000   100
  40. # f2()  1.044812  1.068189  1.138323  1.129611  0.856396   100
  41. # f3() 26.205272 25.848441 24.357665 21.799930 22.881378   100
  42.    
  43. > df[rowSums(is.na(df[c("x", "y")])) != 2, ]
  44.    x y  z
  45. 1  1 4  8
  46. 2  2 5  9
  47. 4  3 6 11
  48. 5 NA 7 NA
  49.    
  50. ## Benchmark input: 10 columns x 1 million rows, every value drawn from {NA, 1..20}
  51. set.seed(123)
  52. df <- data.frame(replicate(n = 10,
  53.   expr = sample(x = c(NA, 1:20), size = 1e6, replace = TRUE)))
  54.    
  55. f1 <- function() {
  56.   df[with(df, !is.na(X1) | !is.na(X2)), ]  # keep rows where X1 or X2 is present
  57. }
  58. f2 <- function() {
  59.   df[rowSums(!is.na(df[1:2])) > 0, ]  # keep rows with a non-NA in either of the first two columns
  60. }
  61.  
  62. library(microbenchmark)
  63. microbenchmark(f1(), f2(), times = 20)
  64. # Unit: milliseconds
  65. #  expr      min       lq   median       uq      max neval
  66. #  f1() 745.8378 1100.764 1128.047 1199.607 1310.236    20
  67. #  f2() 784.2132 1101.695 1125.380 1163.675 1303.161    20
  68.    
  69. f1_5 <- function() {
  70.   df[with(df, !is.na(X1) | !is.na(X2) | !is.na(X3) |  # keep a row if any of X1..X5 is present
  71.              !is.na(X4) | !is.na(X5)), ]
  72. }
  73. f2_5 <- function() {
  74.   df[rowSums(!is.na(df[1:5])) > 0, ]  # keep rows with a non-NA in any of the first five columns
  75. }
  76.  
  77. microbenchmark(f1_5(), f2_5(), times = 20)
  78. # Unit: seconds
  79. #    expr      min       lq   median       uq      max neval
  80. #  f1_5() 1.275032 1.294777 1.325957 1.368315 1.572772    20
  81. #  f2_5() 1.088564 1.169976 1.193282 1.225772 1.275915    20
  82.    
  83. sel <- apply(df, 1, function(r) sum(is.na(r)) > 1)  # TRUE for rows containing more than one NA (all columns counted)
  84.    
  85. df[!sel, ]  # negate the mask: sel marks the rows to remove, so df[sel, ] would return exactly those rows
  86.    
  87. sel <- apply(df[c("x", "y")], 1, function(r) sum(is.na(r)) >= 2)  # count NAs in x and y only
  88.    
  89. sel <- apply(df[c("x", "y")], 1, function(r) !any(!is.na(r)))  # same mask, phrased as "no non-NA value in x or y"
  90.    
  91. > microbenchmark( df[!with(df,is.na(x)& is.na(y)),], df[rowSums(is.na(df[c("x", "y")])) != 2, ], df[ apply( df, 1, function(x) sum(is.na(x))>1 ), ] )
  92. Unit: microseconds
  93.                                               expr     min       lq   median       uq      max neval
  94.               df[!with(df, is.na(x) & is.na(y)), ]  67.148  71.5150  76.0340  86.0155 1049.576   100
  95.         df[rowSums(is.na(df[c("x", "y")])) != 2, ] 132.064 139.8760 145.5605 166.6945  498.934   100
  96.  df[apply(df, 1, function(x) sum(is.na(x)) > 1), ] 175.372 184.4305 201.6360 218.7150  321.583   100
  97.    
  98. library(data.table)
  99.  
  100. # convert your data.frame to a data.table by reference (no copy is made)
  101.   setDT(df)
  102.  
  103.  
  104. df[!(is.na(x) & is.na(y))]  # drop rows only when BOTH x and y are NA; na.omit(df, cols = c("x", "y")) would also drop rows where just one of them is NA
RAW Paste Data
Top