celestialgod

data manipulation

Jul 12th, 2015
479
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.42 KB | None | 0 0
  1. library(data.table)
  2. library(plyr)
  3. library(dplyr)
  4. library(magrittr)
  5. library(microbenchmark)
  6.  
  # ---- Benchmark data ----
  # Build a reproducible n_rows x n_cols table of Binomial(15, 0.5) draws
  # (stored as strings) and serialise it to CSV text. Every "15" is then
  # replaced by the token "Null", so a column containing it is not parsed as
  # integer unless "Null" is declared as an NA string.
  set.seed(100)
  n_rows <- 5e4
  n_cols <- 200
  dat_dt <- data.table(
    matrix(as.character(rbinom(n_rows * n_cols, 15, .5)), n_rows)
  )
  # One comma-separated string per row, rows joined by newlines. Built with
  # base paste() because tbl_dt() has been removed from dplyr and mutate_() is
  # deprecated.
  txt <- paste(
    do.call(paste, c(unname(as.list(dat_dt)), list(sep = ","))),
    collapse = "\n"
  )
  txt <- gsub("15", "Null", txt)

  # Parse the same text in several ways. read.csv(text = ) opens and closes
  # its own connection, so no text connection is left open. stringsAsFactors
  # is spelled out for the "default" variant because the factor-based methods
  # below need factor columns and the default became FALSE in R 4.0.0.
  dat_df_default <- read.csv(
    text = txt, header = FALSE, stringsAsFactors = TRUE
  )
  dat_df_notFactors <- read.csv(
    text = txt, header = FALSE, stringsAsFactors = FALSE
  )
  dat_df_nastrings <- read.csv(
    text = txt, header = FALSE, na.strings = "Null"
  )
  dat_dt_default <- fread(txt, header = FALSE)
  dat_dt_nastrings <- fread(txt, header = FALSE, na.strings = "Null")

  # Count the columns that contain "Null" and were therefore read as text.
  sum(vapply(dat_df_notFactors, is.character, logical(1)))
  # 156 = number of character(factor) columns
  23.  
  # Convert every column of `df` that has class `classToChange` to integer.
  #
  # df            : a data.frame.
  # classToChange : "factor" (converted through as.character() so the labels,
  #                 not the internal level codes, are parsed) or "character".
  #                 Any other value leaves `df` untouched.
  # Returns `df` with the selected columns replaced by integer vectors.
  # Values that cannot be parsed become NA (as.integer() warns).
  convert_df_int <- function(df, classToChange) {
    to_int <- switch(classToChange,
      factor = function(x) as.integer(as.character(x)),
      character = as.integer,
      NULL
    )
    if (is.null(to_int)) {
      return(df)
    }
    # Locate the target columns once. inherits() also matches columns that
    # carry more than one class (e.g. ordered factors).
    is_target <- vapply(df, inherits, logical(1), what = classToChange)
    idx <- which(is_target)
    if (length(idx) > 0) {
      # lapply() keeps one vector per column instead of building a matrix.
      df[idx] <- lapply(df[idx], to_int)
    }
    df
  }
  31.  
  # Convert all character columns of `df` to integer in one step by routing
  # them through a single character matrix.
  #
  # df : a data.frame.
  # Returns `df` with its character columns replaced by integer columns.
  convert_df_int_2 <- function(df) {
    col_classes <- sapply(df, class)
    char_cols <- which(col_classes == "character")
    char_block <- as.matrix(df[, char_cols])
    df[, char_cols] <- as.integer(char_block)
    df
  }
  37.  
  # dplyr variant: convert every column of `df` that has class
  # `classToChange` to integer with mutate(across(...)).
  #
  # df            : a data.frame (or an object dplyr::mutate() accepts).
  # classToChange : "factor" (converted through as.character() so the labels
  #                 are parsed) or "character". Any other value returns `df`
  #                 unchanged, matching convert_df_int().
  # Returns the table with the selected columns converted to integer.
  convert_df_mutate_each <- function(df, classToChange) {
    to_int <- switch(classToChange,
      factor = function(x) as.integer(as.character(x)),
      character = as.integer,
      NULL
    )
    if (is.null(to_int)) {
      return(df)
    }
    is_target <- vapply(df, inherits, logical(1), what = classToChange)
    target_cols <- names(which(is_target))
    # across() replaces the deprecated mutate_each_()/funs() pair. all_of()
    # selects nothing for an empty vector, so other columns are not touched.
    # The dplyr:: prefix avoids picking up plyr::mutate() by accident.
    dplyr::mutate(df, dplyr::across(dplyr::all_of(target_cols), to_int))
  }
  45.  
  # transform() variant: convert every column of `df` that has class
  # `classToChange` to integer through a single transform() call.
  #
  # df            : a data.frame (or an object with a transform() method).
  # classToChange : "factor" (converted through as.character() so the labels
  #                 are parsed) or "character". Any other value returns `df`
  #                 unchanged.
  # Returns the table with the selected columns converted to integer.
  convert_df_transform <- function(df, classToChange) {
    wrap <- switch(classToChange,
      factor = function(col) bquote(as.integer(as.character(.(col)))),
      character = function(col) bquote(as.integer(.(col))),
      NULL
    )
    if (is.null(wrap)) {
      return(df)
    }
    is_target <- vapply(df, inherits, logical(1), what = classToChange)
    target_cols <- names(which(is_target))
    # Nothing to convert: transform() needs at least one expression.
    if (length(target_cols) == 0) {
      return(df)
    }
    # Build `transform(df, V1 = as.integer(V1), ...)` from symbols instead of
    # pasting and parsing source text, so any column name is handled.
    conversions <- lapply(target_cols, function(nm) wrap(as.name(nm)))
    names(conversions) <- target_cols
    eval(as.call(c(list(quote(transform), quote(df)), conversions)))
  }
  54.  
  # ---- Run every conversion method once ----
  # Methods 1-4: plyr::colwise() applies a per-column converter. is.factor()
  # and is.character() are used instead of `class(x) == "..."`, which yields a
  # length > 1 condition (an error in if()) for columns with several classes.
  df_method_1 <- colwise(
    function(x) if (is.factor(x)) as.integer(as.character(x)) else x
  )(dat_df_default)
  df_method_2 <- colwise(
    function(x) if (is.character(x)) as.integer(x) else x
  )(dat_df_notFactors)
  dt_method_3 <- colwise(
    function(x) if (is.character(x)) as.integer(x) else x
  )(dat_dt_default)
  dt_method_4 <- colwise(
    function(x) if (is.character(x)) as.integer(x) else x
  )(dat_dt_nastrings)

  # Methods 5-7: base R column replacement.
  df_method_5 <- convert_df_int(dat_df_default, "factor")
  df_method_6 <- convert_df_int(dat_df_notFactors, "character")
  df_method_7 <- convert_df_int_2(dat_df_notFactors)

  # Methods 8-11: dplyr and transform() on data.frames.
  df_method_8 <- convert_df_mutate_each(dat_df_default, "factor")
  df_method_9 <- convert_df_mutate_each(dat_df_notFactors, "character")
  df_method_10 <- convert_df_transform(dat_df_default, "factor")
  df_method_11 <- convert_df_transform(dat_df_notFactors, "character")

  # Methods 12-15: the same helpers on the tables read by fread().
  dt_method_12 <- convert_df_mutate_each(dat_dt_default, "character")
  dt_method_13 <- convert_df_mutate_each(dat_dt_nastrings, "character")
  dt_method_14 <- convert_df_transform(dat_dt_default, "character")
  dt_method_15 <- convert_df_transform(dat_dt_nastrings, "character")
  70.  
  # ---- Check every result against the reference ----
  # dat_df_nastrings (read with na.strings = "Null") is the expected outcome.
  # Attributes are ignored so data.frame and data.table results can both be
  # compared with it. Methods 3 and 4 are stored as dt_method_3/dt_method_4.
  all.equal(dat_df_nastrings, df_method_1, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_2, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, dt_method_3, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, dt_method_4, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_5, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_6, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_7, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_8, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_9, check.attributes = FALSE)  # TRUE
  all.equal(dat_df_nastrings, df_method_10, check.attributes = FALSE) # TRUE
  all.equal(dat_df_nastrings, df_method_11, check.attributes = FALSE) # TRUE
  all.equal(dat_df_nastrings, dt_method_12, check.attributes = FALSE) # TRUE
  all.equal(dat_df_nastrings, dt_method_13, check.attributes = FALSE) # TRUE
  all.equal(dat_df_nastrings, dt_method_14, check.attributes = FALSE) # TRUE
  all.equal(dat_df_nastrings, dt_method_15, check.attributes = FALSE) # TRUE
  86.  
  # ---- Timing ----
  # Time each conversion method 20 times on the same inputs used above.
  # is.factor()/is.character() replace `class(x) == "..."`, which yields a
  # length > 1 condition (an error in if()) for columns with several classes.
  microbenchmark(
    method_1 = colwise(
      function(x) if (is.factor(x)) as.integer(as.character(x)) else x
    )(dat_df_default),
    method_2 = colwise(
      function(x) if (is.character(x)) as.integer(x) else x
    )(dat_df_notFactors),
    method_3 = colwise(
      function(x) if (is.character(x)) as.integer(x) else x
    )(dat_dt_default),
    method_4 = colwise(
      function(x) if (is.character(x)) as.integer(x) else x
    )(dat_dt_nastrings),
    method_5 = convert_df_int(dat_df_default, "factor"),
    method_6 = convert_df_int(dat_df_notFactors, "character"),
    method_7 = convert_df_int_2(dat_df_notFactors),
    method_8 = convert_df_mutate_each(dat_df_default, "factor"),
    method_9 = convert_df_mutate_each(dat_df_notFactors, "character"),
    method_10 = convert_df_transform(dat_df_default, "factor"),
    method_11 = convert_df_transform(dat_df_notFactors, "character"),
    method_12 = convert_df_mutate_each(dat_dt_default, "character"),
    method_13 = convert_df_mutate_each(dat_dt_nastrings, "character"),
    method_14 = convert_df_transform(dat_dt_default, "character"),
    method_15 = convert_df_transform(dat_dt_nastrings, "character"),
    times = 20
  )
Advertisement
Add Comment
Please, Sign In to add comment