Advertisement
Guest User

Untitled

a guest
Oct 10th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.50 KB | None | 0 0
  1. library(data.table)
  2. library(disk.frame)
  3. setup_disk.frame()
  4.  
  #' Benchmark a grouped mean of `dep_time` in data.table vs. disk.frame.
  #'
  #' Times the same "mean dep_time by year, month, day" aggregation three ways
  #' (in-memory data.table, a disk.frame sharded by the grouping columns, and a
  #' disk.frame created without `shardby`) and draws the elapsed times as a
  #' bar plot.
  #'
  #' @param data1 A data.frame / data.table with columns `year`, `month`, `day`
  #'   and `dep_time`. It is converted to a data.table in place by `setDT()`,
  #'   so the caller's object is modified by reference.
  #' @param n Number shown in the plot title as the duplication factor of the
  #'   flights data; it is only used for labelling.
  #' @return The value of the final `barplot()` call.
  bench_disk.frame_data.table_group_by <- function(data1, n) {
    # In-place conversion: avoids copying a potentially very large table.
    setDT(data1)

    sharded_df <- as.disk.frame(data1, shardby = c("year", "month", "day"))
    not_sharded_df <- as.disk.frame(data1)

    # Baseline: plain in-memory data.table group-by.
    data_table_timing <- system.time(
      data1[
        ,
        .(mean_dep_time = mean(dep_time, na.rm = TRUE)),
        .(year, month, day)
      ]
    )[["elapsed"]]

    # Single-stage mean on the sharded disk.frame.
    # NOTE(review): presumably valid because sharding by the grouping columns
    # keeps each group inside one chunk -- confirm against disk.frame docs.
    sharded_timing <- system.time(
      sharded_df[
        ,
        .(mean_dep_time = mean(dep_time, na.rm = TRUE)),
        .(year, month, day),
        keep = c("year", "month", "day", "dep_time")
      ]
    )[["elapsed"]]

    # Two-stage mean on the non-sharded disk.frame: first collect partial sums
    # and non-NA counts per group, then combine them into the overall mean.
    not_sharded_timing <- system.time(
      not_sharded_df[
        ,
        .(
          sum_dep_time = sum(dep_time, na.rm = TRUE),
          n = sum(!is.na(dep_time))
        ),
        .(year, month, day),
        keep = c("year", "month", "day", "dep_time")
      ][
        ,
        .(mean_dep_time = sum(sum_dep_time) / sum(n)),
        .(year, month, day)
      ]
    )[["elapsed"]]

    barplot(
      c(data_table_timing, sharded_timing, not_sharded_timing),
      names.arg = c("data.table", "sharded disk.frame", "not sharded disk.frame"),
      # `glue::glue` (double colon); the original `glue:glue` was parsed as the
      # sequence operator `:` on an undefined object `glue` and errored here.
      main = glue::glue("flights duplicated {n} times group-by year, month, day"),
      ylab = "Seconds"
    )
  }
  41.  
  42.  
  # Stack 100 copies of nycflights13::flights into one table and report how
  # long the construction takes.
  system.time(
    flights_100 <- rbindlist(
      replicate(100, nycflights13::flights, simplify = FALSE)
    )
  )

  # Collect garbage before benchmarking, then run the 100x benchmark.
  gc()
  bench_disk.frame_data.table_group_by(flights_100, 100)

  # Stack 10 copies of the 100x table to obtain the 1000x table.
  system.time(
    flights_1000 <- rbindlist(rep(list(flights_100), 10))
  )

  # Drop the 100x table and collect garbage before the larger run.
  rm(flights_100)
  gc()

  bench_disk.frame_data.table_group_by(flights_1000, 1000)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement