Advertisement
ivan866

generating a 50M line dataset

Sep 25th, 2022
2,038
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.66 KB | Science | 0 0
  # Generate a synthetic CSV dataset of DT.FINAL rows, written to 'fname.csv'
  # in chunks of at most DT.CHUNK rows so that only one chunk has to be held
  # in memory at a time.
  #
  # Columns written per row:
  #   col1 - a value drawn from STR.VALS (includes NA and a numeric-looking string)
  #   col2 - an integer drawn uniformly from 1..20
  #   col3 - an upper-case letter drawn from LETTERS
  #   col4 - a standard normal draw from rnorm()

  # library() stops immediately if data.table is not installed; require()
  # would only return FALSE and let the script fail later on data.table().
  library(data.table)

  STR.VALS <- c(
    'John, Denver', 'Michael, Davis', 'John, Carpenter', 'Steve, Hopkins',
    'Jena, Malone', 'Katy, Darabont', 'Lisa, Minelli', 'Frank, Johnson',
    NA, '986188.9999.01'
  )
  DT.CHUNK <- 5*10^7  # maximum number of rows generated and written per chunk
  DT.FINAL <- 5*10^7  # total number of rows the finished file should contain

  stopifnot(
    is.numeric(DT.CHUNK), length(DT.CHUNK) == 1L, DT.CHUNK >= 1,
    is.numeric(DT.FINAL), length(DT.FINAL) == 1L, DT.FINAL >= 0
  )

  # Round up so a trailing partial chunk is still written when DT.FINAL is
  # not an exact multiple of DT.CHUNK.
  N.CHUNKS <- ceiling(DT.FINAL / DT.CHUNK)
  APPENDED <- FALSE

  # seq_len() yields an empty sequence for zero chunks, whereas seq(1, 0)
  # would count down and run the loop twice.
  for (ch in seq_len(N.CHUNKS)) {
    # Rows still owed to the file; the last chunk may be shorter than DT.CHUNK.
    n.rows <- min(DT.CHUNK, DT.FINAL - (ch - 1) * DT.CHUNK)

    # Build the chunk in a single call. Arguments are evaluated in order, so
    # the random draws happen in the sequence col1, col2, col3, col4.
    DT <- data.table(
      col1 = sample(STR.VALS, n.rows, replace = TRUE),
      col2 = sample.int(20L, n.rows, replace = TRUE),
      col3 = sample(LETTERS, n.rows, replace = TRUE),
      col4 = rnorm(n.rows)
    )

    # The first chunk creates the file and writes the header; later chunks
    # append data rows only.
    # NOTE(review): with row.names = TRUE the row-number column appears to
    # restart at 1 for every chunk -- confirm this is acceptable before
    # running with N.CHUNKS > 1.
    fwrite(DT, 'fname.csv', append = APPENDED, col.names = !APPENDED,
           row.names = TRUE, scipen = 9)
    APPENDED <- TRUE
  }
Tags: dataset
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement