Advertisement
Guest User

Untitled

a guest
Aug 5th, 2016
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.74 KB | None | 0 0
  1. library(DBI)
  2. library(RMySQL)
  3.  
  4. baseFolder = "/datafeeds"
  5. # 1. Setting directory to FTP folder where files incoming from Adobe ----
  6. setwd(baseFolder)
  7.  
  8. # 2. Sort files into three separate folders ----
  9. hitData = "./hit_data"
  10. metaDataFolder="./metadata"
  11. columnDataFolder = "./columns"
  12.  
  13. # 3. Extract metadata; browsers, colour_deption, country, languages, etc ----
  14. metaDataFiles = c("browser_type.tsv","browser.tsv","color_depth.tsv","connection_type.tsv","country.tsv","javascript_version.tsv","languages.tsv","operating_systems.tsv","plugins.tsv","referrer_type.tsv","resolution.tsv","search_engines.tsv")
  15. lapply(list.files(pattern = "*.tar.gz"), function(x){
  16. if(grepl(pattern = "000000.tar.gz", x) || !dir.exists(metaDataFolder)){
  17. print(paste('Extracting metadata from',x))
  18. untar(x, files=metaDataFiles, exdir=metaDataFolder)
  19. }
  20. })
  21.  
  22. # 4. Extract column data and events ----
  23. columnData = c("column_headers.tsv","event.tsv")
  24. lapply(list.files(pattern = "*.tar.gz"), function(x){
  25. if(grepl(pattern = "000000.tar.gz", x) || !dir.exists(columnDataFolder)){
  26. print(paste('Extracting column data from',x))
  27. untar(x, files=columnData, exdir=columnDataFolder)
  28. }
  29. })
  30.  
  31. # 5. Extract all Hit Data and save to a separate folder ----
  32. lapply(list.files(pattern = "*.tar.gz"), function(x){
  33. untar(x, files="hit_data.tsv", exdir=hitData );
  34. hitDataFile = paste0(hitData, "/hit_data.tsv");
  35. # rename the hit_data file to match enclosing filename.
  36. newHitDataFile = paste0(hitData, "/", sub(".tar.gz",".tsv",x))
  37. file.rename(hitDataFile, newHitDataFile)
  38. })
  39.  
# 6. Archive datafeeds ----

# 7. Importing metadata into MySQL ----
# NOTE(review): credentials are hard-coded here; consider moving them to
# environment variables or a config file kept out of version control.
conn <- dbConnect(RMySQL::MySQL(), host="localhost", user="rstudio", password="rstudio", dbname="rstudio");

# Work from the extracted-metadata folder so the file names below stay relative.
setwd(paste0(baseFolder, "/", metaDataFolder))
  46.  
  47. #Create function due to repetitiveness
  48. #Since we're loading lookup tables with mostly same values each time, put source file in table
  49. loadlookup <- function(tblname){
  50. df <- read.csv2(paste(tblname,".tsv", sep=""), sep = "\t", header = FALSE, stringsAsFactors = FALSE)
  51. if(dbExistsTable(conn, tblname)){ dbRemoveTable(conn,tblname) }
  52. dbWriteTable(conn, name=tblname, value=df, row.names=FALSE,overwrite=TRUE,append=FALSE )
  53. dbSendQuery(conn, paste0("ALTER TABLE ", tblname, " CHANGE COLUMN V1 id BIGINT"))
  54. dbSendQuery(conn, paste0("ALTER TABLE ", tblname, " CHANGE COLUMN V2 label TEXT"))
  55. }
  56.  
  57. #untar files, place in directory by day
  58. metaDataTables = unlist(lapply(list.files(pattern = "*.tsv"), function(x){print(sub(".tsv","",x))}))
  59. for(file in list.files(pattern = "*.tsv")){
  60. print(file)
  61.  
  62. for(tbl in metaDataTables){
  63. loadlookup(tbl)
  64. }
  65. }
  66.  
  67. # 8. Load clickstream data ----
  68. #Set directory to avoid having to use paste to build urls
  69. setwd(paste0(baseFolder, "/", hitData))
  70.  
  71. #Set column headers for server calls
  72. column_headers = unlist(colnames(read.delim(
  73. paste0(baseFolder, "/", columnDataFolder, "/", "column_headers.tsv"),
  74. stringsAsFactors=FALSE
  75. )))
  76. column_headers = append(column_headers, "file")
  77.  
  78. # During testing, always drop the clickstream table.
  79. if(dbExistsTable(conn, "clickstream")){dbRemoveTable(conn, "clickstream")}
  80.  
  81. # Loop over entire list of files
  82. for(file in list.files()){
  83. print(file)
  84. # During testing, just use first 5 rows of data so things load faster.
  85. #hit_data = read.csv2(file, sep = "\t", header = FALSE, stringsAsFactors = FALSE, colClasses = "character", nrows=5)
  86. # Use the below line for prod
  87. hit_data = read.csv2(file, sep = "\t", header = FALSE, stringsAsFactors = FALSE, colClasses = "character")
  88. # Add the filename
  89. hit_data$file <- file
  90. colnames(hit_data) = column_headers
  91. dbWriteTable(conn, name = 'clickstream', value = hit_data, append = T, overwrite = F, col.names=column_headers)
  92. rm(hit_data)
  93. }
  94.  
# Run ANALYZE in MySQL so the query planner has accurate table statistics.
# MySQL's ANALYZE TABLE returns a small status result set, hence dbGetQuery()
# rather than dbExecute(); the returned status rows are simply discarded.
dbGetQuery(conn, "ANALYZE TABLE clickstream")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement