library(DBI)
library(RMySQL)

baseFolder = "/datafeeds"

# 1. Set directory to the FTP folder where files arrive from Adobe ----
setwd(baseFolder)

# 2. Sort files into three separate folders ----
hitData = "./hit_data"
metaDataFolder = "./metadata"
columnDataFolder = "./columns"

# 3. Extract metadata: browsers, color_depth, country, languages, etc. ----
metaDataFiles = c("browser_type.tsv", "browser.tsv", "color_depth.tsv",
                  "connection_type.tsv", "country.tsv", "javascript_version.tsv",
                  "languages.tsv", "operating_systems.tsv", "plugins.tsv",
                  "referrer_type.tsv", "resolution.tsv", "search_engines.tsv")
# Metadata is mostly the same in every archive, so only extract it from the
# first file of the day (or when the metadata folder doesn't exist yet)
lapply(list.files(pattern = "\\.tar\\.gz$"), function(x){
  if(grepl("000000.tar.gz", x, fixed = TRUE) || !dir.exists(metaDataFolder)){
    print(paste('Extracting metadata from', x))
    untar(x, files = metaDataFiles, exdir = metaDataFolder)
  }
})
# 4. Extract column data and events ----
columnData = c("column_headers.tsv", "event.tsv")
lapply(list.files(pattern = "\\.tar\\.gz$"), function(x){
  if(grepl("000000.tar.gz", x, fixed = TRUE) || !dir.exists(columnDataFolder)){
    print(paste('Extracting column data from', x))
    untar(x, files = columnData, exdir = columnDataFolder)
  }
})
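# Optional sanity check (a sketch, not in the original script): make sure the
# files steps 7 and 8 depend on were actually extracted before continuing
stopifnot(dir.exists(metaDataFolder),
          file.exists(paste0(columnDataFolder, "/column_headers.tsv")))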
# 5. Extract all hit data and save to a separate folder ----
lapply(list.files(pattern = "\\.tar\\.gz$"), function(x){
  untar(x, files = "hit_data.tsv", exdir = hitData)
  hitDataFile = paste0(hitData, "/hit_data.tsv")
  # Rename the hit_data file to match the enclosing archive's filename
  newHitDataFile = paste0(hitData, "/", sub("\\.tar\\.gz$", ".tsv", x))
  file.rename(hitDataFile, newHitDataFile)
})
# 6. Archive datafeeds ----
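# This step is left empty above. A minimal sketch, assuming processed archives
# should simply be moved into an ./archive subfolder (the folder name is an
# assumption, not from the original script):
archiveFolder = "./archive"
if(!dir.exists(archiveFolder)){ dir.create(archiveFolder) }
for(x in list.files(pattern = "\\.tar\\.gz$")){
  file.rename(x, paste0(archiveFolder, "/", x))
}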
# 7. Import metadata into MySQL ----
conn <- dbConnect(RMySQL::MySQL(), host = "localhost", user = "rstudio",
                  password = "rstudio", dbname = "rstudio")
setwd(paste0(baseFolder, "/", metaDataFolder))

# Every lookup table shares the same two-column (id, label) shape, so wrap the
# repetitive load-and-rename logic in a function
loadlookup <- function(tblname){
  df <- read.delim(paste0(tblname, ".tsv"), header = FALSE, stringsAsFactors = FALSE)
  # overwrite = TRUE drops and recreates the table, replacing any previous load
  dbWriteTable(conn, name = tblname, value = df, row.names = FALSE, overwrite = TRUE)
  dbSendQuery(conn, paste0("ALTER TABLE ", tblname, " CHANGE COLUMN V1 id BIGINT"))
  dbSendQuery(conn, paste0("ALTER TABLE ", tblname, " CHANGE COLUMN V2 label TEXT"))
}
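# Example usage (assuming browser.tsv was extracted by step 3):
# loadlookup("browser")   # (re)creates table `browser` with columns id, label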
# Build the list of lookup-table names from the extracted .tsv files
metaDataTables = sub("\\.tsv$", "", list.files(pattern = "\\.tsv$"))

# Load each lookup table exactly once
for(tbl in metaDataTables){
  print(tbl)
  loadlookup(tbl)
}
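# Optional sanity check (a sketch, not in the original script): confirm every
# lookup table now exists in MySQL before moving on to the hit data
stopifnot(all(metaDataTables %in% dbListTables(conn)))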
# 8. Load clickstream data ----
# Set the working directory to avoid having to paste() paths together
setwd(paste0(baseFolder, "/", hitData))

# Read the column headers for the server-call (hit) data
column_headers = colnames(read.delim(
  paste0(baseFolder, "/", columnDataFolder, "/", "column_headers.tsv"),
  stringsAsFactors = FALSE
))
# Add a column to track which source file each row came from
column_headers = append(column_headers, "file")
# During testing, always drop the clickstream table
if(dbExistsTable(conn, "clickstream")){ dbRemoveTable(conn, "clickstream") }

# Loop over the entire list of hit-data files
for(file in list.files()){
  print(file)
  # During testing, read just the first 5 rows of data so things load faster:
  # hit_data = read.delim(file, header = FALSE, stringsAsFactors = FALSE, colClasses = "character", nrows = 5)
  # Use the line below for prod
  hit_data = read.delim(file, header = FALSE, stringsAsFactors = FALSE, colClasses = "character")
  # Add the source filename, then apply the header names
  hit_data$file <- file
  colnames(hit_data) = column_headers
  dbWriteTable(conn, name = 'clickstream', value = hit_data, row.names = FALSE,
               append = TRUE, overwrite = FALSE)
  rm(hit_data)
}
# Run ANALYZE in MySQL so the query planner has accurate statistics
dbGetQuery(conn, "ANALYZE TABLE clickstream")
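# Wrap-up (a sketch, not in the original script): close the MySQL connection
# once loading is complete
dbDisconnect(conn)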