daily pastebin goal
54%
SHARE
TWEET

Untitled

a guest Dec 13th, 2018 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.  
  2. ################################ ALL FUNCTIONS ################################
  3. # get paths and deal with duplicates
  4. getPaths <- function( getName, ifDpl ){
  5.  
  6.   #all paths (with duplicates)
  7.   all <- allPaths[ which( grepl( getName, allPaths, ignore.case=TRUE ) ) ]
  8.  
  9.   if( ifDpl ){
  10.    
  11.     #paths without names
  12.     path <- unlist(strsplit( all, '[^/]+$' ))
  13.     #unique values
  14.     uniq <- !duplicated(path)
  15.     #names
  16.     names <- unlist(basename( all[uniq] ))
  17.     #default name
  18.     name <- paste0(getName,'.csv')
  19.    
  20.    
  21.     all <- c( paste0( path[uniq], names ) )
  22.    
  23.     if( length(path[!uniq]) != 0 )
  24.       all <- c( all, paste0( path[!uniq], name ) )
  25.    
  26.   }
  27.  
  28.   return( all )
  29.  
  30. }
  31.  
  32. # gather data from .xlsx and .csv files to data.frame
  33. gatherData <- function( all, func ){
  34.  
  35.   df <- data.frame()
  36.  
  37.   for( i in all ){
  38.     # print(i)
  39.     isCsv <- tail( unlist(strsplit( i, '[.]' )), n=1 ) == 'csv'
  40.    
  41.     gather <- function(name) {
  42.       switch(name,
  43.              price = do_price( i, isCsv ),
  44.              stadium = do_stadiums( i, isCsv )
  45.       )
  46.     }
  47.    
  48.     temp <- gather(func)
  49.    
  50.     #if theres nothing to bind, break
  51.     if( nrow(temp) == 0 ) next
  52.    
  53.    
  54.     #mandatory fields with data from folder names
  55.     temp[ , 'SPORT' ] <- sapply( strsplit(i, '/'), '[[', 7 )
  56.     temp[ , 'CODE' ] <- sapply( strsplit(i, '/'), '[[', 8 )
  57.    
  58.     df <- rbind(df, na.omit(temp))
  59.    
  60.   }
  61.   colnames(df)[3] <- 'AVERAGE.TICKET.PRICE'
  62.   colnames(df)[5] <- 'AVERAGE.TICKET.PRICE_USD'
  63.  
  64.   write.csv( df, file = paste0(func,'.csv'), row.names=FALSE, na='' )
  65.   return(df)
  66. }
  67.  
  68. # process stadium table data
  69. do_stadiums <- function( dataSet, isCsv ){
  70.  
  71.   temp <- data.frame()
  72.   get <- c('NAME','CAPACITY','TEAM','COMMENT','LINK','DATE')
  73.  
  74.   if (isCsv){
  75.    
  76.     #get all files in this directory
  77.     path <- unlist(strsplit( dataSet, '[^/]+$' ))
  78.     all <- dir( path, full.names = TRUE, recursive = TRUE )
  79.    
  80.     #filter fixed files and get DATE list
  81.     filtered <- all[ which( grepl( '[0-9]{2}_f.', all ) ) ]
  82.     date <- sapply( strsplit(filtered, '_'), '[[', 2 )
  83.    
  84.     for (i in date)
  85.       temp <- rbind( temp, na.omit(data.frame( read.csv(dataSet), DATE = i )[ , get ] ))
  86.    
  87.   }
  88.   else{
  89.    
  90.     #get all sheet names
  91.     sheetNames <- readxl::excel_sheets(dataSet)
  92.    
  93.     for (j in sheetNames){
  94.      
  95.       #check if sheet isn't empty
  96.       num <- nrow( read_excel(dataSet, sheet = j) )
  97.       if( num == 0 ) next
  98.      
  99.       sheet <- data.frame( read_excel( dataSet, sheet = j ), 'DATE' = j )
  100.      
  101.       #find missing columns and add them to data.frame
  102.       missing <- get[ !is.element( get, names(sheet) ) ]
  103.       sheet[,missing] <- NA
  104.      
  105.       temp <- rbind( temp, na.omit(sheet[, get]) )
  106.     }
  107.   }
  108.  
  109.   return(temp)
  110.  
  111. }
  112.  
  113. # process price table data
  114. do_price <- function( dataSet, isCsv ){
  115.  
  116.   years <- c('2014', '2015', '2016', '2017', '2018', '2014-15', '2015-16', '2016-17', '2017-18')
  117.  
  118.   temp <- data.frame()
  119.  
  120.   if (isCsv){
  121.     # no such files
  122.   }
  123.   else{
  124.    
  125.     #get all sheet names
  126.     sheetNames <- readxl::excel_sheets(dataSet)
  127.     sheetNames <- years[ is.element( years, sheetNames ) ]
  128.    
  129.     for (j in sheetNames){
  130.       # print(j)
  131.       #check if sheet isn't empty
  132.       num <- nrow( read_excel(dataSet, sheet = j) ) #check first value. if empty skip?
  133.       if( num == 0 ) next
  134.      
  135.       ifUS <- grepl( 'US', basename( dataSet ) )
  136.       datafr <- data.frame( read_excel( dataSet, sheet = j ))
  137.  
  138.       temp <- rbind( temp, na.omit(data.frame(
  139.         'DATE'                       = j,
  140.         'CLUB'                       = datafr[[1]],
  141.         if (typeof(datafr[[2]])      == "character"){
  142.           'AVERAGE.TICKET.PRICE'     = datafr[[2]]
  143.         }else{
  144.           'AVERAGE.TICKET.PRICE'     = round(datafr[[2]], 2)
  145.         },
  146.         'CURRENCY'                   = datafr[[3]],
  147.         if (typeof(datafr[[ ifelse( ifUS, 2, 4 ) ]]) == "character"){
  148.           'AVERAGE.TICKET.PRICE_USD' = datafr[[ ifelse( ifUS, 2, 4 ) ]]
  149.         }else{
  150.           'AVERAGE.TICKET.PRICE_USD' = round(datafr[[ ifelse( ifUS, 2, 4 ) ]], 2)
  151.         },
  152.         stringsAsFactors = FALSE
  153.       )))
  154.     }
  155.   }
  156.  
  157.   return(temp)
  158. }
  159.  
  160. # process totals table data
  161. doTotals <- function(allPaths){
  162.  
  163.   k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
  164.   #preparing full paths
  165.  
  166.   k <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), '_totals_f.csv')
  167.  
  168.   #paths that are wrong. NOTES
  169.   # k <- k[!k %in% allPaths[allPaths %in% k]]
  170.  
  171.   #taking paths that exist
  172.   k <- allPaths[allPaths %in% k]
  173.  
  174.   df <- data.frame()
  175.  
  176.   for( i in k ){
  177.     code <- sapply(strsplit(i, '/'), '[[', 8)
  178.     sport_type <- sapply(strsplit(i, '/'), '[[', 7)
  179.     temp <- read.csv(file=i)
  180.    
  181.     #this step is important because at the begining R thinks that 2016 is a numeric value. Later for example if 2016-17 would be binded, R would import wrong data
  182.     temp$YEAR = as.character(temp$YEAR)
  183.    
  184.     #adding few more columns
  185.     temp$code = code
  186.     temp$sport_type = sport_type
  187.    
  188.     #rbind(df, temp) wont work because of different columns in temp dataframes. plyr is cool
  189.     df <- plyr::rbind.fill(df, na.omit(temp))
  190.   }
  191.   write.csv( df, file = paste0('totals','.csv'), row.names=FALSE, na='' )
  192.   return(df)
  193. }
  194.  
  195. # process fixed table data
  196. doFixed <- function(allPaths, k){
  197.   #all possible endings of each fixed and not fixed files
  198.   possibilities_f = c('_2014_f', '_2015_f', '_2016_f', '_2017_f', '_2018_f', '_2014-15_f', '_2015-16_f', '_2016-17_f', '_2017-18_f')
  199.  
  200.   df <- data.frame()
  201.   for(i in possibilities_f){
  202.     kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, '.csv')
  203.    
  204.     for (j in kk){
  205.       if (file.exists(j) && nrow(read.csv(file=j) != 0)){
  206.         code <- sapply(strsplit(j, '/'), '[[', 8)
  207.         sport_type <- sapply(strsplit(j, '/'), '[[', 7)
  208.        
  209.         temp <- read.csv(file=j)
  210.        
  211.         #renaming wrongly named columns so I could select them
  212.         if("BC.Zaporizhya" %in% colnames(temp)){
  213.           names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
  214.         }
  215.         if("SCORE" %in% colnames(temp)){
  216.           names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
  217.         }
  218.        
  219.         #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
  220.         if(!"MATCH_LINK" %in% colnames(temp)){
  221.           temp$MATCH_LINK = ''
  222.         }
  223.        
  224.         temp$SEASON_PART <- as.character(temp$SEASON_PART)
  225.         temp$MATCH_LINK <- as.character(temp$MATCH_LINK)
  226.        
  227.         temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
  228.           'code'        = code,
  229.           'sport_type'  = sport_type,
  230.           'DATE'        = temp[[1]],
  231.           'SEASON_PART' = temp$SEASON_PART,
  232.           'MATCH_LINK'  = temp$MATCH_LINK,
  233.           'HOME_TEAM'   = temp$HOME_TEAM,
  234.           'RESULT'      = temp$RESULT,
  235.           'AWAY_TEAM'   = temp$AWAY_TEAM,
  236.           'VENUE'       = temp$VENUE,
  237.           'ATTENDANCE'  = temp$ATTENDANCE,
  238.           'COMMENT'     = temp$COMMENT,
  239.           'CAPACITY'    = temp$CAPACITY
  240.         )
  241.         df <- rbind(df, na.omit(temp))
  242.         # df <- plyr::rbind.fill(df, temp)
  243.       }
  244.     }
  245.   }
  246.   write.csv( df, file = paste0('fixed','.csv'), row.names=FALSE, na='' )
  247.   return(df)
  248. }
  249.  
  250. # process not fixed table data
  251. doNotFixed <- function(allPaths, fileType, k, howToRead){
  252.   #generating all possible paths from allPaths variable
  253.   possibilities = c('_2014', '_2015', '_2016', '_2017', '_2018', '_2014-15', '_2015-16', '_2016-17', '_2017-18')
  254.  
  255.   df <- data.frame()
  256.   for(i in possibilities){
  257.     kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, fileType)
  258.    
  259.     for (j in kk){
  260.       ############################
  261.       if (file.exists(j) && nrow(eval(parse(text=howToRead))) != 0){
  262.         # print(j)
  263.         code <- sapply(strsplit(j, '/'), '[[', 8)
  264.         sport_type <- sapply(strsplit(j, '/'), '[[', 7)
  265.        
  266.         temp <-eval(parse(text=howToRead))
  267.        
  268.         #renaming wrongly named columns so I could select them
  269.         if("LINK" %in% colnames(temp)){
  270.           names(temp)[names(temp) == 'LINK'] <- 'MATCH_LINK'
  271.         }
  272.         if("SEASON" %in% colnames(temp)){
  273.           names(temp)[names(temp) == 'SEASON'] <- 'SEASON_PART'
  274.         }
  275.         if("HOME" %in% colnames(temp)){
  276.           names(temp)[names(temp) == 'HOME'] <- 'HOME_TEAM'
  277.         }
  278.         if("AWAY" %in% colnames(temp)){
  279.           names(temp)[names(temp) == 'AWAY'] <- 'AWAY_TEAM'
  280.         }
  281.         if("BC.Zaporizhya" %in% colnames(temp)){
  282.           names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
  283.         }
  284.         if("SCORE" %in% colnames(temp)){
  285.           names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
  286.         }
  287.         if("Sportcomplex.Olymp" %in% colnames(temp)){
  288.           names(temp)[names(temp) == 'Sportcomplex.Olymp'] <- 'VENUE'
  289.         }
  290.        
  291.         #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
  292.         if(!"SEASON_PART" %in% colnames(temp)){
  293.           temp$SEASON_PART = ''
  294.         }
  295.         if(!"VENUE" %in% colnames(temp)){
  296.           temp$VENUE = ''
  297.         }
  298.         if(!"MATCH_LINK" %in% colnames(temp)){
  299.           temp$MATCH_LINK = ''
  300.         }
  301.        
  302.         temp$SEASON_PART <- as.character(temp$SEASON_PART)
  303.        
  304.         temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
  305.           'code'        = code,
  306.           'sport_type'  = sport_type,
  307.           'DATE'        = as.character(temp[[1]]),
  308.           'SEASON_PART' = temp$SEASON_PART,
  309.           'MATCH_LINK'  = temp$MATCH_LINK,#this was not in the sport tables file
  310.           'HOME_TEAM'   = temp$HOME_TEAM,
  311.           'RESULT'      = temp$RESULT,
  312.           'AWAY_TEAM'   = temp$AWAY_TEAM,
  313.           'VENUE'       = temp$VENUE,
  314.           'ATTENDANCE'  = temp$ATTENDANCE
  315.         )
  316.         df <- rbind(df, na.omit(temp))
  317.         # df <- plyr::rbind.fill(df, na.omit(temp))
  318.       }
  319.     }
  320.   }
  321.   return(df)
  322. }
  323. ################################ MAIN STUFF ################################
  324. # install.packages('magrittr')
  325. # install.packages('dplyr')
  326. # install.packages('tidyverse')
  327. # install.packages('readxl')
  328. # install.packages('xlsx')
  329.  
  330.  
  331. library(magrittr)
  332. library(dplyr)
  333. library(tidyverse)
  334. library(readxl)
  335. library(xlsx)
  336.  
  337. #listing all paths recursively
  338. allPaths <- dir( "C:/Users/mindaugas.venskus/Documents/R/Sports", full.names = TRUE, recursive = TRUE )
  339.  
  340. #revome ~$ files
  341. allPaths <- allPaths[ which( !grepl( '[~$]', allPaths) ) ]
  342.  
  343. #revome files from irrelevant folders
  344. allPaths <- allPaths[ which( grepl( '-', sapply( strsplit(allPaths, '/'), '[[', 8 ) ) ) ]
  345.  
  346. ################################ prices ################################
  347.  
  348. price <- getPaths('price', FALSE )# name of folder, if remove duplicates
  349. prices <- gatherData( price, 'price' )# paths, switch_case to process them
  350. rm(price)
  351.  
  352. ################################ stadiums ################################
  353.  
  354. stadium <- getPaths('Stadiums', TRUE )
  355. stadiums <- gatherData( stadium, 'stadium' )
  356. rm(stadium)
  357.  
  358. ################################ totals ################################
  359.  
  360. totals <- doTotals(allPaths)
  361.  
  362. ################################ FIXED ################################
  363.  
  364. #generating all possible paths from allPaths variable
  365. k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
  366. fixed <- doFixed(allPaths, k)
  367.  
  368. ################################ NOT FIXED ################################
  369.  
  370. # binding CSV and XLSX data together
  371. notFixed <- rbind(doNotFixed(allPaths, '.csv', k, 'read.csv(j)'), doNotFixed(allPaths, '.xlsx', k, 'read_xlsx(j)'))
  372. write.csv( notFixed, file = paste0('notFixed','.csv'), row.names=FALSE, na='' )
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top