Untitled


################################ ALL FUNCTIONS ################################
# get paths and deal with duplicates
getPaths <- function( getName, ifDpl ){

  #all paths (with duplicates)
  all <- allPaths[ which( grepl( getName, allPaths, ignore.case=TRUE ) ) ]

  if( ifDpl ){

    #paths without names
    path <- unlist(strsplit( all, '[^/]+$' ))
    #unique values
    uniq <- !duplicated(path)
    #names
    names <- unlist(basename( all[uniq] ))
    #default name
    name <- paste0(getName,'.csv')


    all <- c( paste0( path[uniq], names ) )

    if( length(path[!uniq]) != 0 )
      all <- c( all, paste0( path[!uniq], name ) )

  }

  return( all )

}

# gather data from .xlsx and .csv files to data.frame
gatherData <- function( all, func ){

  df <- data.frame()

  for( i in all ){
    # print(i)
    isCsv <- tail( unlist(strsplit( i, '[.]' )), n=1 ) == 'csv'

    gather <- function(name) {
      switch(name,
             price = do_price( i, isCsv ),
             stadium = do_stadiums( i, isCsv )
      )
    }

    temp <- gather(func)

    #if theres nothing to bind, break
    if( nrow(temp) == 0 ) next


    #mandatory fields with data from folder names
    temp[ , 'SPORT' ] <- sapply( strsplit(i, '/'), '[[', 7 )
    temp[ , 'CODE' ] <- sapply( strsplit(i, '/'), '[[', 8 )

    df <- rbind(df, na.omit(temp))

  }
  colnames(df)[3] <- 'AVERAGE.TICKET.PRICE'
  colnames(df)[5] <- 'AVERAGE.TICKET.PRICE_USD'

  write.csv( df, file = paste0(func,'.csv'), row.names=FALSE, na='' )
  return(df)
}

# process stadium table data
do_stadiums <- function( dataSet, isCsv ){

  temp <- data.frame()
  get <- c('NAME','CAPACITY','TEAM','COMMENT','LINK','DATE')

  if (isCsv){

    #get all files in this directory
    path <- unlist(strsplit( dataSet, '[^/]+$' ))
    all <- dir( path, full.names = TRUE, recursive = TRUE )

    #filter fixed files and get DATE list
    filtered <- all[ which( grepl( '[0-9]{2}_f.', all ) ) ]
    date <- sapply( strsplit(filtered, '_'), '[[', 2 )

    for (i in date)
      temp <- rbind( temp, na.omit(data.frame( read.csv(dataSet), DATE = i )[ , get ] ))

  }
  else{

    #get all sheet names
    sheetNames <- readxl::excel_sheets(dataSet)

    for (j in sheetNames){

      #check if sheet isn't empty
      num <- nrow( read_excel(dataSet, sheet = j) )
      if( num == 0 ) next

      sheet <- data.frame( read_excel( dataSet, sheet = j ), 'DATE' = j )

      #find missing columns and add them to data.frame
      missing <- get[ !is.element( get, names(sheet) ) ]
      sheet[,missing] <- NA

      temp <- rbind( temp, na.omit(sheet[, get]) )
    }
  }

  return(temp)

}

# process price table data
do_price <- function( dataSet, isCsv ){

  years <- c('2014', '2015', '2016', '2017', '2018', '2014-15', '2015-16', '2016-17', '2017-18')

  temp <- data.frame()

  if (isCsv){
    # no such files
  }
  else{

    #get all sheet names
    sheetNames <- readxl::excel_sheets(dataSet)
    sheetNames <- years[ is.element( years, sheetNames ) ]

    for (j in sheetNames){
      # print(j)
      #check if sheet isn't empty
      num <- nrow( read_excel(dataSet, sheet = j) ) #check first value. if empty skip?
      if( num == 0 ) next

      ifUS <- grepl( 'US', basename( dataSet ) )
      datafr <- data.frame( read_excel( dataSet, sheet = j ))

      temp <- rbind( temp, na.omit(data.frame(
        'DATE'                       = j,
        'CLUB'                       = datafr[[1]],
        if (typeof(datafr[[2]])      == "character"){
          'AVERAGE.TICKET.PRICE'     = datafr[[2]]
        }else{
          'AVERAGE.TICKET.PRICE'     = round(datafr[[2]], 2)
        },
        'CURRENCY'                   = datafr[[3]],
        if (typeof(datafr[[ ifelse( ifUS, 2, 4 ) ]]) == "character"){
          'AVERAGE.TICKET.PRICE_USD' = datafr[[ ifelse( ifUS, 2, 4 ) ]]
        }else{
          'AVERAGE.TICKET.PRICE_USD' = round(datafr[[ ifelse( ifUS, 2, 4 ) ]], 2)
        },
        stringsAsFactors = FALSE
      )))
    }
  }

  return(temp)
}

# process totals table data
doTotals <- function(allPaths){

  k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
  #preparing full paths

  k <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), '_totals_f.csv')

  #paths that are wrong. NOTES
  # k <- k[!k %in% allPaths[allPaths %in% k]]

  #taking paths that exist
  k <- allPaths[allPaths %in% k]

  df <- data.frame()

  for( i in k ){
    code <- sapply(strsplit(i, '/'), '[[', 8)
    sport_type <- sapply(strsplit(i, '/'), '[[', 7)
    temp <- read.csv(file=i)

    #this step is important because at the begining R thinks that 2016 is a numeric value. Later for example if 2016-17 would be binded, R would import wrong data
    temp$YEAR = as.character(temp$YEAR)

    #adding few more columns
    temp$code = code
    temp$sport_type = sport_type

    #rbind(df, temp) wont work because of different columns in temp dataframes. plyr is cool
    df <- plyr::rbind.fill(df, na.omit(temp))
  }
  write.csv( df, file = paste0('totals','.csv'), row.names=FALSE, na='' )
  return(df)
}

# process fixed table data
doFixed <- function(allPaths, k){
  #all possible endings of each fixed and not fixed files
  possibilities_f = c('_2014_f', '_2015_f', '_2016_f', '_2017_f', '_2018_f', '_2014-15_f', '_2015-16_f', '_2016-17_f', '_2017-18_f')

  df <- data.frame()
  for(i in possibilities_f){
    kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, '.csv')

    for (j in kk){
      if (file.exists(j) && nrow(read.csv(file=j) != 0)){
        code <- sapply(strsplit(j, '/'), '[[', 8)
        sport_type <- sapply(strsplit(j, '/'), '[[', 7)

        temp <- read.csv(file=j)

        #renaming wrongly named columns so I could select them
        if("BC.Zaporizhya" %in% colnames(temp)){
          names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
        }
        if("SCORE" %in% colnames(temp)){
          names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
        }

        #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
        if(!"MATCH_LINK" %in% colnames(temp)){
          temp$MATCH_LINK = ''
        }

        temp$SEASON_PART <- as.character(temp$SEASON_PART)
        temp$MATCH_LINK <- as.character(temp$MATCH_LINK)

        temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
          'code'        = code,
          'sport_type'  = sport_type,
          'DATE'        = temp[[1]],
          'SEASON_PART' = temp$SEASON_PART,
          'MATCH_LINK'  = temp$MATCH_LINK,
          'HOME_TEAM'   = temp$HOME_TEAM,
          'RESULT'      = temp$RESULT,
          'AWAY_TEAM'   = temp$AWAY_TEAM,
          'VENUE'       = temp$VENUE,
          'ATTENDANCE'  = temp$ATTENDANCE,
          'COMMENT'     = temp$COMMENT,
          'CAPACITY'    = temp$CAPACITY
        )
        df <- rbind(df, na.omit(temp))
        # df <- plyr::rbind.fill(df, temp)
      }
    }
  }
  write.csv( df, file = paste0('fixed','.csv'), row.names=FALSE, na='' )
  return(df)
}

# process not fixed table data
doNotFixed <- function(allPaths, fileType, k, howToRead){
  #generating all possible paths from allPaths variable
  possibilities = c('_2014', '_2015', '_2016', '_2017', '_2018', '_2014-15', '_2015-16', '_2016-17', '_2017-18')

  df <- data.frame()
  for(i in possibilities){
    kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, fileType)

    for (j in kk){
      ############################
      if (file.exists(j) && nrow(eval(parse(text=howToRead))) != 0){
        # print(j)
        code <- sapply(strsplit(j, '/'), '[[', 8)
        sport_type <- sapply(strsplit(j, '/'), '[[', 7)

        temp <-eval(parse(text=howToRead))

        #renaming wrongly named columns so I could select them
        if("LINK" %in% colnames(temp)){
          names(temp)[names(temp) == 'LINK'] <- 'MATCH_LINK'
        }
        if("SEASON" %in% colnames(temp)){
          names(temp)[names(temp) == 'SEASON'] <- 'SEASON_PART'
        }
        if("HOME" %in% colnames(temp)){
          names(temp)[names(temp) == 'HOME'] <- 'HOME_TEAM'
        }
        if("AWAY" %in% colnames(temp)){
          names(temp)[names(temp) == 'AWAY'] <- 'AWAY_TEAM'
        }
        if("BC.Zaporizhya" %in% colnames(temp)){
          names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
        }
        if("SCORE" %in% colnames(temp)){
          names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
        }
        if("Sportcomplex.Olymp" %in% colnames(temp)){
          names(temp)[names(temp) == 'Sportcomplex.Olymp'] <- 'VENUE'
        }

        #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
        if(!"SEASON_PART" %in% colnames(temp)){
          temp$SEASON_PART = ''
        }
        if(!"VENUE" %in% colnames(temp)){
          temp$VENUE = ''
        }
        if(!"MATCH_LINK" %in% colnames(temp)){
          temp$MATCH_LINK = ''
        }

        temp$SEASON_PART <- as.character(temp$SEASON_PART)

        temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
          'code'        = code,
          'sport_type'  = sport_type,
          'DATE'        = as.character(temp[[1]]),
          'SEASON_PART' = temp$SEASON_PART,
          'MATCH_LINK'  = temp$MATCH_LINK,#this was not in the sport tables file
          'HOME_TEAM'   = temp$HOME_TEAM,
          'RESULT'      = temp$RESULT,
          'AWAY_TEAM'   = temp$AWAY_TEAM,
          'VENUE'       = temp$VENUE,
          'ATTENDANCE'  = temp$ATTENDANCE
        )
        df <- rbind(df, na.omit(temp))
        # df <- plyr::rbind.fill(df, na.omit(temp))
      }
    }
  }
  return(df)
}
################################ MAIN STUFF ################################
# install.packages('magrittr')
# install.packages('dplyr')
# install.packages('tidyverse')
# install.packages('readxl')
# install.packages('xlsx')


library(magrittr)
library(dplyr)
library(tidyverse)
library(readxl)
library(xlsx)

#listing all paths recursively
allPaths <- dir( "C:/Users/mindaugas.venskus/Documents/R/Sports", full.names = TRUE, recursive = TRUE )

#revome ~$ files
allPaths <- allPaths[ which( !grepl( '[~$]', allPaths) ) ]

#revome files from irrelevant folders
allPaths <- allPaths[ which( grepl( '-', sapply( strsplit(allPaths, '/'), '[[', 8 ) ) ) ]

################################ prices ################################

price <- getPaths('price', FALSE )# name of folder, if remove duplicates
prices <- gatherData( price, 'price' )# paths, switch_case to process them
rm(price)

################################ stadiums ################################

stadium <- getPaths('Stadiums', TRUE )
stadiums <- gatherData( stadium, 'stadium' )
rm(stadium)

################################ totals ################################

totals <- doTotals(allPaths)

################################ FIXED ################################

#generating all possible paths from allPaths variable
k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
fixed <- doFixed(allPaths, k)

################################ NOT FIXED ################################

# binding CSV and XLSX data together
notFixed <- rbind(doNotFixed(allPaths, '.csv', k, 'read.csv(j)'), doNotFixed(allPaths, '.xlsx', k, 'read_xlsx(j)'))
write.csv( notFixed, file = paste0('notFixed','.csv'), row.names=FALSE, na='' )