Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ################################ ALL FUNCTIONS ################################
- # get paths and deal with duplicates
- getPaths <- function( getName, ifDpl ){
- #all paths (with duplicates)
- all <- allPaths[ which( grepl( getName, allPaths, ignore.case=TRUE ) ) ]
- if( ifDpl ){
- #paths without names
- path <- unlist(strsplit( all, '[^/]+$' ))
- #unique values
- uniq <- !duplicated(path)
- #names
- names <- unlist(basename( all[uniq] ))
- #default name
- name <- paste0(getName,'.csv')
- all <- c( paste0( path[uniq], names ) )
- if( length(path[!uniq]) != 0 )
- all <- c( all, paste0( path[!uniq], name ) )
- }
- return( all )
- }
- # gather data from .xlsx and .csv files to data.frame
- gatherData <- function( all, func ){
- df <- data.frame()
- for( i in all ){
- # print(i)
- isCsv <- tail( unlist(strsplit( i, '[.]' )), n=1 ) == 'csv'
- gather <- function(name) {
- switch(name,
- price = do_price( i, isCsv ),
- stadium = do_stadiums( i, isCsv )
- )
- }
- temp <- gather(func)
- #if theres nothing to bind, break
- if( nrow(temp) == 0 ) next
- #mandatory fields with data from folder names
- temp[ , 'SPORT' ] <- sapply( strsplit(i, '/'), '[[', 7 )
- temp[ , 'CODE' ] <- sapply( strsplit(i, '/'), '[[', 8 )
- df <- rbind(df, na.omit(temp))
- }
- colnames(df)[3] <- 'AVERAGE.TICKET.PRICE'
- colnames(df)[5] <- 'AVERAGE.TICKET.PRICE_USD'
- write.csv( df, file = paste0(func,'.csv'), row.names=FALSE, na='' )
- return(df)
- }
- # process stadium table data
- do_stadiums <- function( dataSet, isCsv ){
- temp <- data.frame()
- get <- c('NAME','CAPACITY','TEAM','COMMENT','LINK','DATE')
- if (isCsv){
- #get all files in this directory
- path <- unlist(strsplit( dataSet, '[^/]+$' ))
- all <- dir( path, full.names = TRUE, recursive = TRUE )
- #filter fixed files and get DATE list
- filtered <- all[ which( grepl( '[0-9]{2}_f.', all ) ) ]
- date <- sapply( strsplit(filtered, '_'), '[[', 2 )
- for (i in date)
- temp <- rbind( temp, na.omit(data.frame( read.csv(dataSet), DATE = i )[ , get ] ))
- }
- else{
- #get all sheet names
- sheetNames <- readxl::excel_sheets(dataSet)
- for (j in sheetNames){
- #check if sheet isn't empty
- num <- nrow( read_excel(dataSet, sheet = j) )
- if( num == 0 ) next
- sheet <- data.frame( read_excel( dataSet, sheet = j ), 'DATE' = j )
- #find missing columns and add them to data.frame
- missing <- get[ !is.element( get, names(sheet) ) ]
- sheet[,missing] <- NA
- temp <- rbind( temp, na.omit(sheet[, get]) )
- }
- }
- return(temp)
- }
- # process price table data
- do_price <- function( dataSet, isCsv ){
- years <- c('2014', '2015', '2016', '2017', '2018', '2014-15', '2015-16', '2016-17', '2017-18')
- temp <- data.frame()
- if (isCsv){
- # no such files
- }
- else{
- #get all sheet names
- sheetNames <- readxl::excel_sheets(dataSet)
- sheetNames <- years[ is.element( years, sheetNames ) ]
- for (j in sheetNames){
- # print(j)
- #check if sheet isn't empty
- num <- nrow( read_excel(dataSet, sheet = j) ) #check first value. if empty skip?
- if( num == 0 ) next
- ifUS <- grepl( 'US', basename( dataSet ) )
- datafr <- data.frame( read_excel( dataSet, sheet = j ))
- temp <- rbind( temp, na.omit(data.frame(
- 'DATE' = j,
- 'CLUB' = datafr[[1]],
- if (typeof(datafr[[2]]) == "character"){
- 'AVERAGE.TICKET.PRICE' = datafr[[2]]
- }else{
- 'AVERAGE.TICKET.PRICE' = round(datafr[[2]], 2)
- },
- 'CURRENCY' = datafr[[3]],
- if (typeof(datafr[[ ifelse( ifUS, 2, 4 ) ]]) == "character"){
- 'AVERAGE.TICKET.PRICE_USD' = datafr[[ ifelse( ifUS, 2, 4 ) ]]
- }else{
- 'AVERAGE.TICKET.PRICE_USD' = round(datafr[[ ifelse( ifUS, 2, 4 ) ]], 2)
- },
- stringsAsFactors = FALSE
- )))
- }
- }
- return(temp)
- }
- # process totals table data
- doTotals <- function(allPaths){
- k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
- #preparing full paths
- k <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), '_totals_f.csv')
- #paths that are wrong. NOTES
- # k <- k[!k %in% allPaths[allPaths %in% k]]
- #taking paths that exist
- k <- allPaths[allPaths %in% k]
- df <- data.frame()
- for( i in k ){
- code <- sapply(strsplit(i, '/'), '[[', 8)
- sport_type <- sapply(strsplit(i, '/'), '[[', 7)
- temp <- read.csv(file=i)
- #this step is important because at the begining R thinks that 2016 is a numeric value. Later for example if 2016-17 would be binded, R would import wrong data
- temp$YEAR = as.character(temp$YEAR)
- #adding few more columns
- temp$code = code
- temp$sport_type = sport_type
- #rbind(df, temp) wont work because of different columns in temp dataframes. plyr is cool
- df <- plyr::rbind.fill(df, na.omit(temp))
- }
- write.csv( df, file = paste0('totals','.csv'), row.names=FALSE, na='' )
- return(df)
- }
- # process fixed table data
- doFixed <- function(allPaths, k){
- #all possible endings of each fixed and not fixed files
- possibilities_f = c('_2014_f', '_2015_f', '_2016_f', '_2017_f', '_2018_f', '_2014-15_f', '_2015-16_f', '_2016-17_f', '_2017-18_f')
- df <- data.frame()
- for(i in possibilities_f){
- kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, '.csv')
- for (j in kk){
- if (file.exists(j) && nrow(read.csv(file=j) != 0)){
- code <- sapply(strsplit(j, '/'), '[[', 8)
- sport_type <- sapply(strsplit(j, '/'), '[[', 7)
- temp <- read.csv(file=j)
- #renaming wrongly named columns so I could select them
- if("BC.Zaporizhya" %in% colnames(temp)){
- names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
- }
- if("SCORE" %in% colnames(temp)){
- names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
- }
- #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
- if(!"MATCH_LINK" %in% colnames(temp)){
- temp$MATCH_LINK = ''
- }
- temp$SEASON_PART <- as.character(temp$SEASON_PART)
- temp$MATCH_LINK <- as.character(temp$MATCH_LINK)
- temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
- 'code' = code,
- 'sport_type' = sport_type,
- 'DATE' = temp[[1]],
- 'SEASON_PART' = temp$SEASON_PART,
- 'MATCH_LINK' = temp$MATCH_LINK,
- 'HOME_TEAM' = temp$HOME_TEAM,
- 'RESULT' = temp$RESULT,
- 'AWAY_TEAM' = temp$AWAY_TEAM,
- 'VENUE' = temp$VENUE,
- 'ATTENDANCE' = temp$ATTENDANCE,
- 'COMMENT' = temp$COMMENT,
- 'CAPACITY' = temp$CAPACITY
- )
- df <- rbind(df, na.omit(temp))
- # df <- plyr::rbind.fill(df, temp)
- }
- }
- }
- write.csv( df, file = paste0('fixed','.csv'), row.names=FALSE, na='' )
- return(df)
- }
- # process not fixed table data
- doNotFixed <- function(allPaths, fileType, k, howToRead){
- #generating all possible paths from allPaths variable
- possibilities = c('_2014', '_2015', '_2016', '_2017', '_2018', '_2014-15', '_2015-16', '_2016-17', '_2017-18')
- df <- data.frame()
- for(i in possibilities){
- kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, fileType)
- for (j in kk){
- ############################
- if (file.exists(j) && nrow(eval(parse(text=howToRead))) != 0){
- # print(j)
- code <- sapply(strsplit(j, '/'), '[[', 8)
- sport_type <- sapply(strsplit(j, '/'), '[[', 7)
- temp <-eval(parse(text=howToRead))
- #renaming wrongly named columns so I could select them
- if("LINK" %in% colnames(temp)){
- names(temp)[names(temp) == 'LINK'] <- 'MATCH_LINK'
- }
- if("SEASON" %in% colnames(temp)){
- names(temp)[names(temp) == 'SEASON'] <- 'SEASON_PART'
- }
- if("HOME" %in% colnames(temp)){
- names(temp)[names(temp) == 'HOME'] <- 'HOME_TEAM'
- }
- if("AWAY" %in% colnames(temp)){
- names(temp)[names(temp) == 'AWAY'] <- 'AWAY_TEAM'
- }
- if("BC.Zaporizhya" %in% colnames(temp)){
- names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
- }
- if("SCORE" %in% colnames(temp)){
- names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
- }
- if("Sportcomplex.Olymp" %in% colnames(temp)){
- names(temp)[names(temp) == 'Sportcomplex.Olymp'] <- 'VENUE'
- }
- #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
- if(!"SEASON_PART" %in% colnames(temp)){
- temp$SEASON_PART = ''
- }
- if(!"VENUE" %in% colnames(temp)){
- temp$VENUE = ''
- }
- if(!"MATCH_LINK" %in% colnames(temp)){
- temp$MATCH_LINK = ''
- }
- temp$SEASON_PART <- as.character(temp$SEASON_PART)
- temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
- 'code' = code,
- 'sport_type' = sport_type,
- 'DATE' = as.character(temp[[1]]),
- 'SEASON_PART' = temp$SEASON_PART,
- 'MATCH_LINK' = temp$MATCH_LINK,#this was not in the sport tables file
- 'HOME_TEAM' = temp$HOME_TEAM,
- 'RESULT' = temp$RESULT,
- 'AWAY_TEAM' = temp$AWAY_TEAM,
- 'VENUE' = temp$VENUE,
- 'ATTENDANCE' = temp$ATTENDANCE
- )
- df <- rbind(df, na.omit(temp))
- # df <- plyr::rbind.fill(df, na.omit(temp))
- }
- }
- }
- return(df)
- }
- ################################ MAIN STUFF ################################
- # install.packages('magrittr')
- # install.packages('dplyr')
- # install.packages('tidyverse')
- # install.packages('readxl')
- # install.packages('xlsx')
- library(magrittr)
- library(dplyr)
- library(tidyverse)
- library(readxl)
- library(xlsx)
- #listing all paths recursively
- allPaths <- dir( "C:/Users/mindaugas.venskus/Documents/R/Sports", full.names = TRUE, recursive = TRUE )
- #revome ~$ files
- allPaths <- allPaths[ which( !grepl( '[~$]', allPaths) ) ]
- #revome files from irrelevant folders
- allPaths <- allPaths[ which( grepl( '-', sapply( strsplit(allPaths, '/'), '[[', 8 ) ) ) ]
- ################################ prices ################################
- price <- getPaths('price', FALSE )# name of folder, if remove duplicates
- prices <- gatherData( price, 'price' )# paths, switch_case to process them
- rm(price)
- ################################ stadiums ################################
- stadium <- getPaths('Stadiums', TRUE )
- stadiums <- gatherData( stadium, 'stadium' )
- rm(stadium)
- ################################ totals ################################
- totals <- doTotals(allPaths)
- ################################ FIXED ################################
- #generating all possible paths from allPaths variable
- k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
- fixed <- doFixed(allPaths, k)
- ################################ NOT FIXED ################################
- # binding CSV and XLSX data together
- notFixed <- rbind(doNotFixed(allPaths, '.csv', k, 'read.csv(j)'), doNotFixed(allPaths, '.xlsx', k, 'read_xlsx(j)'))
- write.csv( notFixed, file = paste0('notFixed','.csv'), row.names=FALSE, na='' )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement