Advertisement
Guest User

Untitled

a guest
Dec 13th, 2018
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.78 KB | None | 0 0
  1.  
  2. ################################ ALL FUNCTIONS ################################
  3. # get paths and deal with duplicates
  4. getPaths <- function( getName, ifDpl ){
  5.  
  6. #all paths (with duplicates)
  7. all <- allPaths[ which( grepl( getName, allPaths, ignore.case=TRUE ) ) ]
  8.  
  9. if( ifDpl ){
  10.  
  11. #paths without names
  12. path <- unlist(strsplit( all, '[^/]+$' ))
  13. #unique values
  14. uniq <- !duplicated(path)
  15. #names
  16. names <- unlist(basename( all[uniq] ))
  17. #default name
  18. name <- paste0(getName,'.csv')
  19.  
  20.  
  21. all <- c( paste0( path[uniq], names ) )
  22.  
  23. if( length(path[!uniq]) != 0 )
  24. all <- c( all, paste0( path[!uniq], name ) )
  25.  
  26. }
  27.  
  28. return( all )
  29.  
  30. }
  31.  
  32. # gather data from .xlsx and .csv files to data.frame
  33. gatherData <- function( all, func ){
  34.  
  35. df <- data.frame()
  36.  
  37. for( i in all ){
  38. # print(i)
  39. isCsv <- tail( unlist(strsplit( i, '[.]' )), n=1 ) == 'csv'
  40.  
  41. gather <- function(name) {
  42. switch(name,
  43. price = do_price( i, isCsv ),
  44. stadium = do_stadiums( i, isCsv )
  45. )
  46. }
  47.  
  48. temp <- gather(func)
  49.  
  50. #if theres nothing to bind, break
  51. if( nrow(temp) == 0 ) next
  52.  
  53.  
  54. #mandatory fields with data from folder names
  55. temp[ , 'SPORT' ] <- sapply( strsplit(i, '/'), '[[', 7 )
  56. temp[ , 'CODE' ] <- sapply( strsplit(i, '/'), '[[', 8 )
  57.  
  58. df <- rbind(df, na.omit(temp))
  59.  
  60. }
  61. colnames(df)[3] <- 'AVERAGE.TICKET.PRICE'
  62. colnames(df)[5] <- 'AVERAGE.TICKET.PRICE_USD'
  63.  
  64. write.csv( df, file = paste0(func,'.csv'), row.names=FALSE, na='' )
  65. return(df)
  66. }
  67.  
  68. # process stadium table data
  69. do_stadiums <- function( dataSet, isCsv ){
  70.  
  71. temp <- data.frame()
  72. get <- c('NAME','CAPACITY','TEAM','COMMENT','LINK','DATE')
  73.  
  74. if (isCsv){
  75.  
  76. #get all files in this directory
  77. path <- unlist(strsplit( dataSet, '[^/]+$' ))
  78. all <- dir( path, full.names = TRUE, recursive = TRUE )
  79.  
  80. #filter fixed files and get DATE list
  81. filtered <- all[ which( grepl( '[0-9]{2}_f.', all ) ) ]
  82. date <- sapply( strsplit(filtered, '_'), '[[', 2 )
  83.  
  84. for (i in date)
  85. temp <- rbind( temp, na.omit(data.frame( read.csv(dataSet), DATE = i )[ , get ] ))
  86.  
  87. }
  88. else{
  89.  
  90. #get all sheet names
  91. sheetNames <- readxl::excel_sheets(dataSet)
  92.  
  93. for (j in sheetNames){
  94.  
  95. #check if sheet isn't empty
  96. num <- nrow( read_excel(dataSet, sheet = j) )
  97. if( num == 0 ) next
  98.  
  99. sheet <- data.frame( read_excel( dataSet, sheet = j ), 'DATE' = j )
  100.  
  101. #find missing columns and add them to data.frame
  102. missing <- get[ !is.element( get, names(sheet) ) ]
  103. sheet[,missing] <- NA
  104.  
  105. temp <- rbind( temp, na.omit(sheet[, get]) )
  106. }
  107. }
  108.  
  109. return(temp)
  110.  
  111. }
  112.  
  113. # process price table data
  114. do_price <- function( dataSet, isCsv ){
  115.  
  116. years <- c('2014', '2015', '2016', '2017', '2018', '2014-15', '2015-16', '2016-17', '2017-18')
  117.  
  118. temp <- data.frame()
  119.  
  120. if (isCsv){
  121. # no such files
  122. }
  123. else{
  124.  
  125. #get all sheet names
  126. sheetNames <- readxl::excel_sheets(dataSet)
  127. sheetNames <- years[ is.element( years, sheetNames ) ]
  128.  
  129. for (j in sheetNames){
  130. # print(j)
  131. #check if sheet isn't empty
  132. num <- nrow( read_excel(dataSet, sheet = j) ) #check first value. if empty skip?
  133. if( num == 0 ) next
  134.  
  135. ifUS <- grepl( 'US', basename( dataSet ) )
  136. datafr <- data.frame( read_excel( dataSet, sheet = j ))
  137.  
  138. temp <- rbind( temp, na.omit(data.frame(
  139. 'DATE' = j,
  140. 'CLUB' = datafr[[1]],
  141. if (typeof(datafr[[2]]) == "character"){
  142. 'AVERAGE.TICKET.PRICE' = datafr[[2]]
  143. }else{
  144. 'AVERAGE.TICKET.PRICE' = round(datafr[[2]], 2)
  145. },
  146. 'CURRENCY' = datafr[[3]],
  147. if (typeof(datafr[[ ifelse( ifUS, 2, 4 ) ]]) == "character"){
  148. 'AVERAGE.TICKET.PRICE_USD' = datafr[[ ifelse( ifUS, 2, 4 ) ]]
  149. }else{
  150. 'AVERAGE.TICKET.PRICE_USD' = round(datafr[[ ifelse( ifUS, 2, 4 ) ]], 2)
  151. },
  152. stringsAsFactors = FALSE
  153. )))
  154. }
  155. }
  156.  
  157. return(temp)
  158. }
  159.  
  160. # process totals table data
  161. doTotals <- function(allPaths){
  162.  
  163. k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
  164. #preparing full paths
  165.  
  166. k <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), '_totals_f.csv')
  167.  
  168. #paths that are wrong. NOTES
  169. # k <- k[!k %in% allPaths[allPaths %in% k]]
  170.  
  171. #taking paths that exist
  172. k <- allPaths[allPaths %in% k]
  173.  
  174. df <- data.frame()
  175.  
  176. for( i in k ){
  177. code <- sapply(strsplit(i, '/'), '[[', 8)
  178. sport_type <- sapply(strsplit(i, '/'), '[[', 7)
  179. temp <- read.csv(file=i)
  180.  
  181. #this step is important because at the begining R thinks that 2016 is a numeric value. Later for example if 2016-17 would be binded, R would import wrong data
  182. temp$YEAR = as.character(temp$YEAR)
  183.  
  184. #adding few more columns
  185. temp$code = code
  186. temp$sport_type = sport_type
  187.  
  188. #rbind(df, temp) wont work because of different columns in temp dataframes. plyr is cool
  189. df <- plyr::rbind.fill(df, na.omit(temp))
  190. }
  191. write.csv( df, file = paste0('totals','.csv'), row.names=FALSE, na='' )
  192. return(df)
  193. }
  194.  
  195. # process fixed table data
  196. doFixed <- function(allPaths, k){
  197. #all possible endings of each fixed and not fixed files
  198. possibilities_f = c('_2014_f', '_2015_f', '_2016_f', '_2017_f', '_2018_f', '_2014-15_f', '_2015-16_f', '_2016-17_f', '_2017-18_f')
  199.  
  200. df <- data.frame()
  201. for(i in possibilities_f){
  202. kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, '.csv')
  203.  
  204. for (j in kk){
  205. if (file.exists(j) && nrow(read.csv(file=j) != 0)){
  206. code <- sapply(strsplit(j, '/'), '[[', 8)
  207. sport_type <- sapply(strsplit(j, '/'), '[[', 7)
  208.  
  209. temp <- read.csv(file=j)
  210.  
  211. #renaming wrongly named columns so I could select them
  212. if("BC.Zaporizhya" %in% colnames(temp)){
  213. names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
  214. }
  215. if("SCORE" %in% colnames(temp)){
  216. names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
  217. }
  218.  
  219. #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
  220. if(!"MATCH_LINK" %in% colnames(temp)){
  221. temp$MATCH_LINK = ''
  222. }
  223.  
  224. temp$SEASON_PART <- as.character(temp$SEASON_PART)
  225. temp$MATCH_LINK <- as.character(temp$MATCH_LINK)
  226.  
  227. temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
  228. 'code' = code,
  229. 'sport_type' = sport_type,
  230. 'DATE' = temp[[1]],
  231. 'SEASON_PART' = temp$SEASON_PART,
  232. 'MATCH_LINK' = temp$MATCH_LINK,
  233. 'HOME_TEAM' = temp$HOME_TEAM,
  234. 'RESULT' = temp$RESULT,
  235. 'AWAY_TEAM' = temp$AWAY_TEAM,
  236. 'VENUE' = temp$VENUE,
  237. 'ATTENDANCE' = temp$ATTENDANCE,
  238. 'COMMENT' = temp$COMMENT,
  239. 'CAPACITY' = temp$CAPACITY
  240. )
  241. df <- rbind(df, na.omit(temp))
  242. # df <- plyr::rbind.fill(df, temp)
  243. }
  244. }
  245. }
  246. write.csv( df, file = paste0('fixed','.csv'), row.names=FALSE, na='' )
  247. return(df)
  248. }
  249.  
  250. # process not fixed table data
  251. doNotFixed <- function(allPaths, fileType, k, howToRead){
  252. #generating all possible paths from allPaths variable
  253. possibilities = c('_2014', '_2015', '_2016', '_2017', '_2018', '_2014-15', '_2015-16', '_2016-17', '_2017-18')
  254.  
  255. df <- data.frame()
  256. for(i in possibilities){
  257. kk <- paste0(k, '/', sapply(strsplit(k, '/'), '[[', 8), i, fileType)
  258.  
  259. for (j in kk){
  260. ############################
  261. if (file.exists(j) && nrow(eval(parse(text=howToRead))) != 0){
  262. # print(j)
  263. code <- sapply(strsplit(j, '/'), '[[', 8)
  264. sport_type <- sapply(strsplit(j, '/'), '[[', 7)
  265.  
  266. temp <-eval(parse(text=howToRead))
  267.  
  268. #renaming wrongly named columns so I could select them
  269. if("LINK" %in% colnames(temp)){
  270. names(temp)[names(temp) == 'LINK'] <- 'MATCH_LINK'
  271. }
  272. if("SEASON" %in% colnames(temp)){
  273. names(temp)[names(temp) == 'SEASON'] <- 'SEASON_PART'
  274. }
  275. if("HOME" %in% colnames(temp)){
  276. names(temp)[names(temp) == 'HOME'] <- 'HOME_TEAM'
  277. }
  278. if("AWAY" %in% colnames(temp)){
  279. names(temp)[names(temp) == 'AWAY'] <- 'AWAY_TEAM'
  280. }
  281. if("BC.Zaporizhya" %in% colnames(temp)){
  282. names(temp)[names(temp) == 'BC.Zaporizhya'] <- 'AWAY_TEAM'
  283. }
  284. if("SCORE" %in% colnames(temp)){
  285. names(temp)[names(temp) == 'SCORE'] <- 'RESULT'
  286. }
  287. if("Sportcomplex.Olymp" %in% colnames(temp)){
  288. names(temp)[names(temp) == 'Sportcomplex.Olymp'] <- 'VENUE'
  289. }
  290.  
  291. #if this column name doesnt exist then add it (it's needed in order to bind dataframes)
  292. if(!"SEASON_PART" %in% colnames(temp)){
  293. temp$SEASON_PART = ''
  294. }
  295. if(!"VENUE" %in% colnames(temp)){
  296. temp$VENUE = ''
  297. }
  298. if(!"MATCH_LINK" %in% colnames(temp)){
  299. temp$MATCH_LINK = ''
  300. }
  301.  
  302. temp$SEASON_PART <- as.character(temp$SEASON_PART)
  303.  
  304. temp <- data.frame(#is YEAR field realy needed here? Because date field is more informative
  305. 'code' = code,
  306. 'sport_type' = sport_type,
  307. 'DATE' = as.character(temp[[1]]),
  308. 'SEASON_PART' = temp$SEASON_PART,
  309. 'MATCH_LINK' = temp$MATCH_LINK,#this was not in the sport tables file
  310. 'HOME_TEAM' = temp$HOME_TEAM,
  311. 'RESULT' = temp$RESULT,
  312. 'AWAY_TEAM' = temp$AWAY_TEAM,
  313. 'VENUE' = temp$VENUE,
  314. 'ATTENDANCE' = temp$ATTENDANCE
  315. )
  316. df <- rbind(df, na.omit(temp))
  317. # df <- plyr::rbind.fill(df, na.omit(temp))
  318. }
  319. }
  320. }
  321. return(df)
  322. }
  323. ################################ MAIN STUFF ################################
  324. # install.packages('magrittr')
  325. # install.packages('dplyr')
  326. # install.packages('tidyverse')
  327. # install.packages('readxl')
  328. # install.packages('xlsx')
  329.  
  330.  
  331. library(magrittr)
  332. library(dplyr)
  333. library(tidyverse)
  334. library(readxl)
  335. library(xlsx)
  336.  
  337. #listing all paths recursively
  338. allPaths <- dir( "C:/Users/mindaugas.venskus/Documents/R/Sports", full.names = TRUE, recursive = TRUE )
  339.  
  340. #revome ~$ files
  341. allPaths <- allPaths[ which( !grepl( '[~$]', allPaths) ) ]
  342.  
  343. #revome files from irrelevant folders
  344. allPaths <- allPaths[ which( grepl( '-', sapply( strsplit(allPaths, '/'), '[[', 8 ) ) ) ]
  345.  
  346. ################################ prices ################################
  347.  
  348. price <- getPaths('price', FALSE )# name of folder, if remove duplicates
  349. prices <- gatherData( price, 'price' )# paths, switch_case to process them
  350. rm(price)
  351.  
  352. ################################ stadiums ################################
  353.  
  354. stadium <- getPaths('Stadiums', TRUE )
  355. stadiums <- gatherData( stadium, 'stadium' )
  356. rm(stadium)
  357.  
  358. ################################ totals ################################
  359.  
  360. totals <- doTotals(allPaths)
  361.  
  362. ################################ FIXED ################################
  363.  
  364. #generating all possible paths from allPaths variable
  365. k <- subset(unique(dirname(allPaths)), str_count(unique(dirname(allPaths)), '/') == 7)
  366. fixed <- doFixed(allPaths, k)
  367.  
  368. ################################ NOT FIXED ################################
  369.  
  370. # binding CSV and XLSX data together
  371. notFixed <- rbind(doNotFixed(allPaths, '.csv', k, 'read.csv(j)'), doNotFixed(allPaths, '.xlsx', k, 'read_xlsx(j)'))
  372. write.csv( notFixed, file = paste0('notFixed','.csv'), row.names=FALSE, na='' )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement