Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # ---
- # STEP 0: Preparations
- # Record the start time so the total run time can be reported at the end.
- start_time <- Sys.time()
- ## 1. Set working directory in R
- # NOTE(review): machine-specific absolute path. Every relative path used
- # further down (the .lst input files, "../R/dataset") resolves against it,
- # so adjust this line when running the script elsewhere.
- setwd("D:/Dropbox/Masterthesis Bram Vanroy/data/rawdata")
- ## 2. Load required library/libraries
- library(dplyr)
- library(mclm)
- library(stringi)
- ## 3. Create directory where we'll save our dataset(s)
- # recursive = TRUE also creates the parent "../R" when it is missing.
- # Without it dir.create() fails, and because showWarnings = FALSE hides the
- # warning, the failure would only surface when the datasets are written.
- dir.create("../R/dataset", showWarnings = FALSE, recursive = TRUE)
- # ---
- # STEP 1: Loop through files, get data from the filename
- ## 1. Create first dataframe, based on filename of all files
- # list.files() takes a regular expression, not a shell glob: anchor on a
- # literal ".lst" suffix so only files that really end in .lst are selected.
- files <- list.files(pattern = "\\.lst$", full.names = TRUE, recursive = FALSE)
- # basename() is vectorised and drops the leading "./" that full.names adds;
- # it also returns character(0) (not a list) when no file was found.
- d <- data.frame(fileName = basename(files), stringsAsFactors = FALSE)
- ## 2. Create additional columns (word & component) based on filename
- # node: everything before the first dot of the file name, lower-cased
- d$node <- tolower(sub("\\..+", "", d$fileName, perl = TRUE))
- # component: what is left after removing the leading "<node>." part and the
- # trailing ".lst" extension
- d$component <- gsub("^[^\\.]+\\.|\\.lst$", "", d$fileName, perl = TRUE)
- # ---
- # STEP 2: Loop through files again, but now also through its contents
- # In other words: get the sentences
- ## 1. Create second set which is an rbind of multiple frames
- ## One two-column data.frame per file
- ## First column is fileName, second column is data from each file
- # Fail early with a clear message instead of an obscure error further down.
- if (length(files) == 0L) {
-   stop("No .lst files found in ", getwd(), call. = FALSE)
- }
- e <- do.call(rbind, lapply(files, function(x) {
-   content <- readLines(x, encoding = "UTF-8")
-   # rep() keeps both columns the same length, so an empty file contributes
-   # a zero-row frame instead of making data.frame() fail on a 1-vs-0 mismatch
-   data.frame(
-     fileName = rep(x, length(content)),
-     sentence = content,
-     stringsAsFactors = FALSE
-   )
- }))
- ## 2. Clean fileName
- # Strip the leading "./" so the values line up with basename()-style names
- e$fileName <- sub("^\\.\\/", "", e$fileName, perl = TRUE)
- ## 3. Get the sentence and clean
- # Keep only what sits between <sentence> and </sentence>; lines without
- # these tags are left unchanged
- e$sentence <- gsub(".*?<sentence>(.*?)</sentence>", "\\1", e$sentence, perl = TRUE)
- e$sentence <- tolower(e$sentence)
- # Remove floating space before/after punctuation
- # (the pattern has no capture group, so the match is simply deleted)
- e$sentence <- gsub("\\s(?:(?=[.,:;?!) ])|(?<=\\( ))", "", e$sentence, perl = TRUE)
- # Add space after triple dots ...
- e$sentence <- gsub("\\.{3}(?=[^\\s])", "... ", e$sentence, perl = TRUE)
- # Transform HTML entities into characters
- # It is unfortunate that there's no easier way to do this
- # E.g. Python provides the HTML package which can unescape (decode) HTML
- # characters
- # NOTE(review): the entity names had been decoded away in the copy of this
- # script (leaving no-op/invalid patterns); &apos; / &#39; is the presumed
- # original spelling for the apostrophe - confirm against the raw data.
- e$sentence <- gsub("&apos;|&#39;", "'", e$sentence, perl = TRUE)
- # Avoid R from wrongly interpreting ", so replace by single quotes
- e$sentence <- gsub("&quot;|\"", "'", e$sentence, perl = TRUE)
- # Decode &amp; last so that text such as "&amp;quot;" is not decoded twice
- e$sentence <- gsub("&amp;", "&", e$sentence, perl = TRUE)
- # Get rid of some characters we can't use such as ³ and ¾
- e$sentence <- gsub("[^[:graph:]\\s]", "", e$sentence, perl = TRUE)
- # ---
- # STEP 3:
- # Create final dataframe
- ## 1. Merge d and e by common column name fileName
- df <- merge(d, e, by = "fileName", all = TRUE)
- ## 2. Make sure that only those sentences in which df$node is present in df$sentence are taken into account
- # %in% never returns NA (unlike any(x == y)), so rows whose sentence is NA
- # after the outer merge are dropped instead of turning into all-NA rows.
- matchFunction <- function(x, y) x %in% y
- tokens <- stri_split_regex(df$sentence, "[ :?.,]")
- # as.logical() keeps the index a logical vector even when df has no rows
- matchedFrame <- as.logical(
-   mapply(matchFunction, df$node, tokens, USE.NAMES = FALSE)
- )
- df <- df[matchedFrame, ]
- ## 3. Create leftContext based on the split of the word and the sentence
- # Use paste0 to make sure we are looking for the node, not a compound
- # node can only be preceded by a space, but can be followed by punctuation as well
- # \\Q...\\E makes PCRE read the node literally, so a node containing regex
- # metacharacters cannot break or change the pattern.
- # NOTE(review): the token filter above splits on [ :?.,] while this pattern
- # only accepts a space (or start of line) before the node - a node directly
- # preceded by other punctuation passes the filter but is not split here.
- contexts <- strsplit(
-   df$sentence,
-   paste0("(^| )\\Q", df$node, "\\E( |[!\",.:;?})\\]])"),
-   perl = TRUE
- )
- # First piece of each split = text to the left of the first node occurrence
- df$leftContext <- vapply(contexts, `[`, character(1), 1)
- ## 4. Get the word preceding the node
- # Last word (hyphenated compounds kept together) of the left context
- df$precedingWord <- gsub("^.*\\b(?<!-)(\\w+(?:-\\w+)*)[^\\w]*$","\\1", df$leftContext, perl=TRUE)
- ## 5. Improve readability by sorting columns
- df <- df[c("fileName", "component", "precedingWord", "node", "leftContext", "sentence")]
- ## 6. Write dataset to dataset dir
- write.dataset(df,"../R/dataset/r-dataset.csv")
- # ---
- # STEP 4:
- # Create dataset with frequencies
- ## 1. Define neuter and nonNeuter classes
- neuter <- c("het")
- non.neuter<- c("de")
- ## 2. Mutate df to fit into usable frame
- # gender is "neuter" / "non_neuter" when the preceding word is one of the
- # articles defined above, "unspecified" otherwise
- freq <- mutate(
-   df,
-   gender = ifelse(
-     precedingWord %in% neuter, "neuter",
-     ifelse(precedingWord %in% non.neuter, "non_neuter", "unspecified")
-   )
- )
- ## 3. Transform into table, but still usable as data frame (i.e. matrix)
- ## Also add column name "node"
- # Fixed factor levels guarantee one column per gender class (in this order)
- # even when a class does not occur in the data at all.
- gender_levels <- c("neuter", "non_neuter", "unspecified")
- freqTable <- table(freq$node, factor(freq$gender, levels = gender_levels)) %>%
-   as.data.frame.matrix %>%
-   mutate(node = row.names(.))
- ## 4. Small adjustements
- # Put node first; select by name rather than by position so the result does
- # not depend on how many gender columns the table happened to produce
- freqTable <- freqTable[, c("node", gender_levels)]
- ## 5. Write dataset to dataset dir
- write.dataset(freqTable,"../R/dataset/r-frequencies.csv")
- # Report the total run time (named `elapsed` to avoid masking base::diff)
- elapsed <- Sys.time() - start_time # calculate difference
- print(elapsed) # print in nice format
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement