# ---
# STEP 0: Preparations
  start_time <- Sys.time()

  ## 1. Set working directory in R
    setwd("D:/Dropbox/Masterthesis Bram Vanroy/data/rawdata")

  ## 2. Load the required libraries
    library(dplyr)
    library(mclm)
    library(stringi)

  ## 3. Create the directory where we'll save our dataset(s)
    dir.create("../R/dataset", showWarnings = FALSE)


# ---
# STEP 1: Loop through the files, get data from the filenames

  ## 1. Create the first data frame, based on the filenames of all files
    files <- list.files(pattern = "\\.lst$", full.names = TRUE, recursive = FALSE)
    d <- data.frame(fileName = unname(sapply(files, basename)), stringsAsFactors = FALSE)

  ## 2. Create additional columns (node & component) based on the filename
    d$node <- sub("\\..+", "", d$fileName, perl = TRUE)
    d$node <- tolower(d$node)
    d$component <- gsub("^[^\\.]+\\.|\\.lst$", "", d$fileName, perl = TRUE)
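    # Illustration with a made-up filename of the assumed "node.component.lst"
    # form (the real corpus filenames may differ):
    #   sub("\\..+", "", "huis.WR-P-E-A.lst", perl = TRUE)                  # "huis"
    #   gsub("^[^\\.]+\\.|\\.lst$", "", "huis.WR-P-E-A.lst", perl = TRUE)   # "WR-P-E-A"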

# ---
# STEP 2: Loop through the files again, but now also through their contents
# In other words: get the sentences

  ## 1. Create a second data frame which is an rbind of multiple frames:
  ## one two-column data.frame per file, where the first column is the fileName
  ## and the second column holds one line of data from that file
    e <- do.call(rbind, lapply(files, function(x) {
        data.frame(fileName = x, sentence = readLines(x, encoding = "UTF-8"), stringsAsFactors = FALSE)
    }))
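    # The same frame could also be built with dplyr::bind_rows() (dplyr is already
    # loaded); kept here only as an optional, commented-out alternative:
    # e <- bind_rows(lapply(files, function(x) {
    #     data.frame(fileName = x, sentence = readLines(x, encoding = "UTF-8"),
    #                stringsAsFactors = FALSE)
    # }))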

  ## 2. Clean fileName: strip the leading "./" that list.files() prepends
    e$fileName <- sub("^\\.\\/", "", e$fileName, perl = TRUE)

  ## 3. Get the sentence and clean it
    e$sentence <- gsub(".*?<sentence>(.*?)</sentence>", "\\1", e$sentence, perl = TRUE)
    e$sentence <- tolower(e$sentence)
    # Remove floating space before/after punctuation
    # (the pattern has no capturing group, so the matched space is simply deleted)
    e$sentence <- gsub("\\s(?:(?=[.,:;?!) ])|(?<=\\( ))", "", e$sentence, perl = TRUE)
    # Add a space after triple dots ...
    e$sentence <- gsub("\\.{3}(?=[^\\s])", "... ", e$sentence, perl = TRUE)
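    # Quick illustration on made-up strings of what the two substitutions above do:
    #   "hij zei : ja ( misschien ) ."  ->  "hij zei: ja (misschien)."
    #   "ja...nee"                      ->  "ja... nee"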

    # Transform HTML entities into characters
    # It is unfortunate that there's no easier way to do this in base R
    # (e.g. Python provides an html module that can unescape, i.e. decode,
    # HTML entities)
    e$sentence <- gsub("&apos;", "'", e$sentence, perl = TRUE)
    e$sentence <- gsub("&amp;", "&", e$sentence, perl = TRUE)
    # Keep R from wrongly interpreting ", so replace double quotes by single quotes
    e$sentence <- gsub("&quot;|\"", "'", e$sentence, perl = TRUE)
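    # Since stringi is already loaded, the three gsub() calls above could also be
    # collapsed into a single call; an optional sketch, not used below:
    # e$sentence <- stri_replace_all_fixed(e$sentence,
    #                                      c("&apos;", "&amp;", "&quot;", "\""),
    #                                      c("'", "&", "'", "'"),
    #                                      vectorize_all = FALSE)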

    # Get rid of some characters we can't use, such as ³ and ¾
    e$sentence <- gsub("[^[:graph:]\\s]", "", e$sentence, perl = TRUE)


# ---
# STEP 3:
# Create the final data frame

  ## 1. Merge d and e by their common column fileName
    df <- merge(d, e, by = "fileName", all = TRUE)

  ## 2. Keep only the rows in which df$node occurs as a separate token in df$sentence
    matchFunction <- function(x, y) any(x == y)
    matchedFrame <- with(df, mapply(matchFunction, node, stri_split_regex(sentence, "[ :?.,]")))
    df <- df[matchedFrame, ]
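    # Illustration with made-up values: for node "huis" a sentence like
    # "het huis is groot" is kept (the token "huis" occurs), whereas
    # "de huisarts komt" is dropped, since the comparison is token by token:
    #   matchFunction("huis", stri_split_regex("de huisarts komt", "[ :?.,]")[[1]])   # FALSE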

  ## 3. Create leftContext by splitting the sentence on the node
    # Use paste0 so we look for the node itself, not a compound containing it:
    # the node can only be preceded by a space (or the start of the sentence),
    # but it can be followed by punctuation as well
    contexts <- strsplit(df$sentence, paste0("(^| )", df$node, "( |[!\",.:;?})\\]])"), perl = TRUE)
    df$leftContext <- sapply(contexts, `[`, 1)
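    # Illustration with a made-up sentence: for node "huis" and sentence
    # "ik zie het huis, zei hij" the pattern matches " huis," and the first
    # element of the split, "ik zie het", becomes the leftContext.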

  ## 4. Get the word preceding the node
    df$precedingWord <- gsub("^.*\\b(?<!-)(\\w+(?:-\\w+)*)[^\\w]*$", "\\1", df$leftContext, perl = TRUE)
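    # Illustration: a made-up leftContext "ik zie het" yields precedingWord "het".
    # The (?<!-) lookbehind plus (?:-\\w+)* keep a hyphenated final word such as
    # "zwart-wit" together instead of returning only "wit".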

  ## 5. Improve readability by sorting the columns
    df <- df[c("fileName", "component", "precedingWord", "node", "leftContext", "sentence")]

  ## 6. Write the dataset to the dataset dir
    # write.dataset() is presumably provided by the mclm package loaded above; it is not base R
    write.dataset(df, "../R/dataset/r-dataset.csv")


# ---
# STEP 4:
# Create a dataset with frequencies

  ## 1. Define the neuter and non-neuter classes (the Dutch definite articles)
    neuter <- c("het")
    non.neuter <- c("de")

  ## 2. Mutate df into a usable frame: classify each row by the article that precedes the node
    freq <- mutate(df, gender = ifelse(!df$precedingWord %in% c(neuter, non.neuter), "unspecified",
      ifelse(df$precedingWord %in% neuter, "neuter", "non_neuter")))
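    # Illustration: precedingWord "het" -> gender "neuter", "de" -> "non_neuter",
    # and anything else (e.g. a made-up "een") -> "unspecified".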

  ## 3. Transform into a table, but keep it usable as a data frame (i.e. a matrix)
  ## Also add a column named "node", taken from the row names
    freqTable <- table(freq$node, freq$gender) %>%
      as.data.frame.matrix %>%
      mutate(node = row.names(.))
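    # Note: freqTable now has one count column per gender value that actually occurs
    # in the data (neuter, non_neuter, unspecified) plus the node column; the
    # reordering in the next step assumes all four columns are present.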

  ## 4. Small adjustment: put the node column first
    freqTable <- freqTable[, c(4, 1:3)]

  ## 5. Write the dataset to the dataset dir
    write.dataset(freqTable, "../R/dataset/r-frequencies.csv")


    diff <- Sys.time() - start_time # calculate how long the whole script took
    print(diff) # print it in a nice format