Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2018
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.20 KB | None | 0 0
  1. library(tidyverse)
  2. library(tidytext)
  3.  
  4. # Load data ----
  5.  
  6. # Slightly different to https://juliasilge.com/blog/word-vectors-take-two/
  7. #  just because I have this data locally
  8.  
  9. austen_text <- janeaustenr::northangerabbey %>%
  10.   as_tibble() %>%
  11.   rename(text = value) %>%
  12.   mutate(text = str_replace_all(text, "&#x27;|&quot;|&#x2F;", "'"),   ## weird encoding
  13.          text = str_replace_all(text, "<a(.*?)>", " "),               ## links
  14.          text = str_replace_all(text, "&gt;|&lt;|&amp;", " "),        ## html yuck
  15.          text = str_replace_all(text, "&#[:digit:]+;", " "),          ## html yuck
  16.          text = str_replace_all(text, "<[^>]*>", " "),                ## mmmmm, more html yuck
  17.          text = str_to_lower(text),                                   ## BT EDIT
  18.          postID = row_number()) %>%                                   ## Actually more like line ID...
  19.   filter(text != "")
  20.  
  21. # Dr Silge ----
  22.  
  23. # From https://juliasilge.com/blog/tidy-word-vectors/
  24. make_windows_js <- function(tbl, doc_var, window_size) {
  25.   tbl %>%
  26.     unnest_tokens(ngram, !!doc_var, token = "ngrams", n = window_size) %>%
  27.     mutate(window_id = row_number()) %>%                      # Rename for consistency with other methods
  28.     unite(skipgramID, postID, window_id, remove = FALSE) %>%  # Added remove = F for comparison with other methods
  29.     unnest_tokens(word, ngram)
  30. }
  31.  
  32. # From https://juliasilge.com/blog/word-vectors-take-two/
  33. slide_windows_js <- function(tbl, doc_var, window_size) {
  34.   # each word gets a skipgram (window_size words) starting on the first
  35.   # e.g. skipgram 1 starts on word 1, skipgram 2 starts on word 2
  36.  
  37.   each_total <- tbl %>%
  38.     group_by(!!doc_var) %>%
  39.     mutate(doc_total = n(),
  40.            each_total = pmin(doc_total, window_size, na.rm = TRUE)) %>%
  41.     pull(each_total)
  42.  
  43.   rle_each <- rle(each_total)
  44.   counts <- rle_each[["lengths"]]
  45.   counts[rle_each$values != window_size] <- 1
  46.  
  47.   # each word get a skipgram window, starting on the first
  48.   # account for documents shorter than window
  49.   id_counts <- rep(rle_each$values, counts)
  50.   window_id <- rep(seq_along(id_counts), id_counts)
  51.  
  52.  
  53.   # within each skipgram, there are window_size many offsets
  54.   indexer <- (seq_along(rle_each[["values"]]) - 1) %>%
  55.     map2(rle_each[["values"]] - 1,
  56.          ~ seq.int(.x, .x + .y)) %>%
  57.     map2(counts, ~ rep(.x, .y)) %>%
  58.     flatten_int() +
  59.     window_id
  60.  
  61.   tbl[indexer, ] %>%
  62.     bind_cols(data_frame(window_id)) %>%
  63.     group_by(window_id) %>%
  64.     filter(n_distinct(!!doc_var) == 1) %>%
  65.     ungroup
  66. }
  67.  
  68. # Me ----
  69.  
  70. lag_words <- function(tbl, col, offset) {
  71.   # Adds a new column with a lagged output and appropriate
  72.   #  column name
  73.   colname <- paste0(quo_name(col), offset)
  74.  
  75.   tbl %>%
  76.     mutate(!!colname := lag(!!col, offset))
  77. }
  78.  
  79. create_window_wide <- function(tbl, col, window_size) {
  80.   # Adds columns containing previous words to create a window (in wide form)
  81.   max_offset <- window_size - 1
  82.  
  83.   # Create a list of functions (using map) and apply iteratively
  84.   #  over tbl
  85.   map(1:max_offset, .f = ~ function(t) lag_words(t, col, .x)) %>%
  86.     reduce(~ .y(.x), .init = tbl)
  87. }
  88.  
  89. slide_windows_bt <- function(tbl, word_var, doc_var, window_size) {
  90.   tbl %>%
  91.     # Add a marker to remove windows smaller than `window_size` later on
  92.     group_by(!!doc_var) %>%
  93.     mutate(word_position = row_number()) %>%
  94.     # Add lagged columns to tbl (still grouped)
  95.     create_window_wide(word_var, window_size) %>%
  96.     ungroup() %>%
  97.     mutate(window_id = row_number()) %>%
  98.     # Remove small windows
  99.     filter(word_position >= window_size) %>%
  100.     # Make tidy (wide -> long tbl)
  101.     gather(key = position, value = word, -(!!doc_var), -window_id, -word_position) %>%
  102.     # Remove unnecessary columns
  103.     select(-position, -word_position) %>%
  104.     arrange(window_id) %>%
  105.     # Undo reverse ordering caused by `gather` (make words read top to bottom)
  106.     group_by(window_id) %>%
  107.     arrange(desc(row_number()))
  108. }
  109.  
  110. # Create windows using each method ----
  111.  
  112. window_size <- 8
  113.  
  114. unnest_and_filter <- . %>%
  115.   unnest_tokens(word, text)
  116.  
  117. windows_js1 <- austen_text %>%
  118.   make_windows_js(quo(text), window_size)
  119.  
  120. windows_js2 <- austen_text %>%
  121.   unnest_and_filter() %>%
  122.   slide_windows_js(quo(postID), window_size)
  123.  
  124. windows_bt <- austen_text %>%
  125.   unnest_and_filter() %>%
  126.   slide_windows_bt(quo(word), quo(postID), window_size)
  127.  
  128. # Compare methods' windows ----
  129.  
  130. count_windows <- . %>%
  131.   group_by(postID) %>%
  132.   summarise(n = length(unique(window_id)))
  133.  
  134. comparison <- list(windows_js1, windows_js2, windows_bt) %>%
  135.   map(count_windows) %>%
  136.   reduce(full_join, by = c("postID"), suffix = c("_js1", "_js2")) %>%
  137.   rename(n_bt = n) %>%
  138.   mutate(match1 = (n_bt == n_js1),
  139.          match2 = (n_bt == n_js2),
  140.          match = match1 & match2)
  141.  
  142. # View lines which have a different number of windows
  143. #  between the two methods
  144. unmatched_id <- comparison %>%
  145.   # filter(n_js < n_bt) %>%
  146.   filter(!match) %>%
  147.   .$postID %>%
  148.   first()
  149.  
  150. austen_text %>%
  151.   filter(postID == unmatched_id)
  152.  
  153. windows_js1 %>%
  154.   filter(postID == unmatched_id)
  155.  
  156. windows_js2 %>%
  157.   filter(postID == unmatched_id)
  158.  
  159. windows_bt %>%
  160.   filter(postID == unmatched_id)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement