Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(tidyverse)
- library(tidytext)
- # Load data ----
- # Slightly different to https://juliasilge.com/blog/word-vectors-take-two/
- # just because I have this data locally
- austen_text <- janeaustenr::northangerabbey %>%
- as_tibble() %>%
- rename(text = value) %>%
- mutate(text = str_replace_all(text, "'|"|/", "'"), ## weird encoding
- text = str_replace_all(text, "<a(.*?)>", " "), ## links
- text = str_replace_all(text, ">|<|&", " "), ## html yuck
- text = str_replace_all(text, "&#[:digit:]+;", " "), ## html yuck
- text = str_replace_all(text, "<[^>]*>", " "), ## mmmmm, more html yuck
- text = str_to_lower(text), ## BT EDIT
- postID = row_number()) %>% ## Actually more like line ID...
- filter(text != "")
- # Dr Silge ----
- # From https://juliasilge.com/blog/tidy-word-vectors/
- make_windows_js <- function(tbl, doc_var, window_size) {
- tbl %>%
- unnest_tokens(ngram, !!doc_var, token = "ngrams", n = window_size) %>%
- mutate(window_id = row_number()) %>% # Rename for consistency with other methods
- unite(skipgramID, postID, window_id, remove = FALSE) %>% # Added remove = F for comparison with other methods
- unnest_tokens(word, ngram)
- }
- # From https://juliasilge.com/blog/word-vectors-take-two/
- slide_windows_js <- function(tbl, doc_var, window_size) {
- # each word gets a skipgram (window_size words) starting on the first
- # e.g. skipgram 1 starts on word 1, skipgram 2 starts on word 2
- each_total <- tbl %>%
- group_by(!!doc_var) %>%
- mutate(doc_total = n(),
- each_total = pmin(doc_total, window_size, na.rm = TRUE)) %>%
- pull(each_total)
- rle_each <- rle(each_total)
- counts <- rle_each[["lengths"]]
- counts[rle_each$values != window_size] <- 1
- # each word get a skipgram window, starting on the first
- # account for documents shorter than window
- id_counts <- rep(rle_each$values, counts)
- window_id <- rep(seq_along(id_counts), id_counts)
- # within each skipgram, there are window_size many offsets
- indexer <- (seq_along(rle_each[["values"]]) - 1) %>%
- map2(rle_each[["values"]] - 1,
- ~ seq.int(.x, .x + .y)) %>%
- map2(counts, ~ rep(.x, .y)) %>%
- flatten_int() +
- window_id
- tbl[indexer, ] %>%
- bind_cols(data_frame(window_id)) %>%
- group_by(window_id) %>%
- filter(n_distinct(!!doc_var) == 1) %>%
- ungroup
- }
- # Me ----
- lag_words <- function(tbl, col, offset) {
- # Adds a new column with a lagged output and appropriate
- # column name
- colname <- paste0(quo_name(col), offset)
- tbl %>%
- mutate(!!colname := lag(!!col, offset))
- }
- create_window_wide <- function(tbl, col, window_size) {
- # Adds columns containing previous words to create a window (in wide form)
- max_offset <- window_size - 1
- # Create a list of functions (using map) and apply iteratively
- # over tbl
- map(1:max_offset, .f = ~ function(t) lag_words(t, col, .x)) %>%
- reduce(~ .y(.x), .init = tbl)
- }
- slide_windows_bt <- function(tbl, word_var, doc_var, window_size) {
- tbl %>%
- # Add a marker to remove windows smaller than `window_size` later on
- group_by(!!doc_var) %>%
- mutate(word_position = row_number()) %>%
- # Add lagged columns to tbl (still grouped)
- create_window_wide(word_var, window_size) %>%
- ungroup() %>%
- mutate(window_id = row_number()) %>%
- # Remove small windows
- filter(word_position >= window_size) %>%
- # Make tidy (wide -> long tbl)
- gather(key = position, value = word, -(!!doc_var), -window_id, -word_position) %>%
- # Remove unnecessary columns
- select(-position, -word_position) %>%
- arrange(window_id) %>%
- # Undo reverse ordering caused by `gather` (make words read top to bottom)
- group_by(window_id) %>%
- arrange(desc(row_number()))
- }
- # Create windows using each method ----
- window_size <- 8
- unnest_and_filter <- . %>%
- unnest_tokens(word, text)
- windows_js1 <- austen_text %>%
- make_windows_js(quo(text), window_size)
- windows_js2 <- austen_text %>%
- unnest_and_filter() %>%
- slide_windows_js(quo(postID), window_size)
- windows_bt <- austen_text %>%
- unnest_and_filter() %>%
- slide_windows_bt(quo(word), quo(postID), window_size)
- # Compare methods' windows ----
- count_windows <- . %>%
- group_by(postID) %>%
- summarise(n = length(unique(window_id)))
- comparison <- list(windows_js1, windows_js2, windows_bt) %>%
- map(count_windows) %>%
- reduce(full_join, by = c("postID"), suffix = c("_js1", "_js2")) %>%
- rename(n_bt = n) %>%
- mutate(match1 = (n_bt == n_js1),
- match2 = (n_bt == n_js2),
- match = match1 & match2)
- # View lines which have a different number of windows
- # between the two methods
- unmatched_id <- comparison %>%
- # filter(n_js < n_bt) %>%
- filter(!match) %>%
- .$postID %>%
- first()
- austen_text %>%
- filter(postID == unmatched_id)
- windows_js1 %>%
- filter(postID == unmatched_id)
- windows_js2 %>%
- filter(postID == unmatched_id)
- windows_bt %>%
- filter(postID == unmatched_id)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement