Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Compare the vocabulary of two text corpora (Twain vs. Doyle) with sparklyr.
# Expects mark_twain.txt and arthur_doyle.txt in the current working directory.
# NOTE(review): gutenbergr is not used by the visible script — presumably it
# was used to download the two text files; confirm before removing.
library(gutenbergr)
library(sparklyr)
library(dplyr)

# Local Spark session; every Spark table below is created through `sc`.
sc <- spark_connect(master = "local")

# Register each text file as a Spark table ("twain" / "doyle"). Later steps
# read the text from a column named `line`.
twain_path <- paste0("file:///", getwd(), "/mark_twain.txt")
twain <- spark_read_text(sc, "twain", twain_path)

doyle_path <- paste0("file:///", getwd(), "/arthur_doyle.txt")
doyle <- spark_read_text(sc, "doyle", doyle_path)
# Stack both corpora into one table, tag every row with its author, and drop
# empty lines.
all_words <- doyle %>%
  mutate(author = "doyle") %>%
  sdf_bind_rows({
    twain %>%
      mutate(author = "twain")
  }) %>%
  filter(nchar(line) > 0)

# Replace punctuation characters with a space so they do not stay attached to
# words when the lines are split later on.
all_words <- all_words %>%
  mutate(line = regexp_replace(line, "[_\"\'():;,.!?\\-]", " "))
# Split each line into a list column of tokens.
all_words <- all_words %>%
  ft_tokenizer(
    input_col = "line",
    output_col = "word_list"
  )

head(all_words, 4)

# Drop stop words from each token list.
all_words <- all_words %>%
  ft_stop_words_remover(
    input_col = "word_list",
    output_col = "wo_stop_words"
  )

# One row per remaining word; keep only words longer than two characters.
all_words <- all_words %>%
  mutate(word = explode(wo_stop_words)) %>%
  select(word, author) %>%
  filter(nchar(word) > 2)

# Preview the result once (the original printed the same preview twice).
head(all_words, 4)

# Materialize the pipeline as the Spark table "all_words" so the queries
# below do not re-run the tokenizing steps.
all_words <- all_words %>%
  compute("all_words")
# Occurrences of every word per author, most frequent first.
word_count <- all_words %>%
  group_by(author, word) %>%
  tally() %>%
  # tally() only peels off the last grouping level; drop the leftover
  # `author` grouping so later verbs operate on the whole table.
  ungroup() %>%
  arrange(desc(n))

word_count
# Words that appear in the Doyle corpus but never in the Twain corpus,
# cached as the Spark table "doyle_unique".
doyle_unique <- filter(word_count, author == "doyle") %>%
  anti_join(filter(word_count, author == "twain"), by = "word") %>%
  arrange(desc(n)) %>%
  compute("doyle_unique")

doyle_unique

# Word cloud of the 100 most frequent Doyle-only words.
# Sort again before head(): the row order of a materialized table is not
# guaranteed, so relying on the arrange() done before compute() could pick
# an arbitrary 100 rows instead of the top 100.
doyle_unique %>%
  arrange(desc(n)) %>%
  head(100) %>%
  collect() %>%
  with(wordcloud::wordcloud(
    word,
    n,
    colors = c("#999999", "#E69F00", "#56B4E9", "#56B4E9")
  ))
# "sherlock" should be absent from Twain's word list if it is Doyle-only;
# count how often it shows up there.
all_words %>%
  filter(
    author == "twain",
    word == "sherlock"
  ) %>%
  tally()

# Cross-check against the raw Twain text: return every line that contains
# "sherlock" as a substring. lower() and instr() are not R functions here;
# they are passed through to Spark SQL.
twain %>%
  mutate(line = lower(line)) %>%
  filter(instr(line, "sherlock") > 0) %>%
  pull(line)

# Close the Spark connection opened at the top of the script so the local
# Spark process does not keep running after the analysis.
spark_disconnect(sc)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement