Advertisement
Guest User

Untitled

a guest
Oct 16th, 2019
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.71 KB | None | 0 0
  # Attach the packages used by this script. gutenbergr is attached here but
  # is not called anywhere in the visible code.
  library(gutenbergr)
  library(sparklyr)
  library(dplyr)

  # Open a Spark connection against a local master.
  sc <- spark_connect(master = "local")

  # Both input files are addressed relative to the current working directory,
  # so build the shared "file:///<cwd>" prefix once.
  base_uri <- paste0("file:///", getwd())

  # Read each author's text file into Spark under its own table name.
  twain_path <- paste0(base_uri, "/mark_twain.txt")
  twain <- spark_read_text(sc, "twain", twain_path)

  doyle_path <- paste0(base_uri, "/arthur_doyle.txt")
  doyle <- spark_read_text(sc, "doyle", doyle_path)

  # Stack the two corpora into one Spark DataFrame, tagging every row with
  # its author, and drop lines that are empty.
  all_words <- doyle %>%
    mutate(author = "doyle") %>%
    sdf_bind_rows(mutate(twain, author = "twain")) %>%
    filter(nchar(line) > 0)

  # Replace the listed punctuation characters with a space so they do not
  # end up inside the tokens produced below.
  all_words <- all_words %>%
    mutate(line = regexp_replace(line, "[_\"\'():;,.!?\\-]", " "))

  # Tokenize each line into the list column `word_list`.
  all_words <- all_words %>%
    ft_tokenizer(
      input_col = "line",
      output_col = "word_list"
    )

  # Preview the tokenized rows.
  head(all_words, 4)

  # Remove stop words from each token list, writing `wo_stop_words`.
  all_words <- all_words %>%
    ft_stop_words_remover(
      input_col = "word_list",
      output_col = "wo_stop_words"
    )

  # Expand the remaining tokens to one row per word, keep only the word and
  # its author, and discard words of one or two characters.
  all_words <- all_words %>%
    mutate(word = explode(wo_stop_words)) %>%
    select(word, author) %>%
    filter(nchar(word) > 2)

  # Preview the exploded rows (a single preview is enough; each call to
  # head() triggers its own Spark query).
  head(all_words, 4)

  # Materialise the result in Spark under the name "all_words" so later
  # queries do not recompute the whole pipeline.
  all_words <- all_words %>%
    compute("all_words")

  # Count how often each word occurs per author, most frequent first.
  # tally() only drops the last grouping level, so the result would still be
  # grouped by `author`; ungroup() keeps that grouping from leaking into the
  # verbs applied to word_count later on.
  word_count <- all_words %>%
    group_by(author, word) %>%
    tally() %>%
    ungroup() %>%
    arrange(desc(n))

  word_count

  # Split the word counts into one slice per author.
  doyle_counts <- filter(word_count, author == "doyle")
  twain_counts <- filter(word_count, author == "twain")

  # Keep Doyle's words that have no matching word among Twain's counts, order
  # them by descending frequency, and materialise the result in Spark under
  # the name "doyle_unique".
  doyle_unique <- doyle_counts %>%
    anti_join(twain_counts, by = "word") %>%
    arrange(desc(n)) %>%
    compute("doyle_unique")

  doyle_unique

  # Bring the first 100 rows into R and draw them as a word cloud, using the
  # counts as word frequencies.
  top_doyle_words <- doyle_unique %>%
    head(100) %>%
    collect()

  wordcloud::wordcloud(
    top_doyle_words$word,
    top_doyle_words$n,
    colors = c("#999999", "#E69F00", "#56B4E9","#56B4E9")
  )

  # Count the rows of all_words attributed to Twain whose word is "sherlock".
  twain_sherlock_words <- filter(
    all_words,
    author == "twain",
    word == "sherlock"
  )
  tally(twain_sherlock_words)

  # Lower-case Twain's raw lines, keep those containing "sherlock", and
  # return the matching (lower-cased) lines as an R vector.
  twain_lowercase <- mutate(twain, line = lower(line))
  twain_sherlock_lines <- filter(twain_lowercase, instr(line, "sherlock") > 0)
  pull(twain_sherlock_lines, line)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement