Advertisement
Guest User

Untitled

a guest
Sep 22nd, 2019
162
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.55 KB | None | 0 0
  1. library(stringr)
  2. library(dplyr)
  3. library(purrr)
  4. library(rlang)
  5. library(text2vec) # for movie reviews data
  6.  
  7. # Prefer stringr over base R: stringr propagates NA inputs as NA, whereas the
  8. # base R equivalent returns a zero-length result. Base R version for reference:
  9. # regmatches(string, regexpr(x, string, ignore.case = ignore_case))
  #' Extract a pattern together with its surrounding characters
  #'
  #' Builds the regular expression `.{0,n_before}(?:pattern).{0,n_after}` and
  #' extracts it from `string` with stringr.
  #'
  #' @param string Character vector to search.
  #' @param pattern Regular expression to look for. It is wrapped in a
  #'   non-capturing group so that alternations such as `"good|bad"` keep their
  #'   context on both sides.
  #' @param n_before,n_after Maximum number of characters to keep before and
  #'   after the match.
  #' @param ignore_case If `TRUE` (the default), match case-insensitively.
  #' @param first If `TRUE` (the default), extract only the first match per
  #'   element with `stringr::str_extract()`; otherwise extract every match
  #'   with `stringr::str_extract_all()`.
  #' @return The result of `stringr::str_extract()` (a character vector, `NA`
  #'   where nothing matched) when `first = TRUE`, or of
  #'   `stringr::str_extract_all()` (a list of character vectors) otherwise.
  pattern_context <- function(string, pattern, n_before = 10, n_after = 10,
                              ignore_case = TRUE, first = TRUE) {
    # Render counts without scientific notation: paste0(1e5) would give "1e+05",
    # which is not a valid regex quantifier bound.
    as_count <- function(n) format(n, scientific = FALSE, trim = TRUE)

    # Group the user's pattern so that "a|b" does not split the whole expression
    # into ".{0,n}a" OR "b.{0,n}".
    context_regex <- stringr::regex(
      pattern = paste0(
        ".{0,", as_count(n_before), "}",
        "(?:", pattern, ")",
        ".{0,", as_count(n_after), "}"
      ),
      ignore_case = ignore_case
    )

    if (first) {
      stringr::str_extract(string, context_regex)
    } else {
      stringr::str_extract_all(string, context_regex)
    }
  }
  19.  
  #' Add a `<column>_context` column for each selected column of a data frame
  #'
  #' Applies `pattern_context()` to every selected column, names each result
  #' `<column>_context`, binds the results onto `.data`, and returns all columns
  #' ordered by column name.
  #'
  #' @param .data A data frame or tibble.
  #' @param ... Arguments forwarded to `pattern_context()` (for example the
  #'   pattern to search for).
  #' @param cols Tidyselect specification of the columns to search. Defaults to
  #'   every column.
  #' @return `.data` with the additional `_context` columns, columns sorted by
  #'   name.
  pattern_context_df <- function(.data, ..., cols = everything()) {
    selected <- dplyr::select(.data, {{ cols }})

    context_cols <- dplyr::bind_cols(purrr::map(selected, pattern_context, ...))
    names(context_cols) <- paste0(names(context_cols), "_context")

    combined <- dplyr::bind_cols(.data, context_cols)

    # List-style indexing always returns a data frame; `combined[, j]` would
    # collapse a one-column base data.frame to a bare vector.
    combined[order(colnames(combined))]
  }
  31.  
  32. # Create context variables one at a time: first "bad" / "good" match in each review
  33. movie_review %>%
  34. as_tibble() %>%
  35. mutate(
  36. bad_context = pattern_context(review, "bad"),
  37. good_context = pattern_context(review, "good"),
  38. ) %>%
  39. select(review:good_context)
  40. #> # A tibble: 5,000 x 3
  41. #> review bad_context good_context
  42. #> <chr> <chr> <chr>
  43. #> 1 With all this stuff going down at ~ drugs are bad m'~ <NA>
  44. #> 2 "\\\"The Classic War of the Worlds~ <NA> <NA>
  45. #> 3 The film starts with a manager (Ni~ oments is badly ~ ivers the goods w~
  46. #> 4 "It must be assumed that those who~ <NA> ng of the Good Fr~
  47. #> 5 "Superbly trashy and wondrously un~ <NA> <NA>
  48. #> 6 I dont know why people think this ~ is such a bad mo~ " a pretty good p~
  49. #> 7 This movie could have been very go~ "me really bad m~ been very good, b~
  50. #> 8 I watched this video at a friend's~ cience is bad, a~ <NA>
  51. #> 9 A friend of mine bought this film ~ <NA> <NA>
  52. #> 10 "<br /><br />This movie is full of~ <NA> <NA>
  53. #> # ... with 4,990 more rows
  54.  
  55. # Create a <column>_context variable for every column (cols defaults to everything())
  56. movie_review %>%
  57. as_tibble() %>%
  58. select(id, review) %>%
  59. pattern_context_df("good")
  60. #> # A tibble: 5,000 x 4
  61. #> id id_context review review_context
  62. #> <chr> <chr> <chr> <chr>
  63. #> 1 5814_8 <NA> With all this stuff going down at~ <NA>
  64. #> 2 2381_9 <NA> "\\\"The Classic War of the World~ <NA>
  65. #> 3 7759_3 <NA> The film starts with a manager (N~ ivers the goods wi~
  66. #> 4 3630_4 <NA> "It must be assumed that those wh~ ng of the Good Fri~
  67. #> 5 9495_8 <NA> "Superbly trashy and wondrously u~ <NA>
  68. #> 6 8196_8 <NA> I dont know why people think this~ " a pretty good pl~
  69. #> 7 7166_2 <NA> This movie could have been very g~ been very good, bu~
  70. #> 8 10633~ <NA> I watched this video at a friend'~ <NA>
  71. #> 9 319_1 <NA> A friend of mine bought this film~ <NA>
  72. #> 10 8713_~ <NA> "<br /><br />This movie is full o~ <NA>
  73. #> # ... with 4,990 more rows
  74.  
  75. # Create a context variable only for the columns named in cols (here: review)
  76. movie_review %>%
  77. as_tibble() %>%
  78. select(id, review) %>%
  79. pattern_context_df("good", cols = review)
  80. #> # A tibble: 5,000 x 3
  81. #> id review review_context
  82. #> <chr> <chr> <chr>
  83. #> 1 5814_8 With all this stuff going down at the mome~ <NA>
  84. #> 2 2381_9 "\\\"The Classic War of the Worlds\\\" by ~ <NA>
  85. #> 3 7759_3 The film starts with a manager (Nicholas B~ ivers the goods wit~
  86. #> 4 3630_4 "It must be assumed that those who praised~ ng of the Good Frid~
  87. #> 5 9495_8 "Superbly trashy and wondrously unpretenti~ <NA>
  88. #> 6 8196_8 I dont know why people think this is such ~ " a pretty good plo~
  89. #> 7 7166_2 This movie could have been very good, but ~ been very good, but~
  90. #> 8 10633_1 I watched this video at a friend's house. ~ <NA>
  91. #> 9 319_1 A friend of mine bought this film for 1, a~ <NA>
  92. #> 10 8713_10 "<br /><br />This movie is full of referen~ <NA>
  93. #> # ... with 4,990 more rows
  94.  
  95. # Create a context variable for every column except those negated in cols (here: all but id)
  96. movie_review %>%
  97. as_tibble() %>%
  98. select(id, review) %>%
  99. pattern_context_df("good", cols = -id)
  100. #> # A tibble: 5,000 x 3
  101. #> id review review_context
  102. #> <chr> <chr> <chr>
  103. #> 1 5814_8 With all this stuff going down at the mome~ <NA>
  104. #> 2 2381_9 "\\\"The Classic War of the Worlds\\\" by ~ <NA>
  105. #> 3 7759_3 The film starts with a manager (Nicholas B~ ivers the goods wit~
  106. #> 4 3630_4 "It must be assumed that those who praised~ ng of the Good Frid~
  107. #> 5 9495_8 "Superbly trashy and wondrously unpretenti~ <NA>
  108. #> 6 8196_8 I dont know why people think this is such ~ " a pretty good plo~
  109. #> 7 7166_2 This movie could have been very good, but ~ been very good, but~
  110. #> 8 10633_1 I watched this video at a friend's house. ~ <NA>
  111. #> 9 319_1 A friend of mine bought this film for 1, a~ <NA>
  112. #> 10 8713_10 "<br /><br />This movie is full of referen~ <NA>
  113. #> # ... with 4,990 more rows
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement