Guest User

Untitled

a guest
Jun 17th, 2018
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.20 KB | None | 0 0
  # Who speaks in "The Importance of Being Earnest"?
  #
  # Reads the Project Gutenberg text of the play, strips the Gutenberg
  # header/footer and everything before the first act, then uses regular
  # expressions built with rebus to find the speaker name at the start of
  # each line and to count lines per character.
  library(stringi)
  library(stringr)
  # rebus supplies START, DOT, WRD, %R%, one_or_more(), ascii_upper() and
  # or1(), all of which are used below.
  library(rebus)

  # Read play in using stri_read_lines()
  earnest <- stri_read_lines("importance-of-being-earnest.txt")

  # Detect start and end lines of the Gutenberg wrapper
  start <- which(str_detect(earnest, pattern = "START OF THE PROJECT"))
  end <- which(str_detect(earnest, pattern = "END OF THE PROJECT"))
  # Each marker has to occur exactly once with at least one line in between;
  # otherwise the range below would be built from the wrong line numbers.
  stopifnot(length(start) == 1L, length(end) == 1L, end - start > 1)

  # Get rid of gutenberg intro text
  earnest_sub <- earnest[(start + 1):(end - 1)]

  # Detect first act; keep only the first hit so the index is a single number
  lines_start <- which(str_detect(earnest_sub, pattern = "FIRST ACT"))[1]
  stopifnot(!is.na(lines_start))
  lines_start

  # Set up index; seq_len() gives an empty index when the act starts on line 1
  # (1:0 would give c(1, 0) instead)
  intro_line_index <- seq_len(lines_start - 1)

  # Split play into intro and play. The play is selected by position rather
  # than by negating intro_line_index, because negating an empty index selects
  # nothing instead of everything.
  intro_text <- earnest_sub[intro_line_index]
  play_text <- earnest_sub[lines_start:length(earnest_sub)]

  # Take a look at the first 20 lines
  writeLines(head(play_text, n = 20))

  # Everything below works on play_lines, which this script never defined.
  # Build it from play_text without the empty lines: an empty line cannot
  # match a pattern that needs a word character at the start, so dropping
  # them only removes noise from the non-match views.
  play_lines <- play_text[nzchar(play_text)]

  # Pattern for start word then .
  pattern_1 <- START %R% one_or_more(WRD) %R% DOT
  # Test pattern_1
  str_view(play_lines, pattern = pattern_1, match = TRUE)
  str_view(play_lines, pattern = pattern_1, match = FALSE)

  # Pattern for start, capital, word then .
  pattern_2 <- START %R% ascii_upper() %R% one_or_more(WRD) %R% DOT

  # View matches of pattern_2
  str_view(play_lines, pattern = pattern_2, match = TRUE)

  # View non-matches of pattern_2
  str_view(play_lines, pattern = pattern_2, match = FALSE)

  # Get subset of lines that match
  lines <- str_subset(play_lines, pattern = pattern_2)

  # Extract match from lines
  who <- str_extract(lines, pattern = pattern_2)

  # Let's see what we have
  unique(who)

  # Lines spoken by Jack. DOT is used for the period because a bare "." inside
  # the string would be a regex wildcard and match any character after "Jack".
  jack_pattern <- START %R% "Jack" %R% DOT
  str_view(lines, pattern = jack_pattern)
  str_subset(lines, pattern = jack_pattern)
  Jlines <- str_subset(lines, pattern = jack_pattern)
  length(Jlines)

  # Mean number of space-separated tokens per Jack line
  JWords <- str_split(Jlines, pattern = fixed(" "))
  # lengths() always returns an integer vector, even when JWords is empty
  WL <- lengths(JWords)
  # Do not count the leading "Jack." token
  WL <- WL - 1
  mean(WL)

  # Create vector of characters
  characters <- c(
    "Algernon", "Jack", "Lane", "Cecily", "Gwendolen", "Chasuble",
    "Merriman", "Lady Bracknell", "Miss Prism"
  )

  # Match start, then character name, then .
  pattern_3 <- START %R% or1(characters) %R% DOT

  # View matches of pattern_3
  str_view(play_lines, pattern = pattern_3, match = TRUE)

  # View non-matches of pattern_3
  str_view(play_lines, pattern = pattern_3, match = FALSE)

  # Pull out matches
  lines <- str_subset(play_lines, pattern = pattern_3)

  # Extract match from lines
  who <- str_extract(lines, pattern = pattern_3)

  # Unique characters
  unique(who)

  # Count lines per character
  table(who)
Add Comment
Please, Sign In to add comment