Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
library(stringi)
library(stringr)
# rebus supplies START, DOT, WRD, %R%, one_or_more(), ascii_upper() and or1(),
# all of which the pattern-building code later in this script uses.
library(rebus)

# Read the play, one element per line of the file.
earnest <- stri_read_lines("importance-of-being-earnest.txt")

# Locate the Project Gutenberg start and end marker lines.
start <- which(str_detect(earnest, pattern = "START OF THE PROJECT"))
end <- which(str_detect(earnest, pattern = "END OF THE PROJECT"))
if (length(start) == 0L || length(end) == 0L) {
  stop("Could not find the Project Gutenberg start/end markers.", call. = FALSE)
}
# If a marker shows up more than once, use its first occurrence explicitly.
start <- start[1]
end <- end[1]

# Keep only the text strictly between the two marker lines.
earnest_sub <- earnest[(start + 1):(end - 1)]

# Find the line announcing the first act.
lines_start <- which(str_detect(earnest_sub, pattern = "FIRST ACT"))
if (length(lines_start) == 0L) {
  stop("Could not find the \"FIRST ACT\" line.", call. = FALSE)
}
lines_start <- lines_start[1]
lines_start

# Index of everything before the first act. seq_len() gives an empty index
# (rather than c(1, 0)) when "FIRST ACT" is the very first line.
intro_line_index <- seq_len(lines_start - 1)

# Split into intro and play. The play is selected by positive index because
# negative indexing with an empty index would select nothing at all.
intro_text <- earnest_sub[intro_line_index]
play_text <- earnest_sub[lines_start:length(earnest_sub)]

# Take a look at the first 20 lines of the play.
writeLines(head(play_text, n = 20))

# The rest of the script reads `play_lines`, which was never created.
# NOTE(review): assumed to be the play text without empty lines - confirm.
play_lines <- play_text[!stri_isempty(play_text)]
# NOTE(review): `play_lines` must already exist when this section runs.

# pattern_1: start of line, one or more word characters, then a literal period.
pattern_1 <- START %R% one_or_more(WRD) %R% DOT

# Inspect the lines that do and do not match pattern_1.
str_view(play_lines, pattern = pattern_1, match = TRUE)
str_view(play_lines, pattern = pattern_1, match = FALSE)

# pattern_2: as pattern_1, but the first character must be an ASCII capital.
pattern_2 <- START %R% ascii_upper() %R% one_or_more(WRD) %R% DOT

# Inspect the lines that do and do not match pattern_2.
str_view(play_lines, pattern = pattern_2, match = TRUE)
str_view(play_lines, pattern = pattern_2, match = FALSE)

# Keep only the lines that match pattern_2.
lines <- str_subset(play_lines, pattern = pattern_2)

# Pull the matched prefix out of each kept line.
who <- str_extract(lines, pattern = pattern_2)

# Distinct prefixes found.
unique(who)
# Lines that begin with "Jack" followed by a literal period. DOT is used
# instead of a "." inside the string so the period is not a regex wildcard.
jack_pattern <- START %R% "Jack" %R% DOT
str_view(lines, pattern = jack_pattern)
str_subset(lines, pattern = jack_pattern)
Jlines <- str_subset(lines, pattern = jack_pattern)
length(Jlines)

# Split each of those lines on single spaces and count the pieces.
# lengths() always returns an integer vector, even when Jlines is empty.
JWords <- str_split(Jlines, pattern = fixed(" "))
WL <- lengths(JWords)
# The first piece is the "Jack." prefix itself, so leave it out of the count.
WL <- WL - 1
mean(WL)
# NOTE(review): `play_lines` must already exist when this section runs.

# Names to look for at the start of a line.
characters <- c(
  "Algernon", "Jack", "Lane", "Cecily", "Gwendolen", "Chasuble",
  "Merriman", "Lady Bracknell", "Miss Prism"
)

# pattern_3: start of line, any one of the names, then a literal period.
pattern_3 <- START %R% or1(characters) %R% DOT

# Inspect the lines that do and do not match pattern_3.
str_view(play_lines, pattern = pattern_3, match = TRUE)
str_view(play_lines, pattern = pattern_3, match = FALSE)

# Keep only the lines that match pattern_3.
lines <- str_subset(play_lines, pattern = pattern_3)

# Pull the matched "Name." prefix out of each kept line.
who <- str_extract(lines, pattern = pattern_3)

# Distinct prefixes found.
unique(who)

# Number of matching lines per prefix.
table(who)
Add Comment
Please, Sign In to add comment