Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---
- title: "warmup07-vincent-chen"
- author: "Vincent Chen"
- date: "November 13, 2018"
- output: html_document
- ---
- ```{r setup, include=FALSE}
- # Set the default chunk option so that every chunk echoes its source code.
- knitr::opts_chunk$set(echo = TRUE)
- ```
- ## 1) Number of characters per tweet
- ```{r}
- # Load the tweet data set from the working directory.
- data <- read.csv(file = "text_emotion.csv", header = TRUE, sep = ",")
- # Number of characters in each tweet. as.character() keeps this correct
- # whether `content` was read as a factor or as a character column.
- num_char <- nchar(as.character(data$content))
- summary(num_char)
- # Create a histogram with bins of width 5.
- # hist() has no `w` argument, so the bin width is set through `breaks`.
- # The last break is rounded up to a multiple of the bin width so that the
- # longest tweet is covered, and at least one full bin is always produced.
- bin_width <- 5
- bin_upper <- max(bin_width, bin_width * ceiling(max(num_char) / bin_width))
- bin_breaks <- seq(0, bin_upper, by = bin_width)
- hist(num_char, breaks = bin_breaks, xlab = "Number of Characters", main = "Frequency of Characters per Tweet")
- # Are there any tweets with 0 characters?
- sum(num_char == 0)
- # Are there any tweets with one character?
- # 1. How many?
- one_char <- which(num_char == 1)
- length(one_char)
- # 2. Display its content
- data$content[one_char]
- # 3. What is its location (i.e. index or position)?
- one_char
- ```
- ## 2) Sentiment
- ```{r}
- # What are the different types of sentiments (i.e. categories)?
- # factor() makes this work for both a factor and a character column;
- # levels() on a plain character vector returns NULL.
- levels(factor(data$sentiment))
- # Compute the frequencies (counts) of each sentiment and display them.
- sentiment_count <- table(data$sentiment)
- sentiment_count
- # Graph the relative frequencies with a horizontal barplot in decreasing
- # order, including the names of the sentiment types.
- # A horizontal barplot draws its first bar at the bottom, so sorting in
- # increasing order places the most frequent sentiment at the top.
- sentiment_count <- sort(sentiment_count)
- sentiment_count <- sentiment_count / sum(sentiment_count)
- barplot(sentiment_count, horiz = TRUE, names.arg = names(sentiment_count), las = 1, xlab = "Relative Frequency", main = "Relative Frequency of Sentiment Types")
- # Sentiment and length of tweets: compute a table with the average number
- # of characters per sentiment (e.g. the average length of neutral tweets,
- # of happy tweets, ...) and display this table.
- avg_sentiment <- aggregate(num_char, by = list(data$sentiment), FUN = mean)
- names(avg_sentiment) <- c("sentiment", "mean length")
- avg_sentiment
- ```
- ## 3) Author (usernames)
- ```{r}
- data$author <- as.character(data$author)
- # Length of every username, computed once and reused below.
- author_len <- nchar(data$author)
- # No longer than 15 characters; if longer, display them.
- # TRUE means that no username exceeds 15 characters.
- sum(author_len > 15) == 0
- # The offending usernames (empty when the check above is TRUE).
- data$author[author_len > 15]
- # Contain only alphanumeric characters and underscores (if a username
- # contains other symbols, display it).
- data$author[!grepl("^[A-Za-z0-9_]+$", data$author, perl = TRUE)]
- # What is the number of characters of the shortest usernames?
- min_author_len <- min(author_len)
- min_author_len
- # What are the names of these authors?
- # Compare against the computed minimum instead of a hard-coded length.
- data$author[author_len == min_author_len]
- ```
- ## 4) Various Symbols and Strings
- ```{r}
- # Character copy of the tweet text, so the string functions below behave
- # the same whether `content` is a factor or a character column.
- tweets <- as.character(data$content)
- # How many tweets contain at least one "^"?
- sum(grepl("\\^", tweets))
- # How many contain three or more consecutive dollar symbols "$"?
- # An unescaped "$" is the end-of-string anchor, so the dollar sign is
- # escaped once and repeated with an interval quantifier.
- sum(grepl("\\${3,}", tweets))
- # How many tweets do NOT contain the characters "a" or "A"?
- no_a <- !grepl("[aA]", tweets)
- sum(no_a)
- # Display the first 10 tweets that do NOT contain the characters "a" or "A".
- head(tweets[no_a], 10)
- # Number of exclamation symbols "!": compute a vector with the number of
- # exclamation symbols in each tweet, and display its summary().
- # regmatches() returns an empty vector for tweets without a match, so
- # lengths() yields 0 for them.
- num_excl <- lengths(regmatches(tweets, gregexpr("!", tweets, fixed = TRUE)))
- summary(num_excl)
- # What is the tweet (content) with the largest number of exclamation
- # symbols? Display its content.
- tweets[which.max(num_excl)]
- # How many tweets contain the individual strings "omg" or "OMG"?
- # Only these two exact spellings are counted; mixed casings such as "Omg"
- # are not.
- sum(grepl("omg|OMG", tweets))
- ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement