Untitled

---
title: "warmup07-vincent-chen"
author: "Vincent Chen"
date: "November 13, 2018"
output: html_document
---

```{r setup, include=FALSE}
# Make `echo = TRUE` the default chunk option so each chunk's source code is
# shown alongside its output in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
```

## 1) Number of characters per tweet
```{r}
# Load the tweets; `data` and `num_char` are reused by the later chunks.
data <- read.csv(file = "text_emotion.csv", header = TRUE, sep = ",")

# nchar() needs a character vector, so coerce in case `content` was read in
# as a factor.
num_char <- nchar(as.character(data$content))
summary(num_char)

# Histogram of tweet lengths with bins of width 5.
# hist() has no bin-width argument, so the bin edges are given explicitly via
# `breaks`. The upper edge is rounded up to the next multiple of the width so
# that the longest tweet still falls inside the last bin.
bin_width <- 5
upper_edge <- max(
  bin_width,
  bin_width * ceiling(max(num_char, na.rm = TRUE) / bin_width)
)
hist(
  num_char,
  breaks = seq(0, upper_edge, by = bin_width),
  xlab = "Number of Characters",
  main = "Frequency of Characters per Tweet"
)

# Are there any tweets with 0 characters?
sum(num_char == 0)

# Are there any tweets with exactly one character?
one_char <- which(num_char == 1)

# 1. How many?
length(one_char)

# 2. Display their content
data$content[one_char]

# 3. What is their location (i.e. index or position)?
one_char
```

## 2) Sentiment
```{r}
# What are the different types of sentiments (i.e. categories)?
# Coerce to a factor explicitly: levels() returns NULL on a plain character
# column, which is what read.csv() produces when strings are not converted to
# factors.
levels(factor(data$sentiment))

# Compute the frequencies (counts) of each sentiment and display them.
sentiment_count <- table(data$sentiment)
sentiment_count

# Graph the relative frequencies with a horizontal barplot in decreasing
# order, including the names of the sentiment types.
# A horizontal barplot draws its first bar at the bottom, so sorting in
# ascending order puts the most frequent sentiment at the top.
sentiment_freq <- sentiment_count[order(sentiment_count)] / sum(sentiment_count)
barplot(
  sentiment_freq,
  horiz = TRUE,
  names.arg = names(sentiment_freq),
  las = 1,
  xlab = "Relative Frequency",
  main = "Relative Frequency of Sentiment Types"
)

# Sentiment and length of tweets: compute a table with the average number of
# characters per sentiment (e.g. for neutral tweets, happy tweets, ...) and
# display it. `num_char` comes from the first chunk.
avg_sentiment <- aggregate(num_char, by = list(data$sentiment), FUN = mean)
names(avg_sentiment) <- c("sentiment", "mean length")
avg_sentiment

```


## 3) Author (usernames)
```{r}
# Work with usernames as plain strings (they may have been read as a factor).
data$author <- as.character(data$author)
author_len <- nchar(data$author)

# Are all usernames at most 15 characters long? TRUE means none is longer.
too_long <- author_len > 15
sum(too_long) == 0

# If any are longer, display them (empty when the check above is TRUE).
data$author[too_long]

# Usernames should contain only alphanumeric characters and underscores;
# display any username that contains other symbols.
data$author[!grepl("^[A-Za-z0-9_]+$", data$author, perl = TRUE)]

# What is the number of characters of the shortest usernames?
shortest_len <- min(author_len)
shortest_len

# What are the names of these authors? Use the computed minimum rather than a
# hard-coded length so the answer stays correct if the data changes.
data$author[author_len == shortest_len]
```

## 4) Various Symbols and Strings
```{r}
# Tweet text as plain strings (it may have been read as a factor).
tweets <- as.character(data$content)

# How many tweets contain at least one "^"?
sum(grepl("\\^", tweets))

# How many contain three or more consecutive dollar symbols "$"?
# The "$" must be escaped (an unescaped "$" is the end-of-string anchor) and
# the repetition is expressed with the {3,} quantifier.
sum(grepl("\\${3,}", tweets))

# How many tweets do NOT contain the characters "a" or "A"?
no_a <- !grepl("[aA]", tweets)
sum(no_a)

# Display the first 10 tweets that do NOT contain the characters "a" or "A".
head(tweets[no_a], 10)

# Number of exclamation symbols "!": compute a vector with the number of
# exclamation symbols in each tweet, and display its summary().
# gregexpr() locates every "!" in a tweet, regmatches() extracts them, and
# lengths() counts them (0 for tweets without any).
num_excl <- lengths(regmatches(tweets, gregexpr("!", tweets, fixed = TRUE)))
summary(num_excl)

# What is the tweet with the largest number of exclamation symbols?
# Display its content (the first such tweet if there is a tie).
tweets[which.max(num_excl)]

# How many tweets contain the individual strings "omg" or "OMG"?
# NOTE(review): "individual strings" is read here as standalone words, hence
# the \\b word boundaries; drop them to also count substrings such as "omgg".
sum(grepl("\\b(omg|OMG)\\b", tweets))
```