commit 3460700d98c26b1e0e255a54e10be722489b9391 Author: unknown Date: Sun Apr 21 12:41:01 2013 -0500 Updated Twitter to use OAuth and fixed UTF conversion diff --git a/R/2_run.R b/R/2_run.R index 164c0cf..0f7e3cc 100644 --- a/R/2_run.R +++ b/R/2_run.R @@ -9,20 +9,18 @@ if (VERBOSE) # we do end up with lots of objects in memory to play with (it _is_ # a tutorial, after all :) -american.text = laply(american.tweets, function(t) t$getText() ) -delta.text = laply(delta.tweets, function(t) t$getText() ) -jetblue.text = laply(jetblue.tweets, function(t) t$getText() ) -southwest.text = laply(southwest.tweets, function(t) t$getText() ) -united.text = laply(united.tweets, function(t) t$getText() ) -us.text = laply(us.tweets, function(t) t$getText() ) - +american.text = laply(american.tweets, function(t) iconv(t$getText(), to="UTF8")) +delta.text = laply(delta.tweets, function(t) iconv(t$getText(), to="UTF8") ) +jetblue.text = laply(jetblue.tweets, function(t) iconv(t$getText(), to="UTF8") ) +southwest.text = laply(southwest.tweets, function(t) iconv(t$getText(), to="UTF8") ) +united.text = laply(united.tweets, function(t) iconv(t$getText(), to="UTF8") ) +us.text = laply(us.tweets, function(t) iconv(t$getText(), to="UTF8") ) american.scores = score.sentiment(american.text, pos.words, neg.words, .progress='text') delta.scores = score.sentiment(delta.text, pos.words, neg.words, .progress='text') jetblue.scores = score.sentiment(jetblue.text, pos.words, neg.words, .progress='text') southwest.scores = score.sentiment(southwest.text, pos.words, neg.words, .progress='text') united.scores = score.sentiment(united.text, pos.words, neg.words, .progress='text') us.scores = score.sentiment(us.text, pos.words, neg.words, .progress='text') - american.scores$airline = 'American' american.scores$code = 'AA' delta.scores$airline = 'Delta' diff --git a/R/scrape.R b/R/scrape.R index 56dcf2b..a138e92 100644 --- a/R/scrape.R +++ b/R/scrape.R @@ -9,23 +9,51 @@ if (VERBOSE) print("Searching Twitter for airline tweets and saving to disk") require(twitteR) - -american.tweets = searchTwitter('@americanair', n=1500) +library(RCurl) +library(ROAuth) + +#Need to make sure we have a caert + +if (!file.exists("cacert.pem")) + download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem") + +if (file.exists("twitter authentication.Rdata")){ + load("twitter authentication.Rdata") +} else +{ + requestURL <- "https://api.twitter.com/oauth/request_token" + accessURL = "http://api.twitter.com/oauth/access_token" + authURL = "http://api.twitter.com/oauth/authorize" + consumerKey = "FILLINWITHCONSUMERKEY" + consumerSecret = "FILLINWITHCONSUMERSECRET" + Cred <- OAuthFactory$new(consumerKey=consumerKey, + consumerSecret=consumerSecret, + requestURL=requestURL, + accessURL=accessURL, + authURL=authURL) + #The next command provides a URL which you will need to copy and paste into your favourite browser + #Assuming you are logged into Twitter you will then be provided a PIN number to type into the R command line + Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl") ) + # Checks that you are authorised + save(Cred, file="twitter authentication.Rdata") +} +registerTwitterOAuth(Cred) +american.tweets = searchTwitter('@americanair', n=1500, cainfo="cacert.pem") save(american.tweets, file=file.path(dataDir, 'american.tweets.RData' ), ascii=T) -delta.tweets = searchTwitter('@delta', n=1500) +delta.tweets = searchTwitter('@delta', n=1500, cainfo="cacert.pem") save(delta.tweets, file=file.path(dataDir, 'delta.tweets.RData' ), ascii=T) -jetblue.tweets = searchTwitter('@jetblue', n=1500) +jetblue.tweets = searchTwitter('@jetblue', n=1500, cainfo="cacert.pem") save(jetblue.tweets, file=file.path(dataDir, 'jetblue.tweets.RData' ), ascii=T) -southwest.tweets = searchTwitter('@southwestair', n=1500) +southwest.tweets = searchTwitter('@southwestair', n=1500, cainfo="cacert.pem") save(southwest.tweets, file=file.path(dataDir, 'southwest.tweets.RData' ), ascii=T) -united.tweets = searchTwitter('@united', n=1500) +united.tweets = searchTwitter('@united', n=1500, cainfo="cacert.pem") save(united.tweets, file=file.path(dataDir, 'united.tweets.RData' ), ascii=T) -us.tweets = searchTwitter('@usairways', n=1500) +us.tweets = searchTwitter('@usairways', n=1500, cainfo="cacert.pem") save(us.tweets, file=file.path(dataDir, 'us.tweets.RData' ), ascii=T) @@ -45,7 +73,7 @@ acsi.df = acsi.raw.df[,c(1,19)] colnames(acsi.df) = c('airline', 'score') # add codes for later matching, and make sure score is treated as a number (not a string) -acsi.df$code = c('WN', NA, NA, 'CO', 'AA', 'UA', 'US', 'DL', 'NW') +acsi.df$code = c('B6', 'WN', NA, NA, 'DL', 'US', 'AA', 'UA', NA, 'NW') acsi.df$score = as.numeric(acsi.df$score) save(acsi.raw.df, file=file.path(dataDir, 'acsi.raw.df.RData'), ascii=T) diff --git a/R/sentiment.R b/R/sentiment.R index 4b2be84..f389fff 100644 --- a/R/sentiment.R +++ b/R/sentiment.R @@ -14,7 +14,7 @@ score.sentiment = function(sentences, pos.words, neg.words, .progress='none') { require(plyr) require(stringr) - + # we got a vector of sentences. plyr will handle a list or a vector as an "l" for us # we want a simple array of scores back, so we use "l" + "a" + "ply" = laply: scores = laply(sentences, function(sentence, pos.words, neg.words) { diff --git a/data/acsi.df.RData b/data/acsi.df.RData index 97d6f46..8b7511a 100644 --- a/data/acsi.df.RData +++ b/data/acsi.df.RData @@ -1,115 +1,121 @@ RDA2 A 2 -134400 +196608 131840 1026 1 -9 +262153 7 acsi.df 787 3 16 -9 -9 +10 +262153 +7 +JetBlue +262153 9 Southwest -9 +262153 10 All\040Others -9 +262153 8 Airlines -9 -11 -Continental -9 +262153 +5 +Delta +262153 +10 +US\040Airways +262153 8 American -9 +262153 6 United -9 -10 -US\040Airways -9 -5 -Delta -9 +262153 +11 +Continental +262153 18 Northwest\040Airlines 14 -9 +10 +NA 81 76 65 -64 -63 +56 61 +63 61 -56 +64 NA 16 -9 -9 +10 +262153 +2 +B6 +262153 2 WN 9 -1 9 -1 -9 +262153 2 -CO -9 +DL +262153 +2 +US +262153 2 AA -9 +262153 2 UA 9 -2 -US -9 -2 -DL -9 +-1 +262153 2 NW 1026 1 -9 +262153 5 names 16 3 -9 +262153 7 airline -9 +262153 5 score -9 +262153 4 code 1026 1 -9 +262153 9 row.names 13 2 NA --9 +-10 1026 1 -9 +262153 5 class 16 1 -9 +262153 10 data.frame 254 diff --git a/data/acsi.raw.df.RData b/data/acsi.raw.df.RData index 546910e..c9e98ff 100644 --- a/data/acsi.raw.df.RData +++ b/data/acsi.raw.df.RData @@ -1,711 +1,844 @@ RDA2 A 2 -134400 +196608 131840 1026 1 -9 +262153 11 acsi.raw.df 787 -21 +23 16 -9 -9 +10 +262153 +7 +JetBlue +262153 9 Southwest -9 +262153 10 All\040Others -9 +262153 8 Airlines -9 -11 -Continental -9 +262153 +5 +Delta +262153 +10 +US\040Airways +262153 8 American -9 +262153 6 United -9 -10 -US\040Airways -9 -5 -Delta -9 +262153 +11 +Continental +262153 18 Northwest\040Airlines 16 -9 -9 +10 +262153 +2 +NM +262153 2 78 -9 +262153 2 NM -9 +262153 2 72 -9 +262153 2 -67 -9 +77 +262153 +2 +72 +262153 2 70 -9 +262153 2 71 -9 -2 -72 -9 +262153 2 -77 -9 +67 +262153 2 69 16 -9 -9 +10 +262153 +2 +NM +262153 2 76 -9 +262153 2 70 -9 +262153 2 69 -9 -2 -64 -9 +262153 2 -71 -9 +72 +262153 2 67 -9 +262153 +2 +71 +262153 2 67 -9 +262153 2 -72 -9 +64 +262153 2 71 16 -9 -9 +10 +262153 +2 +NM +262153 2 76 -9 +262153 2 74 -9 +262153 2 69 -9 +262153 +2 +67 +262153 2 66 -9 +262153 2 71 -9 +262153 2 70 -9 +262153 2 66 -9 -2 -67 -9 +262153 2 67 16 -9 -9 +10 +262153 +2 +NM +262153 2 76 -9 +262153 2 70 -9 +262153 2 67 -9 +262153 2 -64 -9 -2 -62 -9 +69 +262153 2 68 -9 +262153 +2 +62 +262153 2 68 -9 +262153 2 -69 -9 +64 +262153 2 64 16 -9 -9 +10 +262153 +2 +NM +262153 2 74 -9 +262153 2 62 -9 +262153 2 65 -9 -2 -66 -9 -2 -67 -9 +262153 2 65 -9 +262153 2 65 -9 +262153 +2 +67 +262153 2 65 -9 +262153 +2 +66 +262153 2 63 16 -9 -9 +10 +262153 +2 +NM +262153 2 72 -9 +262153 2 67 -9 +262153 2 63 -9 +262153 2 -64 -9 +68 +262153 +2 +61 +262153 2 64 -9 +262153 2 62 -9 -2 -61 -9 +262153 2 -68 -9 +64 +262153 2 53 16 -9 -9 +10 +262153 +2 +NM +262153 2 70 -9 +262153 2 63 -9 +262153 2 63 -9 +262153 +2 +66 +262153 2 62 -9 +262153 2 63 -9 +262153 2 62 -9 +262153 2 62 -9 -2 -66 -9 +262153 2 62 16 -9 -9 +10 +262153 +2 +NM +262153 2 70 -9 +262153 2 64 -9 +262153 2 61 -9 +262153 2 -67 -9 +61 +262153 +2 +60 +262153 2 62 -9 +262153 2 59 -9 +262153 2 -60 -9 -2 -61 -9 +67 +262153 2 56 16 -9 -9 +10 +262153 +2 +NM +262153 2 74 -9 +262153 2 72 -9 +262153 2 66 -9 +262153 2 -68 -9 +66 +262153 2 63 -9 -2 -64 -9 +262153 2 63 -9 +262153 2 -66 -9 +64 +262153 +2 +68 +262153 2 65 16 -9 -9 +10 +262153 +2 +NM +262153 2 75 -9 +262153 2 74 -9 +262153 2 67 -9 -2 -68 -9 +262153 2 67 -9 -2 -63 -9 +262153 2 64 -9 +262153 2 67 -9 +262153 +2 +63 +262153 +2 +68 +262153 2 64 16 -9 -9 +10 +262153 +2 +NM +262153 2 73 -9 +262153 2 73 -9 +262153 2 66 -9 +262153 2 67 -9 +262153 +2 +62 +262153 2 66 -9 +262153 2 64 -9 -2 -62 -9 +262153 2 67 -9 +262153 2 64 16 -9 -9 +10 +262153 +2 +NM +262153 2 74 -9 +262153 2 74 -9 +262153 2 66 -9 +262153 2 -70 -9 +65 +262153 +2 +57 +262153 2 64 -9 +262153 2 61 -9 -2 -57 -9 +262153 2 -65 -9 +70 +262153 2 64 16 -9 -9 +10 +262153 +2 +NM +262153 2 74 -9 +262153 2 74 -9 +262153 2 65 -9 +262153 2 -67 -9 +64 +262153 2 62 -9 -2 -63 -9 +262153 2 62 -9 +262153 2 -64 -9 +63 +262153 +2 +67 +262153 2 61 16 -9 -9 +10 +262153 +2 +NM +262153 2 76 -9 +262153 2 75 -9 +262153 2 63 -9 +262153 2 -69 -9 +59 +262153 +2 +61 +262153 2 60 -9 +262153 2 56 -9 -2 -61 -9 +262153 2 -59 -9 +69 +262153 2 61 16 -9 -9 +10 +262153 +2 +NM +262153 2 79 -9 +262153 2 75 -9 +262153 2 62 -9 +262153 2 -62 -9 +60 +262153 +2 +54 +262153 2 62 -9 +262153 2 56 -9 -2 -54 -9 +262153 2 -60 -9 +62 +262153 2 57 16 -9 -9 +10 +262153 +2 +NM +262153 2 81 -9 +262153 2 77 -9 +262153 2 64 -9 +262153 2 -68 -9 +64 +262153 +2 +59 +262153 2 60 -9 +262153 2 56 -9 -2 -59 -9 +262153 2 -64 -9 +68 +262153 2 57 16 -9 -9 +10 +262153 +2 +NM +262153 2 79 -9 +262153 2 75 -9 +262153 2 66 -9 +262153 2 -71 -9 +62 +262153 +2 +62 +262153 2 63 -9 +262153 2 60 -9 -2 -62 -9 +262153 2 -62 -9 +71 +262153 2 61 16 -9 -9 +10 +262153 +2 +NM +262153 2 81 -9 +262153 2 76 -9 +262153 2 65 -9 +262153 2 -64 -9 +56 +262153 +2 +61 +262153 2 63 -9 +262153 2 61 -9 +262153 2 -61 -9 +64 +262153 +1 +# +16 +10 +262153 2 -56 -9 +81 +262153 +2 +77 +262153 +2 +74 +262153 +2 +67 +262153 +2 +65 +262153 +2 +65 +262153 +2 +64 +262153 +2 +62 +262153 1 # +262153 +0 + 16 -9 -9 -3 -2.5 -9 +10 +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +262153 +0 + +16 +10 +262153 3 -1.3 -9 +N/A +262153 4 --1.5 -9 +-4.9 +262153 4 --9.9 -9 -3 -0.0 -9 +-2.6 +262153 3 -1.7 -9 -4 --1.6 -9 +3.1 +262153 4 --9.7 -9 +16.1 +262153 +3 +6.6 +262153 +3 +1.6 +262153 +3 +1.6 +262153 +3 +N/A +262153 3 N/A 16 -9 -9 +10 +262153 3 -3.8 -9 +N/A +262153 +4 +-1.3 +262153 3 -8.6 -9 +5.7 +262153 +4 +-6.9 +262153 +5 +-15.6 +262153 4 -9.7 -9 +262153 4 --4.5 -9 -5 --10.0 -9 -5 --14.1 -9 -5 --15.3 -9 +-8.6 +262153 5 --27.3 -9 +-12.7 +262153 +3 +N/A +262153 3 N/A 1026 1 -9 +262153 5 names 16 -21 -9 +23 +262153 0 -9 +262153 9 Base-line -9 +262153 2 95 -9 +262153 2 96 -9 +262153 2 97 -9 +262153 2 98 -9 +262153 2 99 -9 +262153 2 00 -9 +262153 2 01 -9 +262153 2 02 -9 +262153 2 03 -9 +262153 2 04 -9 +262153 2 05 -9 +262153 2 06 -9 +262153 2 07 -9 +262153 2 08 -9 +262153 2 09 -9 +262153 2 10 -9 +262153 2 11 -9 +262153 +2 +12 +262153 +2 +13 +262153 19 PreviousYear%Change -9 +262153 16 FirstYear%Change 1026 1 -9 +262153 9 row.names 13 2 NA --9 +-10 1026 1 -9 +262153 5 class 16 1 -9 +262153 10 data.frame 254 diff --git a/output/twitter_acsi_comparison.pdf b/output/twitter_acsi_comparison.pdf index fc66b1d..73b4c04 100644 --- a/output/twitter_acsi_comparison.pdf +++ b/output/twitter_acsi_comparison.pdf @@ -1,11 +1,12 @@ - airline - q - 80 q American - q Delta - q Southwest - q US Airways - q United - 75 + airline + q + 80 + q American + q Delta + q JetBlue + q Southwest + q United + 75 q US Airways @@ -19,16 +20,16 @@ score.acsi 65 - q + q - q q + q q 60 - q - - 40 45 50 55 60 65 70 - score.twitter + q + 55 + 40 60 80 100 + score.twitter \ No newline at end of file diff --git a/output/twitter_acsi_comparison_with_fit.pdf b/output/twitter_acsi_comparison_with_fit.pdf index 2905f0a..4a43e36 100644 --- a/output/twitter_acsi_comparison_with_fit.pdf +++ b/output/twitter_acsi_comparison_with_fit.pdf @@ -1,11 +1,12 @@ - airline - q - 80 q American - q Delta - q Southwest - q US Airways - q United - 75 + airline + q + 80 + q American + q Delta + q JetBlue + q Southwest + q United + 75 q US Airways @@ -19,16 +20,16 @@ score.acsi 65 - q + q - q q + q q 60 - q - - 40 45 50 55 60 65 70 - score.twitter + q + 55 + 40 60 80 100 + score.twitter \ No newline at end of file diff --git a/output/twitter_score_histograms.pdf b/output/twitter_score_histograms.pdf index 55f4562..7ce1a1d 100644 --- a/output/twitter_score_histograms.pdf +++ b/output/twitter_score_histograms.pdf @@ -1,66 +1,52 @@ + American + 1000 500 + 0 + 1000 - American - 400 - 300 - 200 - 100 - 0 + + Delta 500 - 400 + 0 + airline - Delta - 300 - 200 - 100 - 0 - 500 airline + JetBlue + 1000 American + 500 Delta +count - JetBlue - 400 - 300 American - 200 - 100 Delta - 0 -count + 0 JetBlue - JetBlue + Southwest + 1000 Southwest + 500 United + 0 US Airways - Southwest US Airways - 500 - 400 Southwest - 300 - 200 - 100 US Airways - 0 - United + + + United + 1000 500 - 400 - 300 - 200 - 100 0 + US Airways + 1000 500 - 400 United - 300 - 200 - 100 0 - −6 −4 −2 0 2 4 6 - score + −5.0 −2.5 0.0 2.5 5.0 7.5 + score \ No newline at end of file