Advertisement
Guest User

Untitled

a guest
Jul 25th, 2016
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.42 KB | None | 0 0
  1.  
  2. dothis <- function (a,b,c,candidatename,myURL) {
  3. #file_loc <- "C:/Data/" #for testing
  4. file_loc <- "./"
  5.  
  6. library(stringr)
  7. #adapted from http://stackoverflow.com/questions/1395528/scraping-html-tables-into-r-data-frames-using-the-xml-package
  8.  
  9. #import the most current polling information into a data table for analysis
  10. require(XML)
  11. #myURL <- "http://www.realclearpolitics.com/epolls/other/trump_favorableunfavorable-5493.html"
  12.  
  13. tables <- readHTMLTable(myURL) # read the poll web page
  14. n.rows <- unlist(lapply(tables, function(t) dim(t)[1])) #format to table
  15. #a=which.max(n.rows)
  16.  
  17. dfPoll <- tables[[a]] #assign the poll data to a data frame
  18.  
  19. write.csv(dfPoll, file = paste(file_loc, "Poll.csv",sep = ""),row.names=TRUE, na="")
  20.  
  21.  
  22. #########33
  23. #suppressMessages(library(dplyr))
  24. #suppressWarnings(library(tidyr))
  25.  
  26.  
  27.  
  28. suppressWarnings(library(stringr))
  29.  
  30. poll <- read.csv(file = paste(file_loc,"Poll.csv",sep = ""), stringsAsFactors = FALSE) #read the poll data into memory
  31. poll <- dfPoll
  32. #poll <- poll[1:20,] #limit dataset to timeframe of available press sentiment analysis (hard coded)
  33. poll <- poll[-1,] #remove top line which is a summary of entire period
  34. colnames(poll)[b:c] <- c("Favorable","Unfavorable") # remove periods from these column names
  35.  
  36. #convert the poll dates to r dates
  37. poll_date <- substr(poll$Date,1,str_locate(poll$Date,"-")-2)
  38. poll_date <- as.Date(poll_date,"%m/%d") # ** not scalable ** breaks at year boundary
  39.  
  40.  
  41. si=nrow(poll)
  42. print('nrow')
  43. print(si)
  44. for(i in 2:nrow(poll)) {
  45. if (poll_date[i] > as.Date("7/7","%m/%d")) {
  46. if (poll_date[i-1] < as.Date("7/7","%m/%d")) {
  47. si = i
  48. break}}
  49. }
  50.  
  51. if(candidatename=="Johnson")
  52. si=nrow(poll)+1
  53.  
  54. for(i in 1:nrow(poll)) {
  55. if (i >= si) {
  56. poll_date[i] <- poll_date[i] - 365
  57. }
  58. }
  59.  
  60. si2 = nrow(poll)
  61.  
  62. there_yes = 0
  63. for(i in (si):nrow(poll)) {
  64. print(i)
  65. if(i==si)
  66. {}
  67. else{
  68. if (poll_date[i] > as.Date("7/7", "%m/%d")) {
  69. if (poll_date[i-1] < as.Date("7/7","%m/%d")) {
  70. si2 = i
  71. there_yes=1
  72. break}}}
  73. }
  74.  
  75. #
  76. # if(there_yes) {
  77. # for(i in 1:nrow(poll)) {
  78. # if (i >= si2) {
  79. # poll_date[i] <- poll_date[i] - 365
  80. # }
  81. # }
  82. #
  83. # }
  84.  
  85. print(si2)
  86. poll <- poll[1:si2,]
  87. poll_date <- poll_date[1:si2]
  88.  
  89. poll <- cbind(poll,poll_date)
  90.  
  91. candidate = replicate(nrow(poll),candidatename)
  92. poll <- cbind(poll,candidate)
  93.  
  94. # f = poll$Favorable
  95. # poll$Favorable = as.numeric(levels(f))[f]
  96. #
  97. # f = poll$Unavorable
  98. # poll$Unavorable = as.numeric(levels(f))[f]
  99.  
  100.  
  101.  
  102. # #add the week number to each row so that we can analyze by week
  103. # poll <- mutate(poll,start_date = as.Date("2016-01-01")) #add column holding first day of year
  104. # week_no <- as.numeric(round(difftime(poll$poll_date, poll$start_date, units = "weeks"),digits = 0)) #calc week of year
  105. # poll <- cbind(poll,week_no) #add week of year column to daily sentiment data frame
  106. #
  107. # #calculate average sentiment by week
  108. # fav <- poll %>%
  109. # group_by(week_no) %>%
  110. # summarise(fav_week = round(mean(Favorable), digits = 0))
  111. # unfav <- poll %>%
  112. # group_by(week_no) %>%
  113. # summarise(unfav_week = round(mean(Unfavorable), digits = 0))
  114. #
  115. # poll_weekly <- cbind(fav, unfav$unfav_week)
  116. # colnames(poll_weekly) <- c("week_no","fav_pct","unfav_pct") #change the column names to be more legible
  117. # str(poll_weekly)
  118.  
  119.  
  120. #######
  121. #p = list(poll,poll_date)
  122. poll
  123. }
  124.  
  125. pa <- list(dothis(4,4,5,"Trump","http://www.realclearpolitics.com/epolls/other/trump_favorableunfavorable-5493.html"), dothis(2,4,5,"Sanders","http://www.realclearpolitics.com/epolls/other/sanders_favorableunfavorable-5263.html"), dothis(4,4,5,"Clinton","http://www.realclearpolitics.com/epolls/other/clinton_favorableunfavorable-1131.html"),
  126. dothis(2,4,5,"Kasich","http://www.realclearpolitics.com/epolls/other/kasich_favorableunfavorable-4260.html"),
  127. dothis(2,4,5,"Rubio","http://www.realclearpolitics.com/epolls/other/rubio_favorableunfavorable-3467.html"),
  128. dothis(2,4,5,"Cruz","http://www.realclearpolitics.com/epolls/other/cruz_favorableunfavorable-3887.html"),
  129. dothis(2,4,5,"Carson","http://www.realclearpolitics.com/epolls/other/carson_favorableunfavorable-5295.html"),
  130. dothis(2,4,5,"Christie","http://www.realclearpolitics.com/epolls/other/christie_favorableunfavorable-3471.html"),
  131. dothis(2,4,5,"O Malley","http://www.realclearpolitics.com/epolls/other/omalley_favorableunfavorable-3475.html"),
  132. dothis(2,4,5,"Stein","http://www.realclearpolitics.com/epolls/other/stein_favorableunfavorable-5979.html"),
  133. dothis(2,4,5,"Johnson","http://www.realclearpolitics.com/epolls/other/johnson_favorableunfavorable-5843.html"))
  134.  
  135.  
  136.  
  137. suppressWarnings(library(ggplot2))
  138.  
  139. #p=ggplot()
  140.  
  141. ploti = function (i) { poll <- pa[[i]]
  142. #poll_date <- pa[[i]][[2]]
  143.  
  144.  
  145. #p <- p+
  146. ggplot(poll, aes(as.Date(poll_date), y = value, color = variable)) +
  147. geom_line(aes(y = Favorable, col = "Favorable")) +
  148. geom_line(aes(y = Unfavorable, col = "Unfavorable"))
  149. }
  150.  
  151. for(i in 1:length(pa)) {
  152. ploti(i)
  153. }
  154. # ggplot(pa[[2]], aes(as.Date(poll_date), x = value, color = variable)) +
  155. # geom_line(aes(y = Favorable, col = "Favorable")) +
  156. # geom_line(aes(y = Unfavorable, col = "Unfavorable"))
  157.  
  158.  
  159. library(ggplot2)
  160. library(reshape2)
  161.  
  162. # df = pa
  163. # df_melt = melt(df, id.vars = 'poll_date')
  164. # ggplot(df_melt, aes(x = date, y = value)) +
  165. # geom_line() +
  166. # facet_wrap(~ variable, scales = 'free_y', ncol = 1)
  167.  
  168.  
  169. plot(pa[[1]]$poll_date,pa[[1]]$Favorable)
  170. points(pa[[1]]$poll_date,pa[[1]]$Unavorable)
  171. points(pa[[2]]$poll_date,pa[[2]]$Favorable)
  172. points(pa[[2]]$poll_date,pa[[2]]$Unavorable)
  173.  
  174. d = rbind(pa[[1]],pa[[2]],pa[[3]],pa[[4]],pa[[5]],pa[[6]],pa[[7]],pa[[8]],pa[[9]],pa[[10]],pa[[11]])
  175. ggplot(d, aes(as.Date(d$poll_date), y=value, color = variable)) +
  176. geom_line(aes(y = Favorable, col = "Favorable")) +
  177. geom_line(aes(y = Unfavorable, col = "Unfavorable"))
  178.  
  179.  
  180. f = d$Favorable
  181. d$Favorable = as.numeric(levels(f))[f]
  182.  
  183. f = d$Unfavorable
  184. d$Unfavorable = as.numeric(levels(f))[f]
  185.  
  186. d$gain = (d$Favorable - d$Unfavorable)/(d$Favorable + d$Unfavorable)*100
  187. d$net = (d$Favorable - d$Unfavorable)
  188.  
  189. qplot(as.Date(poll_date),net, data=d, geom = c('smooth'),colour=candidate,method='loess',lwd=2,span=.5,se=FALSE)
  190. #, method = "lm"
  191.  
  192. library(zoo)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement