daily pastebin goal
36%
SHARE
TWEET

ElectionPolls2017

a guest May 7th, 2017 87 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. library(reshape)
  2. library(RCurl)
  3. library(XML)
  4. library(stringr)
  5.  
  6. theurl <- getURL("https://en.wikipedia.org/wiki/Opinion_polling_in_the_43rd_Canadian_federal_election", ssl.verifyPeer=FALSE)
  7. tables <- readHTMLTable(theurl)
  8.  
  9. #get the nth tables on the page
  10. df <- tables[[1]]
  11.  
  12. # Remove empty rows (wikipedia tables sometime use empty rows for spacing)
  13. df <- df[!apply(df == "", 1, all),]
  14.  
  15. #remove the last election results and two blank rows
  16. #df <- head(df, -)
  17.  
  18. # Last row will be the election, to use it we need to add a sample size...
  19. # ?
  20.  
  21. #df <- df[1:nrow(df), c(1:11) ]
  22. #df2 <- tables[[2]]
  23.  
  24. # get the first x rwos
  25. #df2 <- head(df2, 10)
  26.  
  27. #join the data
  28. #df <- rbind(df, df2)
  29.  
  30. #get columns 1-2 and 4-9.  Col 3 is a link to poll source
  31. df <- df[1:nrow(df), c(1:2, 4:10) ]
  32.  
  33. #add column headings
  34. c.names <- c("Firm", "Date", "Liberal", "Conservative", "NDP", "BQ", "Green", "Error", "Sample_Size")
  35. names(df) <- c.names
  36.  
  37. #print resulting table to console for debugging
  38. df
  39.  
  40. # DATE FORMATTING
  41. # format Date column
  42. df$Date = as.Date(substr(df[, 2], 9, 18))
  43.  
  44. #as.numeric gives date since 1970-01-01
  45. # add days from 1900-01-01 to match spreadsheet data, +1 for 1970-01-01 day, and +1 for leapyear bug in 1900
  46. df$Date = as.numeric(df$Date) - as.numeric(as.Date('1900-01-01')) +2
  47.  
  48. # SAMPLE SIZES
  49. ## EXTRACTING ROLLING POLL INFO
  50. # Samples sizes are given as e.g.: 1000 (1/4), need to multiple sample size by fraction.
  51.  
  52. # Get number in parentheses
  53. Rolling_Poll <- str_extract(df$Sample_Size, "(?<=\\().*(?=\\))")
  54. # convert fraction text "1/4" to decimal number 0.25
  55. Rolling_Poll = sapply(Rolling_Poll, function(x) eval(parse(text=x)))
  56. # change NA to 1
  57. Rolling_Poll[is.na(Rolling_Poll)] <- 1
  58.  
  59. #matches <- regexpr("\\(.*?\\)", df$Sample_Size)
  60. #fractions_with_parens <- regmatches(df$Sample_Size, matches)
  61. #fractions <- gsub("[\\(\\)]", "", more)
  62.  
  63. # extract number from Error text ("+/-3.1 pp" --> 3.1)
  64. #df$Error = as.numeric(gsub("[^0-9.]", "", unlist(df$Error)))
  65. #df$Sample_Size = as.numeric(gsub(",", "", unlist(df$Sample_Size)))
  66.  
  67.  
  68. # Extract sample size
  69. df$Sample_Size = gsub(",", "", unlist(df$Sample_Size)) # remove commas in numbers
  70. df$Sample_Size = sub("^$", "99999999", df$Sample_Size)
  71. # Fancy regex to remove parenthetical notes, eg 1,000 (1/4) needs to remove comma and (1/4) to convert to number
  72. df$Sample_Size = as.numeric(gsub("\\s*\\([^\\)]+\\)", "", unlist(df$Sample_Size)))
  73.  
  74. df$Sample_Size = df$Sample_Size * Rolling_Poll
  75.  
  76. df
  77.  
  78. df$Error = 1/sqrt(df$Sample_Size)
  79.  
  80. # reorganize data
  81. mdata <- melt(df, id=c("Date", "Firm", "Error", "Sample_Size"))
  82.  
  83. # sort data
  84. mdata <- mdata[with(mdata, order(Date)), ]
  85.  
  86. # relabel data after reorganization
  87. c.names <- c("Date", "Firm", "Error", "Sample_Size", "Party", "Popular_Support")
  88. names(mdata) <- c.names
  89. mdata
  90. mdata$Popular_Support <- str_trim(mdata$Popular_Support)
  91. mdata$Popular_Support <- as.numeric(mdata$Popular_Support)
  92.  
  93. #mdata$Sample_Size <- as.numeric(mdata$Sample_Size)
  94.  
  95. polls <- mdata
  96.  
  97. # print to console
  98. #polls
  99.  
  100.  
  101. # Last election data
  102. last_election_date_value <- 42296 # 2015/10/19
  103. Date = rep.int(last_election_date_value, 5)
  104. Party = c('Liberal','Conservative','NDP','BQ','Green')
  105. Popular_Support = c(39.5,31.9,19.7,4.7,3.4)
  106. Error = rep.int(0,5)
  107. LastElection = data.frame(Date, Party, Popular_Support, Error)
  108.  
  109. # thiselection data
  110. next_election_date_value <- 43759 # 2019/10/21
  111. #Date = rep.int(42211, 5)
  112. #Party = c('Conservative','Liberal','NDP','BQ','Green')
  113. #Popular_Support = c(39.6,18.9,30.6,6,3.9)
  114. #Error = rep.int(0,5)
  115. #ThisElection = data.frame(Date, Party, Popular_Support, Error)
  116.  
  117. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  118. election_polls <- polls[polls$Date > (last_election_date_value + 10),]
  119. #election_polls <- polls
  120.  
  121. #colors <- c(Conservative="blue", Liberal="red", NDP="orange", Green="green3", BQ="turquoise4")
  122.  
  123. colors <- c("red", "blue", "orange", "turquoise4", "green3")
  124.  
  125. library(ggplot2)
  126.  
  127. main_aes = aes(x = Date, y = Popular_Support, colour=Party, size=1/Error, weight=1/Error)
  128.  
  129. plot <- ggplot(election_polls)
  130. plot2 <- plot +  geom_point(main_aes)
  131. plot2 <- plot2 + scale_colour_manual(values = colors)
  132.  
  133. # Add smooth trendline
  134. plot_smooth <- plot2 + stat_smooth(data=polls, span = .35, show_guide= F, main_aes)
  135.  
  136. # Extract the data so we can work on it
  137. smooth_data <- ggplot_build(plot_smooth)$data[[2]]
  138.  
  139. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  140. smooth_data <- smooth_data[smooth_data$x > last_election_date_value + 10, ]
  141.  
  142. # Format and add trendlines for each party/color
  143. for(color in colors) {
  144.   party_trend <- subset(smooth_data, colour == color)
  145.   plot <- plot + geom_ribbon(data = party_trend, aes(x=x, ymin=ymin, ymax = ymax), alpha = .25)
  146.   plot <- plot + geom_line(data = party_trend, colour=color, aes(x = x, y = y))
  147. }
  148.  
  149. # Legend (Party)
  150. plot <- plot + scale_colour_manual(values = colors)
  151.  
  152. # The scatterplot points
  153. plot <- plot + geom_point(main_aes, alpha=0.8)
  154.  
  155. # Vertical line if seperation in dates between election and polls
  156. #plot <- plot + geom_vline(xintercept = 42296, linetype = 5, color='darkgray',show_guide= F)
  157.  
  158. # Legend (Sample Size)
  159. plot <- plot + scale_size_area(max_size=3, breaks=seq(20,60,10), labels=seq(20,60,10)^2, name="Sample Size")
  160. plot <- plot + guides(color = guide_legend(order=-1) )
  161.  
  162. # Last election data points and text                                
  163. plot <- plot + geom_point(data=LastElection, size=3, shape=5, show_guide = F, main_aes)
  164. plot <- plot + geom_point(data=LastElection, size=2, show_guide=F, main_aes)
  165. plot <- plot + geom_text(data=LastElection, show_guide=F,
  166.             aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=1.5, vjust=-0.4)
  167.  
  168. # this election
  169. #plot <- plot + geom_point(data=ThisElection, size=3, shape=5, show_guide=F, main_aes) +
  170. #  geom_point(data=ThisElection, size=2, show_guide=F, main_aes) +
  171. #  geom_text(data=ThisElection, show_guide=F,
  172. #            aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=-.2, vjust=-0.4)
  173.  
  174. last_election_date_value <- 42296 # 2015/10/19
  175. next_election_date_value <- 43759 # 2019/10/21
  176. start_date_value <- last_election_date_value
  177. end_date_value <- next_election_date_value
  178.  
  179. date_labels <- as.character(seq(as.Date("2015/10/19"), as.Date("2019/10/21"), by=77))
  180. date_labels[1] <- "Election\n2015/10/19"
  181. date_labels[length(date_labels)] <- "Election\n2019/10/21"
  182.  
  183. min_gridlines <- seq(last_election_date_value, next_election_date_value, by=11)
  184. maj_gridlines <- seq(last_election_date_value, next_election_date_value, by=77)
  185.  
  186. # X-axis
  187. plot <- plot + scale_x_continuous(name = "Date", limits=c(start_date_value, end_date_value),
  188.                                   minor_breaks = min_gridlines,
  189.                                   breaks = maj_gridlines,
  190.                                   labels = date_labels
  191.                                   )
  192.                                  
  193.                                   #labels=c('2 May 11\nElection','', '5 Aug','10 Aug','15 Aug','20 Aug','25 Aug','30 Aug','4 Sep','9 Sep','14 Sep','19 Sep','24 Sep','29 Sep','4 Oct','9 Oct','14 Oct','19 Oct\nElection'))
  194.                                   #minor_breaks = seq(42216, 42296, by=1), breaks = seq(42291, 43759, by=90))
  195. plot <- plot + theme(axis.text.x = element_text(size = 11, vjust=0.5, hjust=0, angle = 90, colour="#333333"))
  196. plot <- plot + theme(axis.title.x = element_blank())
  197.  
  198. # Y-axis
  199. plot <- plot + scale_y_continuous(name = "% Popular Support", lim=c(0,56), expand=c(0,0))
  200. plot <- plot + theme(axis.text.y = element_text(size = 11))
  201. plot <- plot + theme(axis.title.y = element_text(size = 11, angle = 90, colour="#333333"))
  202.  
  203. # Legend location
  204. #theme(legend.justification=c(1,1), legend.position=c(1,1))
  205.  
  206. # Done!
  207. print(plot)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top