Need a unique gift idea?
A Pastebin account makes a great Christmas gift
SHARE
TWEET

Untitled

a guest May 7th, 2017 116 Never
Upgrade to PRO!
ENDING IN00days00hours00mins00secs
 
  1. library(reshape)
  2. library(RCurl)
  3. library(XML)
  4. library(stringr)
  5.  
  6. theurl <- getURL("https://en.wikipedia.org/wiki/Opinion_polling_in_the_43rd_Canadian_federal_election", ssl.verifyPeer=FALSE)
  7. tables <- readHTMLTable(theurl)
  8.  
  9. #get the nth tables on the page
  10. df <- tables[[1]]
  11.  
  12. # Remove empty rows (wikipedia tables sometime use empty rows for spacing)
  13. df <- df[!apply(df == "", 1, all),]
  14.  
  15. #remove the last election results and two blank rows
  16. #df <- head(df, -)
  17.  
  18. # Last row will be the election, to use it we need to add a sample size...
  19. # ?
  20.  
  21. #df <- df[1:nrow(df), c(1:11) ]
  22. #df2 <- tables[[2]]
  23.  
  24. # get the first x rwos
  25. #df2 <- head(df2, 10)
  26.  
  27. #join the data
  28. #df <- rbind(df, df2)
  29.  
  30. #get columns 1-2 and 4-9.  Col 3 is a link to poll source
  31. df <- df[1:nrow(df), c(1:2, 4:10) ]
  32.  
  33. #add column headings
  34. c.names <- c("Firm", "Date", "Liberal", "Conservative", "NDP", "BQ", "Green", "Error", "Sample_Size")
  35. names(df) <- c.names
  36.  
  37. #print resulting table to console for debugging
  38. df
  39.  
  40. # DATE FORMATTING
  41. # format Date column
  42. df$Date = as.Date(substr(df[, 2], 9, 18))
  43.  
  44. #as.numeric gives date since 1970-01-01
  45. # add days from 1900-01-01 to match spreadsheet data, +1 for 1970-01-01 day, and +1 for leapyear bug in 1900
  46. df$Date = as.numeric(df$Date) - as.numeric(as.Date('1900-01-01')) +2
  47.  
  48. # SAMPLE SIZES
  49. ## EXTRACTING ROLLING POLL INFO
  50. # Samples sizes are given as e.g.: 1000 (1/4), need to multiple sample size by fraction.
  51.  
  52. # Get number in parentheses
  53. Rolling_Poll <- str_extract(df$Sample_Size, "(?<=\\().*(?=\\))")
  54. # convert fraction text "1/4" to decimal number 0.25
  55. Rolling_Poll = sapply(Rolling_Poll, function(x) eval(parse(text=x)))
  56. # change NA to 1
  57. Rolling_Poll[is.na(Rolling_Poll)] <- 1
  58.  
  59. #matches <- regexpr("\\(.*?\\)", df$Sample_Size)
  60. #fractions_with_parens <- regmatches(df$Sample_Size, matches)
  61. #fractions <- gsub("[\\(\\)]", "", more)
  62.  
  63. # extract number from Error text ("+/-3.1 pp" --> 3.1)
  64. #df$Error = as.numeric(gsub("[^0-9.]", "", unlist(df$Error)))
  65. #df$Sample_Size = as.numeric(gsub(",", "", unlist(df$Sample_Size)))
  66.  
  67.  
  68. # Extract sample size
  69. df$Sample_Size = gsub(",", "", unlist(df$Sample_Size)) # remove commas in numbers
  70. df$Sample_Size = sub("^$", "99999999", df$Sample_Size)
  71. # Fancy regex to remove parenthetical notes, eg 1,000 (1/4) needs to remove comma and (1/4) to convert to number
  72. df$Sample_Size = as.numeric(gsub("\\s*\\([^\\)]+\\)", "", unlist(df$Sample_Size)))
  73.  
  74. df$Sample_Size = df$Sample_Size * Rolling_Poll
  75.  
  76. df
  77.  
  78. df$Error = 1/sqrt(df$Sample_Size)
  79.  
  80. # reorganize data
  81. mdata <- melt(df, id=c("Date", "Firm", "Error", "Sample_Size"))
  82.  
  83. # sort data
  84. mdata <- mdata[with(mdata, order(Date)), ]
  85.  
  86. # relabel data after reorganization
  87. c.names <- c("Date", "Firm", "Error", "Sample_Size", "Party", "Popular_Support")
  88. names(mdata) <- c.names
  89. mdata
  90. mdata$Popular_Support <- str_trim(mdata$Popular_Support)
  91. mdata$Popular_Support <- as.numeric(mdata$Popular_Support)
  92.  
  93. #mdata$Sample_Size <- as.numeric(mdata$Sample_Size)
  94.  
  95. polls <- mdata
  96.  
  97. # print to console
  98. #polls
  99.  
  100.  
  101. # Last election data
  102. last_election_date_value <- 42296 # 2015/10/19
  103. Date = rep.int(last_election_date_value, 5)
  104. Party = c('Liberal','Conservative','NDP','BQ','Green')
  105. Popular_Support = c(39.5,31.9,19.7,4.7,3.4)
  106. Error = rep.int(0,5)
  107. LastElection = data.frame(Date, Party, Popular_Support, Error)
  108.  
  109. # thiselection data
  110. next_election_date_value <- 43759 # 2019/10/21
  111. #Date = rep.int(42211, 5)
  112. #Party = c('Conservative','Liberal','NDP','BQ','Green')
  113. #Popular_Support = c(39.6,18.9,30.6,6,3.9)
  114. #Error = rep.int(0,5)
  115. #ThisElection = data.frame(Date, Party, Popular_Support, Error)
  116.  
  117. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  118. election_polls <- polls[polls$Date > (last_election_date_value + 10),]
  119. #election_polls <- polls
  120.  
  121. #colors <- c(Conservative="blue", Liberal="red", NDP="orange", Green="green3", BQ="turquoise4")
  122.  
  123. colors <- c("red", "blue", "orange", "turquoise4", "green3")
  124.  
  125. library(ggplot2)
  126.  
  127. main_aes = aes(x = Date, y = Popular_Support, colour=Party, size=1/Error, weight=1/Error)
  128.  
  129. # Set plot file settings.  This need to be before we generate the plot.
  130. svg(filename="PollsPlot.svg",
  131.     width=15, # inches, think 72px/inch: want 1080px / 72 ppi = 15 inches
  132.     height=7,
  133.     pointsize=12
  134. )
  135.  
  136. plot <- ggplot(election_polls)
  137. plot2 <- plot +  geom_point(main_aes)
  138. plot2 <- plot2 + scale_colour_manual(values = colors)
  139.  
  140. # Add smooth trendline
  141. plot_smooth <- plot2 + stat_smooth(data=polls, span = .35, show_guide= F, main_aes)
  142.  
  143. # Extract the data so we can work on it
  144. smooth_data <- ggplot_build(plot_smooth)$data[[2]]
  145.  
  146. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  147. smooth_data <- smooth_data[smooth_data$x > last_election_date_value + 10, ]
  148.  
  149. # Format and add trendlines for each party/color
  150. for(color in colors) {
  151.   party_trend <- subset(smooth_data, colour == color)
  152.   plot <- plot + geom_ribbon(data = party_trend, aes(x=x, ymin=ymin, ymax = ymax), alpha = .25)
  153.   plot <- plot + geom_line(data = party_trend, colour=color, aes(x = x, y = y))
  154. }
  155.  
  156. # Legend (Party)
  157. plot <- plot + scale_colour_manual(values = colors)
  158.  
  159. # The scatterplot points
  160. plot <- plot + geom_point(main_aes, alpha=0.8)
  161.  
  162. # Vertical line if seperation in dates between election and polls
  163. #plot <- plot + geom_vline(xintercept = 42296, linetype = 5, color='darkgray',show_guide= F)
  164.  
  165. # Legend (Sample Size)
  166. plot <- plot + scale_size_area(max_size=3, breaks=seq(20,60,10), labels=seq(20,60,10)^2, name="Sample Size")
  167. plot <- plot + guides(color = guide_legend(order=-1) )
  168.  
  169. # Last election data points and text                                
  170. plot <- plot + geom_point(data=LastElection, size=3, shape=5, show_guide = F, main_aes)
  171. plot <- plot + geom_point(data=LastElection, size=2, show_guide=F, main_aes)
  172. plot <- plot + geom_text(data=LastElection, show_guide=F,
  173.             aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=1.5, vjust=-0.4)
  174.  
  175. # this election
  176. #plot <- plot + geom_point(data=ThisElection, size=3, shape=5, show_guide=F, main_aes) +
  177. #  geom_point(data=ThisElection, size=2, show_guide=F, main_aes) +
  178. #  geom_text(data=ThisElection, show_guide=F,
  179. #            aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=-.2, vjust=-0.4)
  180.  
  181. last_election_date_value <- 42296 # 2015/10/19
  182. next_election_date_value <- 43759 # 2019/10/21
  183. start_date_value <- last_election_date_value
  184. end_date_value <- next_election_date_value
  185.  
  186. date_labels <- as.character(seq(as.Date("2015/10/19"), as.Date("2019/10/21"), by=77))
  187. date_labels[1] <- "Election\n2015/10/19"
  188. date_labels[length(date_labels)] <- "Election\n2019/10/21"
  189.  
  190. min_gridlines <- seq(last_election_date_value, next_election_date_value, by=11)
  191. maj_gridlines <- seq(last_election_date_value, next_election_date_value, by=77)
  192.  
  193. # X-axis
  194. plot <- plot + scale_x_continuous(name = "Date", limits=c(start_date_value, end_date_value),
  195.                                   minor_breaks = min_gridlines,
  196.                                   breaks = maj_gridlines,
  197.                                   labels = date_labels
  198.                                   )
  199.                                  
  200.                                   #labels=c('2 May 11\nElection','', '5 Aug','10 Aug','15 Aug','20 Aug','25 Aug','30 Aug','4 Sep','9 Sep','14 Sep','19 Sep','24 Sep','29 Sep','4 Oct','9 Oct','14 Oct','19 Oct\nElection'))
  201.                                   #minor_breaks = seq(42216, 42296, by=1), breaks = seq(42291, 43759, by=90))
  202. plot <- plot + theme(axis.text.x = element_text(size = 11, vjust=0.5, hjust=0, angle = 90, colour="#333333"))
  203. plot <- plot + theme(axis.title.x = element_blank())
  204.  
  205. # Y-axis
  206. plot <- plot + scale_y_continuous(name = "% Popular Support", lim=c(0,56), expand=c(0,0))
  207. plot <- plot + theme(axis.text.y = element_text(size = 11))
  208. plot <- plot + theme(axis.title.y = element_text(size = 11, angle = 90, colour="#333333"))
  209.  
  210. # Legend location
  211. #theme(legend.justification=c(1,1), legend.position=c(1,1))
  212.  
  213. # Done!
  214. print(plot) #display plot
  215.  
  216. dev.off() # saves plot to R project's directory, using settings we set way up there.
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top