Advertisement
Guest User

Untitled

a guest
May 7th, 2017
233
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.08 KB | None | 0 0
  1. library(reshape)
  2. library(RCurl)
  3. library(XML)
  4. library(stringr)
  5.  
  6. theurl <- getURL("https://en.wikipedia.org/wiki/Opinion_polling_in_the_43rd_Canadian_federal_election", ssl.verifyPeer=FALSE)
  7. tables <- readHTMLTable(theurl)
  8.  
  9. #get the nth tables on the page
  10. df <- tables[[1]]
  11.  
  12. # Remove empty rows (wikipedia tables sometime use empty rows for spacing)
  13. df <- df[!apply(df == "", 1, all),]
  14.  
  15. #remove the last election results and two blank rows
  16. #df <- head(df, -)
  17.  
  18. # Last row will be the election, to use it we need to add a sample size...
  19. # ?
  20.  
  21. #df <- df[1:nrow(df), c(1:11) ]
  22. #df2 <- tables[[2]]
  23.  
  24. # get the first x rwos
  25. #df2 <- head(df2, 10)
  26.  
  27. #join the data
  28. #df <- rbind(df, df2)
  29.  
  30. #get columns 1-2 and 4-9. Col 3 is a link to poll source
  31. df <- df[1:nrow(df), c(1:2, 4:10) ]
  32.  
  33. #add column headings
  34. c.names <- c("Firm", "Date", "Liberal", "Conservative", "NDP", "BQ", "Green", "Error", "Sample_Size")
  35. names(df) <- c.names
  36.  
  37. #print resulting table to console for debugging
  38. df
  39.  
  40. # DATE FORMATTING
  41. # format Date column
  42. df$Date = as.Date(substr(df[, 2], 9, 18))
  43.  
  44. #as.numeric gives date since 1970-01-01
  45. # add days from 1900-01-01 to match spreadsheet data, +1 for 1970-01-01 day, and +1 for leapyear bug in 1900
  46. df$Date = as.numeric(df$Date) - as.numeric(as.Date('1900-01-01')) +2
  47.  
  48. # SAMPLE SIZES
  49. ## EXTRACTING ROLLING POLL INFO
  50. # Samples sizes are given as e.g.: 1000 (1/4), need to multiple sample size by fraction.
  51.  
  52. # Get number in parentheses
  53. Rolling_Poll <- str_extract(df$Sample_Size, "(?<=\\().*(?=\\))")
  54. # convert fraction text "1/4" to decimal number 0.25
  55. Rolling_Poll = sapply(Rolling_Poll, function(x) eval(parse(text=x)))
  56. # change NA to 1
  57. Rolling_Poll[is.na(Rolling_Poll)] <- 1
  58.  
  59. #matches <- regexpr("\\(.*?\\)", df$Sample_Size)
  60. #fractions_with_parens <- regmatches(df$Sample_Size, matches)
  61. #fractions <- gsub("[\\(\\)]", "", more)
  62.  
  63. # extract number from Error text ("+/-3.1 pp" --> 3.1)
  64. #df$Error = as.numeric(gsub("[^0-9.]", "", unlist(df$Error)))
  65. #df$Sample_Size = as.numeric(gsub(",", "", unlist(df$Sample_Size)))
  66.  
  67.  
  68. # Extract sample size
  69. df$Sample_Size = gsub(",", "", unlist(df$Sample_Size)) # remove commas in numbers
  70. df$Sample_Size = sub("^$", "99999999", df$Sample_Size)
  71. # Fancy regex to remove parenthetical notes, eg 1,000 (1/4) needs to remove comma and (1/4) to convert to number
  72. df$Sample_Size = as.numeric(gsub("\\s*\\([^\\)]+\\)", "", unlist(df$Sample_Size)))
  73.  
  74. df$Sample_Size = df$Sample_Size * Rolling_Poll
  75.  
  76. df
  77.  
  78. df$Error = 1/sqrt(df$Sample_Size)
  79.  
  80. # reorganize data
  81. mdata <- melt(df, id=c("Date", "Firm", "Error", "Sample_Size"))
  82.  
  83. # sort data
  84. mdata <- mdata[with(mdata, order(Date)), ]
  85.  
  86. # relabel data after reorganization
  87. c.names <- c("Date", "Firm", "Error", "Sample_Size", "Party", "Popular_Support")
  88. names(mdata) <- c.names
  89. mdata
  90. mdata$Popular_Support <- str_trim(mdata$Popular_Support)
  91. mdata$Popular_Support <- as.numeric(mdata$Popular_Support)
  92.  
  93. #mdata$Sample_Size <- as.numeric(mdata$Sample_Size)
  94.  
  95. polls <- mdata
  96.  
  97. # print to console
  98. #polls
  99.  
  100.  
  101. # Last election data
  102. last_election_date_value <- 42296 # 2015/10/19
  103. Date = rep.int(last_election_date_value, 5)
  104. Party = c('Liberal','Conservative','NDP','BQ','Green')
  105. Popular_Support = c(39.5,31.9,19.7,4.7,3.4)
  106. Error = rep.int(0,5)
  107. LastElection = data.frame(Date, Party, Popular_Support, Error)
  108.  
  109. # thiselection data
  110. next_election_date_value <- 43759 # 2019/10/21
  111. #Date = rep.int(42211, 5)
  112. #Party = c('Conservative','Liberal','NDP','BQ','Green')
  113. #Popular_Support = c(39.6,18.9,30.6,6,3.9)
  114. #Error = rep.int(0,5)
  115. #ThisElection = data.frame(Date, Party, Popular_Support, Error)
  116.  
  117. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  118. election_polls <- polls[polls$Date > (last_election_date_value + 10),]
  119. #election_polls <- polls
  120.  
  121. #colors <- c(Conservative="blue", Liberal="red", NDP="orange", Green="green3", BQ="turquoise4")
  122.  
  123. colors <- c("red", "blue", "orange", "turquoise4", "green3")
  124.  
  125. library(ggplot2)
  126.  
  127. main_aes = aes(x = Date, y = Popular_Support, colour=Party, size=1/Error, weight=1/Error)
  128.  
  129. # Set plot file settings. This need to be before we generate the plot.
  130. svg(filename="PollsPlot.svg",
  131. width=15, # inches, think 72px/inch: want 1080px / 72 ppi = 15 inches
  132. height=7,
  133. pointsize=12
  134. )
  135.  
  136. plot <- ggplot(election_polls)
  137. plot2 <- plot + geom_point(main_aes)
  138. plot2 <- plot2 + scale_colour_manual(values = colors)
  139.  
  140. # Add smooth trendline
  141. plot_smooth <- plot2 + stat_smooth(data=polls, span = .35, show_guide= F, main_aes)
  142.  
  143. # Extract the data so we can work on it
  144. smooth_data <- ggplot_build(plot_smooth)$data[[2]]
  145.  
  146. # Use this if including previous data as part of the smoothing, but don't want it displayed!
  147. smooth_data <- smooth_data[smooth_data$x > last_election_date_value + 10, ]
  148.  
  149. # Format and add trendlines for each party/color
  150. for(color in colors) {
  151. party_trend <- subset(smooth_data, colour == color)
  152. plot <- plot + geom_ribbon(data = party_trend, aes(x=x, ymin=ymin, ymax = ymax), alpha = .25)
  153. plot <- plot + geom_line(data = party_trend, colour=color, aes(x = x, y = y))
  154. }
  155.  
  156. # Legend (Party)
  157. plot <- plot + scale_colour_manual(values = colors)
  158.  
  159. # The scatterplot points
  160. plot <- plot + geom_point(main_aes, alpha=0.8)
  161.  
  162. # Vertical line if seperation in dates between election and polls
  163. #plot <- plot + geom_vline(xintercept = 42296, linetype = 5, color='darkgray',show_guide= F)
  164.  
  165. # Legend (Sample Size)
  166. plot <- plot + scale_size_area(max_size=3, breaks=seq(20,60,10), labels=seq(20,60,10)^2, name="Sample Size")
  167. plot <- plot + guides(color = guide_legend(order=-1) )
  168.  
  169. # Last election data points and text
  170. plot <- plot + geom_point(data=LastElection, size=3, shape=5, show_guide = F, main_aes)
  171. plot <- plot + geom_point(data=LastElection, size=2, show_guide=F, main_aes)
  172. plot <- plot + geom_text(data=LastElection, show_guide=F,
  173. aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=1.5, vjust=-0.4)
  174.  
  175. # this election
  176. #plot <- plot + geom_point(data=ThisElection, size=3, shape=5, show_guide=F, main_aes) +
  177. # geom_point(data=ThisElection, size=2, show_guide=F, main_aes) +
  178. # geom_text(data=ThisElection, show_guide=F,
  179. # aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=-.2, vjust=-0.4)
  180.  
  181. last_election_date_value <- 42296 # 2015/10/19
  182. next_election_date_value <- 43759 # 2019/10/21
  183. start_date_value <- last_election_date_value
  184. end_date_value <- next_election_date_value
  185.  
  186. date_labels <- as.character(seq(as.Date("2015/10/19"), as.Date("2019/10/21"), by=77))
  187. date_labels[1] <- "Election\n2015/10/19"
  188. date_labels[length(date_labels)] <- "Election\n2019/10/21"
  189.  
  190. min_gridlines <- seq(last_election_date_value, next_election_date_value, by=11)
  191. maj_gridlines <- seq(last_election_date_value, next_election_date_value, by=77)
  192.  
  193. # X-axis
  194. plot <- plot + scale_x_continuous(name = "Date", limits=c(start_date_value, end_date_value),
  195. minor_breaks = min_gridlines,
  196. breaks = maj_gridlines,
  197. labels = date_labels
  198. )
  199.  
  200. #labels=c('2 May 11\nElection','', '5 Aug','10 Aug','15 Aug','20 Aug','25 Aug','30 Aug','4 Sep','9 Sep','14 Sep','19 Sep','24 Sep','29 Sep','4 Oct','9 Oct','14 Oct','19 Oct\nElection'))
  201. #minor_breaks = seq(42216, 42296, by=1), breaks = seq(42291, 43759, by=90))
  202. plot <- plot + theme(axis.text.x = element_text(size = 11, vjust=0.5, hjust=0, angle = 90, colour="#333333"))
  203. plot <- plot + theme(axis.title.x = element_blank())
  204.  
  205. # Y-axis
  206. plot <- plot + scale_y_continuous(name = "% Popular Support", lim=c(0,56), expand=c(0,0))
  207. plot <- plot + theme(axis.text.y = element_text(size = 11))
  208. plot <- plot + theme(axis.title.y = element_text(size = 11, angle = 90, colour="#333333"))
  209.  
  210. # Legend location
  211. #theme(legend.justification=c(1,1), legend.position=c(1,1))
  212.  
  213. # Done!
  214. print(plot) #display plot
  215.  
  216. dev.off() # saves plot to R project's directory, using settings we set way up there.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement