Guest User

Download/parse historical NZ opinion poll results

a guest
May 13th, 2012
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.75 KB | None | 0 0
  1. def get_mediawiki_source(article_title):
  2.     import urllib, json
  3.     params = { "format":"json", "action":"query", "prop":"revisions", "rvprop":"timestamp|user|comment|content" }
  4.     params["titles"] = "%s" % urllib.quote(article_title.encode("utf8"))
  5.     qs = "&".join("%s=%s" % (k, v)  for k, v in params.items())
  6.     url = "http://en.wikipedia.org/w/api.php?%s" % qs
  7.     fp = urllib.urlopen(url)
  8.     jsondata = json.load(fp)
  9.     return jsondata['query']['pages'].values()[0]['revisions'][-1]['*']
  10.  
  11. def prettyprint(cell, depth=0):
  12.     print '  ' * depth, cell
  13.     for child in cell.children:
  14.         prettyprint(child, depth+2)
  15.  
  16. def cellvalue(cell):
  17.     from mwlib import parser
  18.     articlelink = cell.find(parser.ArticleLink)
  19.     if articlelink:
  20.         cell = articlelink[0]
  21.     while not isinstance(cell, parser.Text):
  22.         if cell.children:
  23.             cell = cell.children[0]
  24.         else:
  25.             break
  26.     value = cell.asText().strip()
  27.     try:
  28.         value = float(value)
  29.     except:
  30.         pass
  31.     return value
  32.  
  33. import sys, csv, codecs, cStringIO
  34. class UnicodeWriter:
  35.     """
  36.    A CSV writer which will write rows to CSV file "f",
  37.    which is encoded in the given encoding.
  38.    """
  39.  
  40.     def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  41.         # Redirect output to a queue
  42.         self.queue = cStringIO.StringIO()
  43.         self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  44.         self.stream = f
  45.         self.encoder = codecs.getincrementalencoder(encoding)()
  46.  
  47.     def writerow(self, row):
  48.         self.writer.writerow([s.encode("utf-8") for s in row])
  49.         # Fetch UTF-8 output from the queue ...
  50.         data = self.queue.getvalue()
  51.         data = data.decode("utf-8")
  52.         # ... and reencode it into the target encoding
  53.         data = self.encoder.encode(data)
  54.         # write to the target stream
  55.         self.stream.write(data)
  56.         # empty queue
  57.         self.queue.truncate(0)
  58.  
  59.     def writerows(self, rows):
  60.         for row in rows:
  61.             self.writerow(row)
  62.  
  63. if __name__=="__main__":
  64.     import sys
  65.     from mwlib.refine import compat as mw
  66.     from mwlib import parser
  67.     title = "Opinion polling for the New Zealand general election, 2005"
  68.     pollresult_section_title = 'Individual polls'
  69.     article = get_mediawiki_source(title)
  70.     #get the rows from the first table found
  71.     rows = mw.parse_txt(article).find(parser.Table)[0].find(parser.Row)
  72.     nicerows = [[cellvalue(cell) for cell in r.find(parser.Cell)] for r in rows]
  73.     finalrows = [row for row in nicerows if len(row) > 2] #remove non-data rows
  74.    
  75.     writer = UnicodeWriter(sys.stdout)
  76.     for row in finalrows:
  77.         row = [unicode(item) for item in row]
  78.         writer.writerow(row)
Add Comment
Please, Sign In to add comment