Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_mediawiki_source(article_title):
    """Fetch revision data for *article_title* from the English Wikipedia API.

    Sends an ``action=query`` / ``prop=revisions`` request asking for the
    ``timestamp|user|comment|content`` revision properties and returns the
    ``'*'`` field of the last revision entry of the first page in the
    response (presumably the page's wikitext, since ``content`` is among
    the requested properties -- confirm against the MediaWiki API docs).

    Args:
        article_title: Title of the article. Text is encoded as UTF-8;
            a byte string is sent as-is.

    Returns:
        The value stored under ``'*'`` in the selected revision entry.

    Raises:
        KeyError, IndexError: If the decoded JSON does not contain the
            ``query`` / ``pages`` / ``revisions`` / ``*`` structure that is
            indexed below.
    """
    import json
    from contextlib import closing
    try:  # Python 3 module layout
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:  # Python 2 module layout
        from urllib import urlencode, urlopen

    # Encode text once; leave byte strings alone so that already-encoded
    # non-ASCII titles do not trip an implicit ASCII decode on Python 2.
    if not isinstance(article_title, bytes):
        article_title = article_title.encode("utf8")

    params = {
        "format": "json",
        "action": "query",
        "prop": "revisions",
        "rvprop": "timestamp|user|comment|content",
        "titles": article_title,
    }
    # urlencode escapes every value (including the "|" separators in
    # rvprop), not just the title.
    url = "http://en.wikipedia.org/w/api.php?%s" % urlencode(params)

    # Make sure the HTTP response is closed even if JSON decoding fails.
    with closing(urlopen(url)) as fp:
        jsondata = json.load(fp)

    # "pages" is keyed by values we do not know in advance, so take the
    # first entry; list() keeps this working where values() is a view.
    page = list(jsondata['query']['pages'].values())[0]
    return page['revisions'][-1]['*']
def prettyprint(cell, depth=0):
    """Print *cell* and, recursively, all of its descendants as an outline.

    Each node goes on its own line as ``depth`` spaces, one separating
    space, then the node's string form. Children are printed with
    ``depth + 2``.

    Args:
        cell: A node exposing an iterable ``children`` attribute.
        depth: Number of leading spaces for this node's line.
    """
    # A single pre-formatted argument produces the same text whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("%s %s" % (' ' * depth, cell))
    for child in cell.children:
        prettyprint(child, depth + 2)
def cellvalue(cell):
    """Reduce a parsed table cell to a single value.

    If the cell contains at least one ``parser.ArticleLink``, the first
    one found is used in place of the cell. From there the function
    follows first children until it reaches a ``parser.Text`` node or a
    node without children, takes that node's ``asText()`` output and
    strips surrounding whitespace.

    Args:
        cell: An mwlib parse-tree node providing ``find``, ``children``
            and ``asText`` (assumed from the calls made here).

    Returns:
        A ``float`` when the stripped text parses as one, otherwise the
        stripped text itself.
    """
    from mwlib import parser

    articlelink = cell.find(parser.ArticleLink)
    if articlelink:
        cell = articlelink[0]

    # Descend along first children until a Text node or a leaf is reached.
    while not isinstance(cell, parser.Text):
        if cell.children:
            cell = cell.children[0]
        else:
            break

    value = cell.asText().strip()
    try:
        value = float(value)
    except ValueError:
        # Not numeric: keep the text. Only ValueError is caught so that
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        pass
    return value
import sys
import csv
import codecs
import cStringIO


class UnicodeWriter:
    """
    CSV writer that sends rows to the stream "f", encoded in the
    requested encoding. Every item of a row must provide ``encode``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
        # The csv module renders each row into this in-memory buffer first
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)

    def writerow(self, row):
        # Hand the csv module UTF-8 byte strings ...
        utf8_cells = [cell.encode("utf-8") for cell in row]
        self.writer.writerow(utf8_cells)
        # ... read the rendered line back as text ...
        rendered = self.queue.getvalue().decode("utf-8")
        # ... and pass it on to the real stream in the target encoding
        self.stream.write(self.encoder.encode(rendered))
        # reset the buffer for the next row
        self.queue.truncate(0)

    def writerows(self, rows):
        for single_row in rows:
            self.writerow(single_row)
if __name__ == "__main__":
    import sys
    from mwlib.refine import compat as mw
    from mwlib import parser

    title = "Opinion polling for the New Zealand general election, 2005"
    # Assigned but not referenced anywhere below.
    pollresult_section_title = 'Individual polls'
    article = get_mediawiki_source(title)

    # Only the first table found in the parsed article is used.
    first_table = mw.parse_txt(article).find(parser.Table)[0]

    nicerows = []
    for table_row in first_table.find(parser.Row):
        nicerows.append(
            [cellvalue(cell) for cell in table_row.find(parser.Cell)]
        )

    # Rows with two or fewer cells are treated as non-data rows and dropped.
    finalrows = [values for values in nicerows if len(values) > 2]

    # Write the remaining rows to stdout as CSV, one text item per cell.
    writer = UnicodeWriter(sys.stdout)
    writer.writerows([unicode(item) for item in values] for values in finalrows)
Add Comment
Please, Sign In to add comment