Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on May 7th, 2012  |  syntax: None  |  size: 0.87 KB  |  hits: 12  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #! /usr/bin/env python
  2. #-*- coding:utf-8 -*-
  3.  
  4. import re
  5. import urllib2
  6. import contextlib
  7. import collections
  8. from pyquery import PyQuery as pq
  9.  
  10.  
  11. def get_html(url, enc='utf-8'):
  12.     with contextlib.closing(urllib2.urlopen(url)) as html:
  13.         html = html.read().decode(enc)
  14.         return html
  15.  
  16. def extract_text(source):
  17.     Newlines = re.compile(r'[\r\n]\s+')
  18.     d = pq(source)
  19.     body = d("body")
  20.     text = body.text()
  21.     return Newlines.sub('\n', text)
  22.  
  23. def set_textfile(text, enc='utf-8'):
  24.     with open('output.txt', 'w') as f:
  25.         words = text.encode(enc).split(' ')
  26.         c = collections.Counter(words)
  27.         for word in c.most_common():
  28.             f.write("{0:<30}:{1:>3}\n".format(word[0], word[1]))
  29.  
  30. if __name__ == '__main__':
  31.     url = "http://docs.python.org/library/functions.html"
  32.     source = get_html(url)
  33.     text = extract_text(source)
  34.     set_textfile(text)