
# Untitled
# By: a guest on May 7th, 2012 | syntax: None | size: 0.87 KB | hits: 12 | expires: Never
#! /usr/bin/env python
#-*- coding:utf-8 -*-
import re
import urllib2
import contextlib
import collections
from pyquery import PyQuery as pq
def get_html(url, enc='utf-8'):
    """Fetch *url* and return the response body decoded with *enc*.

    The response object is closed when the ``with`` block is left,
    whether or not reading or decoding succeeded.
    """
    response = urllib2.urlopen(url)
    with contextlib.closing(response) as page:
        raw = page.read()
        return raw.decode(enc)
def extract_text(source):
    """Return the text of the ``body`` selection of the HTML in *source*.

    Every line break (CR or LF) that is followed by whitespace is replaced,
    together with that whitespace run, by a single newline.
    """
    body_text = pq(source)("body").text()
    return re.sub(r'[\r\n]\s+', '\n', body_text)
def set_textfile(text, enc='utf-8', path='output.txt'):
    """Write a word-frequency table for *text* to a file.

    The text is split on runs of whitespace, the occurrences of each word
    are counted, and one line per distinct word is written, most frequent
    first.  Each line holds the word left-aligned in a 30-character column,
    a colon, and the count right-aligned in a 3-character column.

    Args:
        text: The text to analyse.
        enc: Encoding used for the output file.
        path: Destination file; defaults to ``output.txt`` in the current
            working directory.  An existing file is overwritten.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import io

    # split() with no argument splits on every whitespace run, so newlines
    # and tabs separate words and repeated spaces do not produce empty
    # "words" (split(' ') did neither).
    counts = collections.Counter(text.split())

    # Format in text form and let the file object encode on write, so the
    # column padding is measured in characters rather than encoded bytes.
    with io.open(path, 'w', encoding=enc) as f:
        for word, count in counts.most_common():
            f.write(u"{0:<30}:{1:>3}\n".format(word, count))
if __name__ == '__main__':
    # Script entry point: download the page, reduce it to text, and hand
    # the text to set_textfile.
    url = "http://docs.python.org/library/functions.html"
    set_textfile(extract_text(get_html(url)))