#!/usr/bin/python
"""Print the most frequent words found in a bz2-compressed text file.

The first ``*.bz2`` file in the current directory is read line by line.
Each line is decoded as UTF-8, cleaned of entities, tags, punctuation and
digits, and split into words whose occurrences are counted.

NOTE(review): this file was recovered from a paste-site diff view. Rows
marked ``-`` (the superseded ``tagstrip`` / ``puncstrip`` / ``digstrip``
patterns, the second ``decode`` call and the duplicated counting code) were
dropped and rows marked ``+`` were kept. The statements that opened the
archive and started the per-line loop were not present in the recovered
text; see the note in ``main``.
"""

from bz2 import BZ2File
from glob import glob
from contextlib import closing
from collections import Counter
from pprint import pprint
import re
import codecs
import sys
import operator

# A line whose first character (after whitespace trimming) is one of these
# is skipped entirely.
blackchars = ['{', '<', '|', '}']

# Leading whitespace, trailing whitespace and the final newline.
spacestrip = re.compile(r'(?:^\s+|\s+\n?$|\n$)')

# Entities (&...;), tags (<...>), punctuation and digits. The pattern text is
# unchanged; it is written as r'' rather than ur'' so the module also parses
# on Python 3.
strip = re.compile(r'(?:&.*?;|<.*?>|[^\w\s]|\d)', re.UNICODE)

# Number of ranks printed. The original loop was ``range(1, 100)``, which
# covers ranks 1 through 99.
TOP_WORDS = 99


def count_words(lines):
    """Return a Counter mapping each word to its number of occurrences.

    ``lines`` is an iterable of UTF-8 encoded byte strings. Lines that are
    blank, or whose first non-whitespace character is in ``blackchars``,
    contribute nothing. Words are case-sensitive.

    Raises UnicodeDecodeError if a line is not valid UTF-8.
    """
    counts = Counter()
    for raw in lines:
        line = spacestrip.sub('', raw.decode('utf-8'))
        if not line.strip() or line[0] in blackchars:
            continue
        # split() without an argument drops the empty strings that
        # split(' ') produced wherever removed punctuation or digits left
        # consecutive spaces; those used to be counted as the word ''.
        counts.update(strip.sub('', line).split())
    return counts


def format_ranking(counts, limit=TOP_WORDS):
    """Return ``"rank: word - count"`` strings for the most frequent words.

    At most ``limit`` entries are returned, most frequent first. Fewer are
    returned when ``counts`` holds fewer distinct words, where indexing a
    sorted list with a fixed range would raise IndexError.
    """
    return [
        u"%d: %s - %d" % (rank, word, count)
        for rank, (word, count) in enumerate(counts.most_common(limit), 1)
    ]


def main():
    """Count the words of the first ``*.bz2`` file here and print the ranking."""
    matches = glob('*.bz2')
    if not matches:
        sys.exit("no *.bz2 file found in the current directory")

    # NOTE(review): reconstructed. The recovered text imports BZ2File and
    # closing and uses `line.decode`, `continue` and `data[word]`, but the
    # lines that opened the archive and iterated over it were missing.
    # Confirm against the original script.
    with closing(BZ2File(matches[0])) as archive:
        data = count_words(archive)

    for entry in format_ranking(data):
        print(entry)


if __name__ == '__main__':
    main()