SHOW:
|
|
- or go back to the newest paste.
1 | #!/usr/bin/python | |
2 | ||
3 | from bz2 import BZ2File | |
4 | from glob import glob | |
5 | from contextlib import closing | |
6 | from pprint import pprint | |
7 | import re | |
8 | import codecs | |
9 | import sys | |
10 | import operator | |
11 | ||
12 | file = glob('*.bz2')[0] | |
13 | ||
14 | blackchars = ['{','<','|','}'] | |
15 | - | spacestrip = re.compile(r'^\s+') |
15 | + | spacestrip = re.compile(r'(?:^\s+|\s+\n?$|\n$)') |
16 | strip = re.compile(ur'(?:&.*?;|<.*?>|[^\w\s]|\d)',re.UNICODE) | |
17 | - | tagstrip =re.compile(r'<.*?>') |
17 | + | |
18 | - | puncstrip = re.compile(ur'[^\w\s]',re.UNICODE) |
18 | + | |
19 | - | digstrip = re.compile(ur'\d',re.UNICODE) |
19 | + | |
20 | line = re.sub(spacestrip,'',line.decode('utf-8')) | |
21 | if line.strip() == '' or line[0] in blackchars: | |
22 | continue | |
23 | - | line = line.strip().decode('utf-8') |
23 | + | |
24 | line = re.sub(strip,'',line) | |
25 | for word in line.split(' '): | |
26 | if word not in data: | |
27 | data[word] = 0 | |
28 | data[word] += 1 | |
29 | - | if word.strip(): |
29 | + | |
30 | - | if word not in data: |
30 | + | |
31 | - | data[word] = 0 |
31 | + | |
32 | - | data[word] += 1 |
32 | + | |
33 | data = sorted(data.iteritems(), key=operator.itemgetter(1)) | |
34 | for i in range(1,100): | |
35 | print str(i)+": "+unicode(data[-i][0])+" - "+str(data[-i][1]) |