View difference between Paste ID: hwWszcQU and 5j0R321Y
SHOW: | | - or go back to the newest paste.
1
#!/usr/bin/python
2
3
from bz2 import BZ2File
4
from glob import glob
5
from contextlib import closing
6
from pprint import pprint
7
import re
8
import codecs
9
import sys 
10
import operator
11
12
# NOTE(review): the bare numbers and "N-" / "N+" lines in this section appear
# to be line markers left over from a paste diff view; they are not program code.
# Input archive: the first path returned by glob for '*.bz2' in the current
# directory. Raises IndexError when nothing matches. The name also shadows the
# Python 2 builtin `file`.
file = glob('*.bz2')[0]
13
14
# A line whose first character is one of these is skipped by the line filter
# further down (`line[0] in blackchars`).
blackchars = ['{','<','|','}']
15-
# Diff side "15-": matches leading whitespace only.
spacestrip = re.compile(r'^\s+')
15+
# Diff side "15+": matches leading whitespace, trailing whitespace (with an
# optional final newline), or a lone trailing newline.
spacestrip = re.compile(r'(?:^\s+|\s+\n?$|\n$)')
16
# Combined cleaner: "&...;" runs (non-greedy), "<...>" runs (non-greedy), any
# character that is neither a word character nor whitespace, and digits.
# re.UNICODE makes \w, \s and \d follow Unicode character properties.
strip = re.compile(ur'(?:&.*?;|<.*?>|[^\w\s]|\d)',re.UNICODE)
17-
# Diff side "17-": matches "<...>" runs (non-greedy).
tagstrip =re.compile(r'<.*?>')
17+
18-
# Diff side "18-": matches any character that is neither word nor whitespace.
puncstrip = re.compile(ur'[^\w\s]',re.UNICODE)
18+
19-
# Diff side "19-": matches digits (Unicode-aware).
digstrip = re.compile(ur'\d',re.UNICODE)
19+
20
		# NOTE(review): the statements enclosing this body (two indent levels up)
		# are not visible in this view -- presumably a loop over the lines of the
		# .bz2 file; confirm against the full script. The "N-" / "N+" markers
		# appear to be diff-view residue separating the two paste versions.
		# Decode the raw line as UTF-8 and remove everything `spacestrip` matches.
		line = re.sub(spacestrip,'',line.decode('utf-8'))
21
		# Skip lines that are blank, or whose first character is in `blackchars`.
		if line.strip() == '' or line[0] in blackchars:
22
			continue
23-
		# Diff side "23-": trims surrounding whitespace, then decodes as UTF-8.
		line = line.strip().decode('utf-8')
23+
24
			# Remove everything the combined `strip` pattern matches.
			line = re.sub(strip,'',line)
25
			# Tally each token in `data`. Splitting on a single space yields empty
			# strings for consecutive spaces, and this variant counts those too
			# (under the key '').
			for word in line.split(' '):
26
				if word not in data:
27
					data[word] = 0
28
				data[word] += 1
29-
				# Diff side "29-".."32-": same tally, but only for tokens that are
				# non-empty after stripping whitespace.
				if word.strip():
29+
30-
					if word not in data:
30+
31-
						data[word] = 0
31+
32-
					data[word] += 1
32+
33
data = sorted(data.iteritems(), key=operator.itemgetter(1))
34
for i in range(1,100):
35
	print str(i)+": "+unicode(data[-i][0])+" - "+str(data[-i][1])