View difference between Paste ID: <a href="/hwWszcQU">hwWszcQU</a> and <a href="/5j0R321Y">5j0R321Y</a>

#!/usr/bin/python
1		#!/usr/bin/python
2
3		from bz2 import BZ2File
4		from glob import glob
5		from contextlib import closing
6		from pprint import pprint
7		import re
8		import codecs
9		import sys
10		import operator
11
12		file = glob('*.bz2')[0]
13
14		blackchars = ['{','<','|','}']
15	-	spacestrip = re.compile(r'^\s+')
15	+	spacestrip = re.compile(r'(?:^\s+|\s+\n?$|\n$)')
16		strip = re.compile(ur'(?:&.?;|<.?>|[^\w\s]|\d)',re.UNICODE)
17	-	tagstrip =re.compile(r'<.*?>')
17	+
18	-	puncstrip = re.compile(ur'[^\w\s]',re.UNICODE)
18	+
19	-	digstrip = re.compile(ur'\d',re.UNICODE)
19	+
20		line = re.sub(spacestrip,'',line.decode('utf-8'))
21		if line.strip() == '' or line[0] in blackchars:
22		continue
23	-	line = line.strip().decode('utf-8')
23	+
24		line = re.sub(strip,'',line)
25		for word in line.split(' '):
26		if word not in data:
27		data[word] = 0
28		data[word] += 1
29	-	if word.strip():
29	+
30	-	if word not in data:
30	+
31	-	data[word] = 0
31	+
32	-	data[word] += 1
32	+
33		data = sorted(data.iteritems(), key=operator.itemgetter(1))
34		for i in range(1,100):
35		print str(i)+": "+unicode(data[-i][0])+" - "+str(data[-i][1])