Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python
"""Print the most frequent words found in a bzip2-compressed UTF-8 text file.

The first ``*.bz2`` archive (in sorted name order) in the current directory
is read line by line.  Lines that are blank, or that begin with one of the
characters in ``BLACKCHARS``, are skipped.  Markup, punctuation and digits
are removed from the remaining lines and the surviving whitespace-separated
tokens are counted.  The ``TOP_N`` most frequent tokens are written to
standard output as UTF-8, one ``rank: word - count`` line each.

The code avoids version-specific syntax so that it runs on both Python 2.7
and Python 3.
"""
from bz2 import BZ2File
from collections import Counter
from contextlib import closing
from glob import glob
from pprint import pprint
import codecs
import operator
import re
import sys

# A line whose first non-blank character is one of these is skipped entirely.
BLACKCHARS = (u'{', u'<', u'|', u'}')

# Removes, in this order of preference at each position: entity-like spans
# (``&...;``), tag-like spans (``<...>``), any single character that is
# neither a word character nor whitespace, and any digit.
STRIP = re.compile(r'(?:&.*?;|<.*?>|[^\w\s]|\d)', re.UNICODE)

# Number of ranked entries printed (ranks 1 through 99).
TOP_N = 99


def count_words(lines):
    """Count word occurrences in an iterable of UTF-8 encoded byte lines.

    Args:
        lines: Iterable of byte strings, one per input line (for example an
            open ``BZ2File``).

    Returns:
        A ``collections.Counter`` mapping each word to its occurrence count.
        Words are case-sensitive.

    Raises:
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    counts = Counter()
    for raw in lines:
        text = raw.decode('utf-8').strip()
        if not text or text.startswith(BLACKCHARS):
            continue
        # split() with no argument collapses runs of whitespace, so the gaps
        # left behind by removed punctuation/digits never produce an empty
        # "word", and tab-separated words are split as well.
        counts.update(STRIP.sub(u'', text).split())
    return counts


def format_report(counts, limit=TOP_N):
    """Format the most frequent words as ``rank: word - count`` lines.

    Args:
        counts: A ``collections.Counter`` of word frequencies.
        limit: Maximum number of entries to include.

    Returns:
        A list of text lines (without trailing newlines), most frequent word
        first.  It holds fewer than ``limit`` lines when ``counts`` has fewer
        distinct words, and is empty when ``counts`` is empty.
    """
    return [
        u"{0}: {1} - {2}".format(rank, word, total)
        for rank, (word, total) in enumerate(counts.most_common(limit), 1)
    ]


def main():
    """Count the words of the first ``*.bz2`` archive and print the ranking."""
    # Sort so the chosen archive does not depend on directory listing order.
    archives = sorted(glob('*.bz2'))
    if not archives:
        sys.exit("no *.bz2 file found in the current directory")

    with closing(BZ2File(archives[0])) as archive:
        counts = count_words(archive)

    # Encode explicitly as UTF-8 regardless of the terminal's locale.  On
    # Python 3 the bytes go to the underlying binary buffer; Python 2's
    # sys.stdout has no ``buffer`` attribute and accepts bytes directly.
    out = codecs.getwriter('utf8')(getattr(sys.stdout, 'buffer', sys.stdout))
    for line in format_report(counts):
        out.write(line + u"\n")
    out.flush()


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement