Advertisement
Guest User

Untitled

a guest
Aug 14th, 2014
499
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2.  
  3. from bz2 import BZ2File
  4. from glob import glob
  5. from contextlib import closing
  6. from pprint import pprint
  7. import re
  8. import codecs
  9. import sys
  10. import operator
  11.  
  12. file = glob('*.bz2')[0]
  13.  
  14. blackchars = ['{','<','|','}']
  15. spacestrip = re.compile(r'(?:^\s+|\s+\n?$|\n$)')
  16. strip = re.compile(ur'(?:&.*?;|<.*?>|[^\w\s]|\d)',re.UNICODE)
  17. data = {}
  18. with closing(BZ2File(file)) as f:
  19.     for line in f:
  20.         line = re.sub(spacestrip,'',line.decode('utf-8'))
  21.         if line.strip() == '' or line[0] in blackchars:
  22.             continue
  23.         else:
  24.             line = re.sub(strip,'',line)
  25.             for word in line.split(' '):
  26.                 if word not in data:
  27.                     data[word] = 0
  28.                 data[word] += 1
  29.  
  30. UTF8Writer = codecs.getwriter('utf8')
  31. sys.stdout = UTF8Writer(sys.stdout)
  32.  
  33. data = sorted(data.iteritems(), key=operator.itemgetter(1))
  34. for i in range(1,100):
  35.     print str(i)+": "+unicode(data[-i][0])+" - "+str(data[-i][1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement