Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- from __future__ import with_statement
- import sys
- import re
- import unicodedata
- from collections import defaultdict
def _word_positions(text):
    """Map each normalized word of *text* to the list of its word indexes.

    Normalization: the typographic apostrophe becomes a space, the "oe"
    ligature is spelled out in both cases, accents are removed, and the
    result is lowercased.  Words are maximal runs of ``\\w`` characters and
    are numbered from 0 in order of appearance.

    Args:
        text: the decoded (unicode) text to index.

    Returns:
        A ``defaultdict(list)`` mapping word -> ascending list of indexes.
    """
    # U+2019 (right single quotation mark) glues French elisions together
    # ("l'ete"); turning it into a space makes both sides separate words.
    text = text.replace(u"\u2019", u" ")
    # Spell the ligatures out explicitly: otherwise the ASCII step below
    # would drop them entirely (the uppercase form included).
    text = text.replace(u"\u0153", u"oe").replace(u"\u0152", u"OE")
    # NFKD splits accented letters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards the marks.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii").lower()

    positions = defaultdict(list)
    for index, word in enumerate(re.findall(r"\w+", text)):
        positions[word].append(index)
    return positions


def proceed(filename):
    """Print the positions of every word found in the file *filename*.

    The file is read as UTF-8.  One line is printed per distinct word, in
    the form ``word: p1, p2, ...``, where the positions are 0-based word
    indexes.  Lines are ordered by number of occurrences, then by the
    position lists themselves.

    Args:
        filename: path of the UTF-8 encoded text file to index.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Local import keeps this block self-contained; io.open decodes while
    # reading and behaves the same on Python 2.6+ and Python 3.
    import io

    with io.open(filename, encoding="utf-8") as f:
        texte = f.read()

    dictword = _word_positions(texte)
    # Rarest words first; ties are broken by the position lists.
    ordered = sorted(dictword.items(),
                     key=lambda item: (len(item[1]), item[1]))
    for key, value in ordered:
        # Single-argument print(...) is valid in both Python 2 and 3.
        print("%s: %s" % (key, ", ".join(str(x) for x in value)))
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # text file to index.  Exit with a usage message instead of an IndexError
    # traceback when it is missing; extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])
    proceed(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement