Advertisement
Guest User

Untitled

a guest
Dec 16th, 2013
674
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.13 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3.  
  4.  
  5. import sys
  6. import unicodedata
  7. from collections import defaultdict
  8.  
  9. def proceed(filename):
  10. f = open(filename)
  11. string = f.read()
  12. string = string.strip()
  13. string = string.replace("-", " ")
  14. string = string.replace("’", " ")
  15. string = string.replace(":", " ")
  16. string = string.replace(",", "")
  17. string = string.replace(".", "")
  18. string = string.replace(" ", "")
  19. string = string.replace("œ", "oe")
  20. string = string.replace("!", "")
  21. string = string.replace("?", "")
  22. string = string.replace("\n", " ")
  23. string = string.replace("\r", " ")
  24. string = string.strip()
  25. string = string.decode("utf-8")
  26. string = unicodedata.normalize('NFKD', string)
  27. string = string.encode('ASCII', 'ignore')
  28. string = string.strip().lower()
  29. tabword = string.split(" ")
  30.  
  31.  
  32. dictword = defaultdict(list)
  33. position = 0
  34. for word in tabword:
  35. if word == "":
  36. continue
  37. dictword[word].append(position)
  38. position +=1
  39.  
  40. for key,value in sorted(dictword.iteritems(), key = lambda (k,v):(len(v),v)):
  41. print "%s: %s" % (key, value)
  42.  
  43.  
  44.  
  45.  
  46.  
  47. if __name__ == "__main__":
  48. proceed(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement