Advertisement
Guest User

Exercice Sam & Max

a guest
Dec 16th, 2013
270
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.57 KB | None | 0 0
import re
import sys
import unicodedata
from codecs import open
from functools import cmp_to_key
from operator import itemgetter
  6.  
  7.  
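# unicodedata.normalize() does not decompose ligatures such as U+0153 (oe)
# or U+00E6 (ae), so decode_str() reads the letters out of each character's
# Unicode name instead.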
  9. def decode_str(text):
  10.     a = ''
  11.     descriptors = ['LETTER', 'LIGATURE']
  12.     for c in text:
  13.         char = None
  14.         try:
  15.             desc = unicodedata.name(c)
  16.             for d in descriptors:
  17.                 index = desc.find(d)
  18.                 if index >= 0:
  19.                     index = index + len(d) + 1  # +1: space
  20.                     end_index = desc[index:].find(' ')  # find next space
  21.                     if end_index >= 0:
  22.                         char = desc[index:index+end_index]
  23.                     else:
  24.                         char = desc[index:]
  25.         except ValueError:
  26.             pass
  27.  
  28.         a += ' ' if char is None else char.lower()
  29.     return a
  30.  
  31.  
  32. def sort_dict(x, y):
  33.     k1, v1 = x
  34.     k2, v2 = y
  35.     if len(v1) != len(v2):  # Sort by number of pos
  36.         return len(v1) - len(v2)
  37.     return v1[0] - v2[0]  # If same num of pos: sort by first pos
  38.  
  39.  
  40. def find_positions(filename):
  41.     with open(filename, encoding='utf8') as f:
  42.         data = f.read()
  43.     data = decode_str(data)
  44.  
  45.     words = {}
  46.     for i, w in enumerate(re.findall('(\w+)', data)):
  47.         words.setdefault(w, []).append(i)
  48.  
  49.     for w, pos in sorted(words.iteritems(), cmp=sort_dict):
  50.         print '- %s: %s' % (w, ', '.join([str(p) for p in pos]))
  51.  
  52. if __name__ == '__main__':
  53.     if len(sys.argv) == 2:
  54.         find_positions(sys.argv[1])
  55.     else:
  56.         print 'Usage: %s filename' % sys.argv[0]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement