Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import sys
import unicodedata
from codecs import open
from functools import cmp_to_key
from operator import itemgetter
# unicodedata.normalize ignores ligature characters, so the plain letters are
# recovered from each character's Unicode name instead.
def decode_str(text):
    """Reduce *text* to lowercase letter names taken from Unicode character names.

    For every character, the word that follows ``LETTER`` or ``LIGATURE`` in
    its Unicode name is kept, lowercased (``LATIN SMALL LETTER E WITH ACUTE``
    gives ``e``, ``LATIN SMALL LIGATURE FI`` gives ``fi``).  Characters whose
    name contains neither marker (digits, punctuation, whitespace) and
    characters that have no Unicode name at all are replaced by one space.

    :param text: the unicode string to convert.
    :returns: the converted string.
    """
    descriptors = ('LETTER', 'LIGATURE')
    pieces = []  # joined once at the end instead of repeated string +=
    for c in text:
        try:
            desc = unicodedata.name(c)
        except ValueError:
            # The character has no name (e.g. control characters).
            pieces.append(' ')
            continue
        char = None
        for d in descriptors:
            index = desc.find(d)
            if index >= 0:
                # Skip the marker and the space after it, then keep
                # everything up to the next space (or the end of the name).
                # No break: a later marker overrides an earlier one.
                tail = desc[index + len(d) + 1:]
                char = tail.split(' ', 1)[0]
        pieces.append(' ' if char is None else char.lower())
    return ''.join(pieces)
def sort_dict(x, y):
    """Compare two ``(word, positions)`` pairs, cmp-style.

    Pairs are ordered by the number of positions first; pairs with the same
    number of positions are ordered by their first position.

    :param x: first ``(word, positions)`` pair.
    :param y: second ``(word, positions)`` pair.
    :returns: a negative number if *x* sorts before *y*, a positive number if
        it sorts after, and 0 if they compare equal.
    """
    positions_x, positions_y = x[1], y[1]
    if len(positions_x) != len(positions_y):  # Sort by number of positions
        return len(positions_x) - len(positions_y)
    if not positions_x:
        # Both lists are empty: there is no first position to compare.
        return 0
    # Same number of positions: sort by the first position
    return positions_x[0] - positions_y[0]
def find_positions(filename):
    """Print each word of a UTF-8 file together with the positions it occurs at.

    The file is read whole, converted with ``decode_str`` and split into runs
    of word characters.  A position is the 0-based index of a word in that
    sequence of words.  One line per distinct word is printed, ordered by
    ``sort_dict``: fewest occurrences first, then by first position.

    :param filename: path of the UTF-8 encoded text file to read.
    """
    with open(filename, encoding='utf8') as f:
        data = f.read()
    data = decode_str(data)
    words = {}
    for i, w in enumerate(re.findall(r'\w+', data)):
        words.setdefault(w, []).append(i)
    # items() + cmp_to_key replace the Python-2-only iteritems() / cmp=
    # and give the same ordering on Python 2.7 and Python 3.
    for w, pos in sorted(words.items(), key=cmp_to_key(sort_dict)):
        # A single parenthesised argument prints identically on 2 and 3.
        print('- %s: %s' % (w, ', '.join(str(p) for p in pos)))
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the file to index.
    if len(sys.argv) == 2:
        find_positions(sys.argv[1])
    else:
        # A single parenthesised argument is valid as a Python 2 print
        # statement and as a Python 3 print() call, with identical output.
        print('Usage: %s filename' % sys.argv[0])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement