Advertisement
Guest User

Untitled

a guest
Feb 17th, 2020
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.34 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Find the most frequent words containing non-ASCII
  4. characters, for each language.
  5. Used to test for decoding/encoding errors, where
  6. Unicode characters are translated into ASCII, e.g.:
  7. "licenca usuario" instead of "licença usuário".
  8. """
  9.  
  10.  
  11. import os
  12. from glob import glob
  13. from unidecode import unidecode
  14. import collections
  15. import itertools
  16. from pprint import pprint
  17.  
  18. from nltk import tokenize
  19.  
  20. from util.common import get_file_content, json_to_dict, parse
  21.  
  22.  
  23. mappings = json_to_dict('../local/mappings.json')  # iterated by localize() as (remote, local) pairs
  24. locales = json_to_dict('../cfg/locales.json')  # indexed by language in expand(); each entry maps variable name -> value
  25.  
  26.  
  27. def localize(path):
  28.     """Turn remote path to local."""
  29.     for remote, local in mappings:
  30.         if remote in path:
  31.             path = path.replace(remote, local)
  32.     return path
  33.  
  34.  
  35. def expand(path, lang):
  36.     """Substitute locale vars with actual values for the given lang."""
  37.     for var_name, var_val in locales[lang].iteritems():
  38.         path = path.replace('%' + var_name + '%', var_val)
  39.     return path
  40.  
  41.  
  42. def get_words(path):
  43.     """Return words with diacritics."""
  44.     content = get_file_content(path, None)
  45.     strings = parse(path, content)
  46.     words = tokenize.regexp.WordPunctTokenizer().tokenize(
  47.         '\n'.join(strings))
  48.     return words
  49.  
  50.  
  51. def has_diacritic(word):
  52.     return unidecode(word) != word
  53.  
  54.  
  55. def has_letter(word):
  56.     return word.swapcase() != word
  57.  
  58.  
  59. def get_paths():
  60.     """
  61.    Parse autotests config files,
  62.    return (lang, path) tuples for Legal scope.
  63.    """
  64.     cfgs = (json_to_dict(x)
  65.             for x in glob('../cfg/project__*.json'))
  66.  
  67.     _paths = ((lang, expand(localize(path), lang))
  68.               for cfg in cfgs
  69.               for path in cfg['legal']
  70.               if 'legal' in cfg and cfg['legal']
  71.               for lang in cfg['languages'])
  72.  
  73.     _paths = filter(lambda (lang, path): os.path.isfile(path),
  74.                     _paths)
  75.     return _paths
  76.  
  77.  
  78. def get_words_by_lang(filenum=None):
  79.     """
  80.    Parse legal files, return {lang: words} dict.
  81.    Use filenum=10 for quick tests.
  82.    """
  83.  
  84.     excl = ('zh-CN', 'zh-TW', 'ja-JP', 'ko-KO', 'th-TH',
  85.             'ru-RU', 'en-US', 'ru-RU-Dev', 'ar-AE', 'id-ID')
  86.  
  87.     paths = ((lang, path) for lang, path in get_paths()
  88.              if lang not in excl)
  89.  
  90.     words = ((lang,
  91.               (word
  92.                for word in get_words(path)
  93.                if has_diacritic(word) and has_letter(word)))
  94.              for lang, path in paths)
  95.  
  96.     words_by_lang = collections.defaultdict(list)
  97.     for lang, group in itertools.islice(words, filenum):
  98.         words_by_lang[lang].extend(group)
  99.  
  100.     return words_by_lang
  101.  
  102.  
  103. def top_words(words, num=10):
  104.     """
  105.    Return num of the most frequent words
  106.    from the words iterable.
  107.    """
  108.     _count = collections.defaultdict(int)
  109.     for idx, word in enumerate(words, 1):
  110.         _count[word] = idx
  111.  
  112.     _count_sorted = sorted(_count.iteritems(),
  113.                            key=lambda (k, v): v,
  114.                            reverse=True)[:num]
  115.  
  116.     return (word for word, num in _count_sorted)
  117.  
  118.  
  119. def main():
  120.     words_by_lang = get_words_by_lang()  # 10 for quick tests
  121.     top = {lang:
  122.                [unidecode(word) for word in top_words(group)]
  123.            for lang, group in words_by_lang.iteritems()}
  124.     pprint(top, indent=4)
  125.  
  126.  
  127. if __name__ == '__main__':  # run the report only when executed as a script, not on import
  128.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement