Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Find the most frequent words containing non-ASCII
- characters, for each language.
- Used to test for decoding/encoding errors, where
- Unicode characters are translated into ASCII, e.g.:
- "licenca usuario" instead of "licença usuário".
- """
- import os
- from glob import glob
- from unidecode import unidecode
- import collections
- import itertools
- from pprint import pprint
- from nltk import tokenize
- from util.common import get_file_content, json_to_dict, parse
# Project configuration loaded once at import time:
#   mappings — presumably {remote_prefix: local_prefix} path pairs
#              (see localize()); TODO confirm against mappings.json.
#   locales  — per-language locale variable values used by expand().
mappings = json_to_dict('../local/mappings.json')
locales = json_to_dict('../cfg/locales.json')
def localize(path, mapping=None):
    """Turn a remote path into its local equivalent.

    Args:
        path: Path string that may contain a remote prefix.
        mapping: Optional {remote: local} replacement dict; defaults to
            the module-level ``mappings`` loaded from mappings.json.

    Returns:
        The path with every matching remote prefix replaced.
    """
    if mapping is None:
        mapping = mappings
    # Iterate key/value pairs explicitly: bare iteration over a dict
    # yields only the keys, so the original `for remote, local in mappings`
    # tried to unpack each key string instead of a (remote, local) pair.
    for remote, local in mapping.items():
        if remote in path:
            path = path.replace(remote, local)
    return path
def expand(path, lang, locale_vars=None):
    """Substitute ``%VAR%`` locale placeholders with values for *lang*.

    Args:
        path: Path template containing ``%name%`` placeholders.
        lang: Language code selecting the variable set.
        locale_vars: Optional {lang: {name: value}} dict; defaults to
            the module-level ``locales`` config.

    Returns:
        The path with all known placeholders substituted.
    """
    if locale_vars is None:
        locale_vars = locales
    # .items() instead of the Python-2-only .iteritems(): identical
    # iteration on Py2, and the function keeps working on Py3.
    for var_name, var_val in locale_vars[lang].items():
        path = path.replace('%' + var_name + '%', var_val)
    return path
def get_words(path):
    """Return the word/punctuation tokens parsed from the file at *path*.

    Reads the file, parses it into strings, joins them with newlines and
    splits the result using NLTK's WordPunctTokenizer.
    """
    raw = get_file_content(path, None)
    parsed = parse(path, raw)
    tokenizer = tokenize.regexp.WordPunctTokenizer()
    return tokenizer.tokenize('\n'.join(parsed))
def has_diacritic(word):
    """True when ASCII-folding *word* changes it, i.e. it has non-ASCII chars."""
    folded = unidecode(word)
    return folded != word
def has_letter(word):
    """True when *word* contains at least one cased letter."""
    # A string with no cased characters is invariant under swapcase().
    return word != word.swapcase()
def get_paths():
    """Parse autotests config files; collect (lang, path) for Legal scope.

    Returns:
        List of (lang, path) tuples whose expanded, localized path
        exists on the local filesystem.
    """
    cfgs = (json_to_dict(x)
            for x in glob('../cfg/project__*.json'))
    pairs = ((lang, expand(localize(path), lang))
             for cfg in cfgs
             # Guard BEFORE indexing: the original filtered on
             # `'legal' in cfg` only after `for path in cfg['legal']`,
             # so configs without a legal scope raised KeyError.
             if cfg.get('legal')
             for path in cfg['legal']
             for lang in cfg['languages'])
    # List comprehension instead of filter() with a Python-2-only
    # tuple-unpacking lambda; still returns a list as Py2 filter() did.
    return [(lang, path) for lang, path in pairs
            if os.path.isfile(path)]
def get_words_by_lang(filenum=None):
    """Parse legal files and group diacritic-bearing words by language.

    Args:
        filenum: Optional cap on how many files to process (use 10 for
            quick test runs); None processes everything.

    Returns:
        defaultdict mapping lang -> list of words that contain both a
        cased letter and a diacritic.
    """
    # NOTE(review): 'ko-KO' looks like a typo for 'ko-KR' — confirm
    # against the project's language codes before changing.
    skipped = ('zh-CN', 'zh-TW', 'ja-JP', 'ko-KO', 'th-TH',
               'ru-RU', 'en-US', 'ru-RU-Dev', 'ar-AE', 'id-ID')
    candidates = ((lang, path) for lang, path in get_paths()
                  if lang not in skipped)
    words_by_lang = collections.defaultdict(list)
    # islice on the (lang, path) stream caps the number of files read,
    # exactly as the original capped its per-file generator stream.
    for lang, path in itertools.islice(candidates, filenum):
        for word in get_words(path):
            if has_diacritic(word) and has_letter(word):
                words_by_lang[lang].append(word)
    return words_by_lang
def top_words(words, num=10):
    """Return up to *num* of the most frequent words from *words*.

    Args:
        words: Iterable of words (fully consumed).
        num: Maximum number of words to return.

    Returns:
        Generator over the top words, most frequent first.
    """
    # Bug fix: the original did `_count[word] = idx` with the enumerate
    # index, which records each word's LAST POSITION in the stream, not
    # its frequency — the "top" words were just the ones seen latest.
    # Counter counts occurrences and most_common() sorts descending.
    counts = collections.Counter(words)
    return (word for word, _ in counts.most_common(num))
def main():
    """Print the most frequent diacritic-bearing words per language.

    Words are ASCII-folded with unidecode before printing so the report
    itself is safe to display on ASCII-only terminals.
    """
    words_by_lang = get_words_by_lang()  # pass 10 for quick tests
    # .items() instead of the Python-2-only .iteritems(): same
    # iteration on Py2, and the script keeps working on Py3.
    top = {lang: [unidecode(word) for word in top_words(group)]
           for lang, group in words_by_lang.items()}
    pprint(top, indent=4)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement