Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Find the most frequent words containing non-ASCII
- characters, for each language.
- Used to test for decoding/encoding errors, where
- Unicode characters are translated into ASCII, e.g.:
- "licenca usuario" instead of "licença usuário".
- """
- import os
- from glob import glob
- from unidecode import unidecode
- import collections
- import itertools
- from pprint import pprint
- from nltk import tokenize
- from util.common import get_file_content, json_to_dict, parse
# Project configuration loaded once at import time:
#   mappings — presumably {remote_prefix: local_prefix} path pairs
#              (see localize()); TODO confirm against mappings.json.
#   locales  — per-language locale variable values used by expand().
mappings = json_to_dict('../local/mappings.json')
locales = json_to_dict('../cfg/locales.json')
def localize(path, mapping=None):
    """Turn a remote path into its local equivalent.

    Args:
        path: Path string that may contain a remote prefix.
        mapping: Optional {remote: local} replacement dict; defaults to
            the module-level ``mappings`` loaded from mappings.json.

    Returns:
        The path with every matching remote prefix replaced.
    """
    if mapping is None:
        mapping = mappings
    # Iterate key/value pairs explicitly: bare iteration over a dict
    # yields only the keys, so the original `for remote, local in mappings`
    # tried to unpack each key string instead of a (remote, local) pair.
    for remote, local in mapping.items():
        if remote in path:
            path = path.replace(remote, local)
    return path
def expand(path, lang, locale_vars=None):
    """Substitute ``%VAR%`` locale placeholders with values for *lang*.

    Args:
        path: Path template containing ``%name%`` placeholders.
        lang: Language code selecting the variable set.
        locale_vars: Optional {lang: {name: value}} dict; defaults to
            the module-level ``locales`` config.

    Returns:
        The path with all known placeholders substituted.
    """
    if locale_vars is None:
        locale_vars = locales
    # .items() instead of the Python-2-only .iteritems(): identical
    # iteration on Py2, and the function keeps working on Py3.
    for var_name, var_val in locale_vars[lang].items():
        path = path.replace('%' + var_name + '%', var_val)
    return path
def get_words(path):
    """Return the word/punctuation tokens parsed from the file at *path*.

    Reads the file, parses it into strings, joins them with newlines and
    splits the result using NLTK's WordPunctTokenizer.
    """
    raw = get_file_content(path, None)
    parsed = parse(path, raw)
    tokenizer = tokenize.regexp.WordPunctTokenizer()
    return tokenizer.tokenize('\n'.join(parsed))
def has_diacritic(word):
    """True when ASCII-folding *word* changes it, i.e. it has non-ASCII chars."""
    folded = unidecode(word)
    return folded != word
def has_letter(word):
    """True when *word* contains at least one cased letter."""
    # A string with no cased characters is invariant under swapcase().
    return word != word.swapcase()
def get_paths():
    """Parse autotests config files; collect (lang, path) for Legal scope.

    Returns:
        List of (lang, path) tuples whose expanded, localized path
        exists on the local filesystem.
    """
    cfgs = (json_to_dict(x)
            for x in glob('../cfg/project__*.json'))
    pairs = ((lang, expand(localize(path), lang))
             for cfg in cfgs
             # Guard BEFORE indexing: the original filtered on
             # `'legal' in cfg` only after `for path in cfg['legal']`,
             # so configs without a legal scope raised KeyError.
             if cfg.get('legal')
             for path in cfg['legal']
             for lang in cfg['languages'])
    # List comprehension instead of filter() with a Python-2-only
    # tuple-unpacking lambda; still returns a list as Py2 filter() did.
    return [(lang, path) for lang, path in pairs
            if os.path.isfile(path)]
def get_words_by_lang(filenum=None):
    """Parse legal files and group diacritic-bearing words by language.

    Args:
        filenum: Optional cap on how many files to process (use 10 for
            quick test runs); None processes everything.

    Returns:
        defaultdict mapping lang -> list of words that contain both a
        cased letter and a diacritic.
    """
    # NOTE(review): 'ko-KO' looks like a typo for 'ko-KR' — confirm
    # against the project's language codes before changing.
    skipped = ('zh-CN', 'zh-TW', 'ja-JP', 'ko-KO', 'th-TH',
               'ru-RU', 'en-US', 'ru-RU-Dev', 'ar-AE', 'id-ID')
    candidates = ((lang, path) for lang, path in get_paths()
                  if lang not in skipped)
    words_by_lang = collections.defaultdict(list)
    # islice on the (lang, path) stream caps the number of files read,
    # exactly as the original capped its per-file generator stream.
    for lang, path in itertools.islice(candidates, filenum):
        for word in get_words(path):
            if has_diacritic(word) and has_letter(word):
                words_by_lang[lang].append(word)
    return words_by_lang
def top_words(words, num=10):
    """Return up to *num* of the most frequent words from *words*.

    Args:
        words: Iterable of words (fully consumed).
        num: Maximum number of words to return.

    Returns:
        Generator over the top words, most frequent first.
    """
    # Bug fix: the original did `_count[word] = idx` with the enumerate
    # index, which records each word's LAST POSITION in the stream, not
    # its frequency — the "top" words were just the ones seen latest.
    # Counter counts occurrences and most_common() sorts descending.
    counts = collections.Counter(words)
    return (word for word, _ in counts.most_common(num))
def main():
    """Print the most frequent diacritic-bearing words per language.

    Words are ASCII-folded with unidecode before printing so the report
    itself is safe to display on ASCII-only terminals.
    """
    words_by_lang = get_words_by_lang()  # pass 10 for quick tests
    # .items() instead of the Python-2-only .iteritems(): same
    # iteration on Py2, and the script keeps working on Py3.
    top = {lang: [unidecode(word) for word in top_words(group)]
           for lang, group in words_by_lang.items()}
    pprint(top, indent=4)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement