Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Path of the tab-separated input table (one column per language). It is a
# relative path, so it is resolved against the current working directory.
INPUT_FILE = "input.tsv"
class LanguageInfo:
    """One language column of the input table.

    The column header is expected to look like
    ``"<count> ... <language name>"``: its first whitespace-separated token
    is an integer count and its last token is the language name.

    Attributes:
        num_items: integer parsed from the first token of the header.
        language_name: last token of the header.
        source_terms: set of terms collected for this language; starts empty.
    """

    def __init__(self, col_header):
        """Parse *col_header* into a count and a language name.

        Args:
            col_header: header text of a single column.

        Raises:
            ValueError: if the header is blank, or if its first token is not
                an integer.
        """
        # split() with no argument ignores leading, trailing and repeated
        # whitespace, so " 12  terms  German " parses the same way as
        # "12 terms German" instead of failing on an empty first token.
        tokens = col_header.split()
        if not tokens:
            raise ValueError("empty column header: {0!r}".format(col_header))
        self.num_items = int(tokens[0])
        self.language_name = tokens[-1]
        # Each instance gets its own set; it is populated after construction.
        self.source_terms = set()
def print_missing_languages_by_word(langs, all_source_terms):
    """Print, for every source term, the languages that do not contain it.

    Terms are visited in sorted order. A term that is present in every
    language produces no output.

    Args:
        langs: sequence of objects exposing ``language_name`` (used in the
            message) and ``source_terms`` (tested with ``in``). It is
            iterated once per term, so it must be re-iterable.
        all_source_terms: iterable of every term to check.
    """
    for word in sorted(all_source_terms):
        # Languages are listed in the order they appear in ``langs``.
        languages_missing = [
            lang.language_name
            for lang in langs
            if word not in lang.source_terms
        ]
        if languages_missing:
            print("Source term '{0}' missing languages: '{1}'".format(
                word, ", ".join(languages_missing)))
def print_missing_words_by_language(langs, all_source_terms):
    """Print, for every language, the source terms it does not contain.

    Languages are visited in the order given. A language that contains every
    term produces no output; otherwise a heading line is printed, followed
    by the missing terms in sorted order, one per line, each preceded by a
    tab.

    Args:
        langs: iterable of objects exposing ``language_name`` and
            ``source_terms`` (an iterable of terms).
        all_source_terms: iterable of every term to check.
    """
    # Build the reference set once; accepting any iterable here keeps the
    # function usable with lists as well as sets.
    every_term = set(all_source_terms)
    for lang in langs:
        missing_source_terms = every_term.difference(lang.source_terms)
        if not missing_source_terms:
            continue
        print("Language: {0} is missing source terms:".format(
            lang.language_name))
        print("\t{0}".format("\n\t".join(sorted(missing_source_terms))))
def main():
    """Report source terms that are missing from one or more languages.

    Reads INPUT_FILE as a tab-separated table. The first line holds one
    header per language column, in the format
    "#source_terms ... ... ... language name". Every following line holds at
    most one term per column. Two reports are printed, separated by a blank
    line: the languages missing each term, then the terms missing from each
    language.

    Raises:
        IndexError: if a data row has a non-empty cell in a column for which
            the header line defines no language.
    """
    langs = []
    all_source_terms = set()

    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise be glued to the first header and break the
    # parsing of its integer count.
    with open(INPUT_FILE, "r", encoding="utf-8-sig") as f:
        # Header line: one LanguageInfo per column, in column order.
        first_line = f.readline()
        for column_header in first_line.rstrip("\r\n").split("\t"):
            langs.append(LanguageInfo(column_header))

        # Data rows start on line 2 of the file.
        for row_number, line in enumerate(f, start=2):
            for i, col in enumerate(line.rstrip("\r\n").split("\t")):
                # Columns of unequal length leave blank cells behind. A
                # blank is padding, not a term; recording it would make ''
                # show up as "missing" from the longer columns.
                if not col:
                    continue
                if i >= len(langs):
                    raise IndexError(
                        "row {0}: cell {1!r} is in column {2}, but the "
                        "header defines only {3} column(s)".format(
                            row_number, col, i + 1, len(langs)))
                # Record the term both globally and for its own language.
                all_source_terms.add(col)
                langs[i].source_terms.add(col)

    print_missing_languages_by_word(langs, all_source_terms)
    print("")
    print_missing_words_by_language(langs, all_source_terms)
# Run the report only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement