Advertisement
nchen24

findMissing.py

Dec 15th, 2017
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.39 KB | None | 0 0
  1. INPUT_FILE = "input.tsv"
  2.  
  3. class LanguageInfo:
  4.     def __init__(self, col_header):
  5.         split_header = col_header.split(" ")
  6.         self.num_items = (int)(split_header[0])
  7.         self.language_name = split_header[-1]
  8.         self.source_terms = set()
  9.  
  10.  
  11. # Print missing languages by word
  12. def print_missing_languages_by_word(langs, all_source_terms):
  13.     for word in sorted(all_source_terms):
  14.         languages_missing = []
  15.         # Check if word is in each language, if not add language to
  16.         # languages_missing list
  17.         for lang in langs:
  18.             if word not in lang.source_terms:
  19.                 languages_missing.append(lang.language_name)
  20.         # If word has more than 0 languages missing it, print
  21.         if len(languages_missing) != 0:
  22.             print("Source term '{0}' missing languages: '{1}'".format(word, ", ".join(languages_missing)))
  23.  
  24.  
  25. # Print missing words by language
  26. def print_missing_words_by_language(langs, all_source_terms):
  27.     for lang in langs:
  28.         # Get difference between cumulative list and language list
  29.         missing_source_terms = all_source_terms - lang.source_terms
  30.         if len(missing_source_terms) != 0:
  31.             print("Language: {0} is missing source terms:".format(lang.language_name))
  32.             print("\t{0}".format("\n\t".join(sorted(missing_source_terms))))
  33.  
  34.  
  35. def main():
  36.     # Build list of languages and words
  37.     with open(INPUT_FILE, 'r') as f:
  38.         # Make list of languages from header
  39.         # Assumes input is in the format "#source_terms ... ... ... language name"
  40.         langs = []
  41.         first_line = f.readline()
  42.         for column_header in first_line.replace("\n", "").split("\t"):
  43.             langs.append(LanguageInfo(column_header))
  44.  
  45.         # Read through input, get cumulative list of all words
  46.         all_source_terms = set()
  47.         for line in f:
  48.             # Throw away newline character, split into columns
  49.             split_line = line.replace("\n", "").split("\t")
  50.             for i, col in enumerate(split_line):
  51.                 # Add word to cumulative list
  52.                 all_source_terms.add(col)
  53.                 # Add list to appropriate language
  54.                 langs[i].source_terms.add(col)
  55.  
  56.     print_missing_languages_by_word(langs, all_source_terms)
  57.     print("")
  58.     print_missing_words_by_language(langs, all_source_terms)
  59.  
  60.  
  61. if __name__ == "__main__":
  62.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement