Advertisement
Guest User

Untitled

a guest
Jun 13th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.44 KB | None | 0 0
  1. import os
  2. import sys
  3. import git
  4. import math
  5. import time
  6. import shutil
  7. import pathlib
  8. import datetime
  9. import tabulate
  10. import subprocess
  11. import pandas as pd
  12. from tqdm import tqdm
  13. from sklearn.metrics import precision_score,recall_score
  14.  
  15. BASE_DIR = '.tmp_{}'.format(int(time.time()))
  16.  
  17. def get_apertium_analyses(token, weighted_bin):
  18. #TODO: Don't use shell=True
  19. analyses = subprocess.run(
  20. ['echo {} | ../lttoolbox/lt-proc {} -N 1'.format(repr(token),
  21. weighted_bin)],
  22. stdout=subprocess.PIPE,
  23. shell=True).stdout.decode()
  24.  
  25. return analyses[analyses.find('/') + 1: analyses.find('$')].split('/')
  26.  
  27. def compute_accuracy(testing_corpus, weighted_bin):
  28. splitted_corpus = [[sample[1:sample.find('/')],
  29. sample[1 + sample.find('/'): -1]]
  30. for sample in testing_corpus]
  31.  
  32. tokens = [sample[0] for sample in splitted_corpus]
  33. tags = [sample[1] for sample in splitted_corpus]
  34.  
  35. testing_df = pd.DataFrame({'token': tokens, 'tag': tags})
  36. testing_df.to_csv('{}/test.csv'.format(BASE_DIR), index=False)
  37. testing_df = testing_df[~(testing_df['tag'].str.contains(r'[0-9 ,]*<num>\$') |
  38. testing_df['tag'].str.endswith('<sent>$') |
  39. testing_df['tag'].str.contains(r'[( ) ;]') |
  40. testing_df['tag'].str.startswith('*'))].copy()
  41. tqdm.pandas()
  42.  
  43. testing_df['predicted_tag'] = testing_df['token'].progress_apply(
  44. lambda token: get_apertium_analyses(token, weighted_bin)[0])
  45.  
  46. testing_df.to_csv('{}/temp.csv'.format(BASE_DIR), index=False)
  47. return testing_df.dropna()
  48.  
  49. if __name__ == '__main__':
  50. BIN_FILE = sys.argv[1]
  51. TAGGED_CORPUS = sys.argv[2]
  52. LANG_REPO_LOC = sys.argv[3]
  53. SPLIT_RATIO = 0.8
  54. TRAINING_CORPUS_LOC = '{}/{}'.format(BASE_DIR, 'training_corpus.tagged')
  55. WEIGHTED_BIN_FILE = '{}/{}'.format(BASE_DIR, 'weighted.bin')
  56. os.mkdir(BASE_DIR)
  57.  
  58. with open(TAGGED_CORPUS, 'r') as f:
  59. lines = [l.strip() for l in f.readlines()]
  60. training_corpus = lines[:int(len(lines) * SPLIT_RATIO)]
  61. testing_corpus = lines[int(len(lines) * SPLIT_RATIO):]
  62.  
  63. with open(TRAINING_CORPUS_LOC, 'w') as f:
  64. for line in training_corpus:
  65. f.write(line)
  66.  
  67. subprocess.run(['./lt-weight', BIN_FILE, TRAINING_CORPUS_LOC, WEIGHTED_BIN_FILE])
  68. pred_df = compute_accuracy(testing_corpus, WEIGHTED_BIN_FILE)
  69.  
  70. prec = precision_score(y_true=pred_df['tag'],
  71. y_pred=pred_df['predicted_tag'],
  72. average='weighted',
  73. labels=pred_df['predicted_tag'].unique()
  74. )
  75.  
  76. recall = recall_score(y_true=pred_df['tag'],
  77. y_pred=pred_df['predicted_tag'],
  78. average='weighted',
  79. labels=pred_df['predicted_tag'].unique()
  80. )
  81.  
  82. repo = git.Repo(LANG_REPO_LOC)
  83. results_df = pd.DataFrame({'time': datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S %d-%m-%Y'),
  84. 'language': [pathlib.Path(LANG_REPO_LOC).stem],
  85. 'head commit in language repository': repo.head.object.hexsha,
  86. 'precision': [prec],
  87. 'recall':[recall]})
  88. print('Results for {} with a split ratio of {}-{}'.format(
  89. TAGGED_CORPUS,
  90. math.ceil(SPLIT_RATIO * 100),
  91. math.ceil((1-SPLIT_RATIO) * 100)))
  92. print(tabulate.tabulate(results_df, headers=results_df.columns, showindex=False, tablefmt='github'))
  93.  
  94. shutil.rmtree(BASE_DIR)
  95. # print(pred_df[pred_df['tag']!=pred_df['predicted_tag']])
  96. # python generate_report.py ../../apertium-eng/eng.automorf.bin ../../apertium-eng/texts/eng.tagged ../../apertium-eng/eng.automorf.bin ../../apertium-eng
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement