SHARE
TWEET

Untitled

a guest Jun 13th, 2019 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import os
  2. import sys
  3. import git
  4. import math
  5. import time
  6. import shutil
  7. import pathlib
  8. import datetime
  9. import tabulate
  10. import subprocess
  11. import pandas as pd
  12. from tqdm import tqdm
  13. from sklearn.metrics import precision_score,recall_score
  14.  
  15. BASE_DIR = '.tmp_{}'.format(int(time.time()))
  16.  
  17. def get_apertium_analyses(token, weighted_bin):
  18.     #TODO: Don't use shell=True
  19.     analyses = subprocess.run(
  20.         ['echo {} | ../lttoolbox/lt-proc {} -N 1'.format(repr(token),
  21.                                                 weighted_bin)],
  22.         stdout=subprocess.PIPE,
  23.         shell=True).stdout.decode()
  24.  
  25.     return analyses[analyses.find('/') + 1: analyses.find('$')].split('/')
  26.  
  27. def compute_accuracy(testing_corpus, weighted_bin):
  28.     splitted_corpus = [[sample[1:sample.find('/')],
  29.                         sample[1 + sample.find('/'): -1]]
  30.                         for sample in testing_corpus]
  31.  
  32.     tokens = [sample[0] for sample in splitted_corpus]
  33.     tags = [sample[1] for sample in splitted_corpus]
  34.  
  35.     testing_df = pd.DataFrame({'token': tokens, 'tag': tags})
  36.     testing_df.to_csv('{}/test.csv'.format(BASE_DIR), index=False)
  37.     testing_df = testing_df[~(testing_df['tag'].str.contains(r'[0-9 ,]*<num>\$') |
  38.                              testing_df['tag'].str.endswith('<sent>$') |
  39.                              testing_df['tag'].str.contains(r'[( ) ;]') |
  40.                              testing_df['tag'].str.startswith('*'))].copy()
  41.     tqdm.pandas()
  42.  
  43.     testing_df['predicted_tag'] = testing_df['token'].progress_apply(
  44.         lambda token: get_apertium_analyses(token, weighted_bin)[0])
  45.  
  46.     testing_df.to_csv('{}/temp.csv'.format(BASE_DIR), index=False)
  47.     return testing_df.dropna()
  48.  
  49. if __name__ == '__main__':
  50.     BIN_FILE = sys.argv[1]
  51.     TAGGED_CORPUS = sys.argv[2]
  52.     LANG_REPO_LOC = sys.argv[3]
  53.     SPLIT_RATIO = 0.8
  54.     TRAINING_CORPUS_LOC = '{}/{}'.format(BASE_DIR, 'training_corpus.tagged')
  55.     WEIGHTED_BIN_FILE = '{}/{}'.format(BASE_DIR, 'weighted.bin')
  56.     os.mkdir(BASE_DIR)
  57.  
  58.     with open(TAGGED_CORPUS, 'r') as f:
  59.         lines = [l.strip() for l in f.readlines()]
  60.     training_corpus = lines[:int(len(lines) * SPLIT_RATIO)]
  61.     testing_corpus = lines[int(len(lines) * SPLIT_RATIO):]
  62.  
  63.     with open(TRAINING_CORPUS_LOC, 'w') as f:
  64.         for line in training_corpus:
  65.             f.write(line)
  66.    
  67.     subprocess.run(['./lt-weight', BIN_FILE, TRAINING_CORPUS_LOC, WEIGHTED_BIN_FILE])
  68.     pred_df = compute_accuracy(testing_corpus, WEIGHTED_BIN_FILE)
  69.    
  70.     prec = precision_score(y_true=pred_df['tag'],
  71.                           y_pred=pred_df['predicted_tag'],
  72.                           average='weighted',
  73.                           labels=pred_df['predicted_tag'].unique()
  74.                           )
  75.  
  76.     recall = recall_score(y_true=pred_df['tag'],
  77.                        y_pred=pred_df['predicted_tag'],
  78.                        average='weighted',
  79.                        labels=pred_df['predicted_tag'].unique()
  80.                        )
  81.  
  82.     repo = git.Repo(LANG_REPO_LOC)
  83.     results_df = pd.DataFrame({'time': datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S %d-%m-%Y'),
  84.                                'language': [pathlib.Path(LANG_REPO_LOC).stem],
  85.                                'head commit in language repository': repo.head.object.hexsha,
  86.                                'precision': [prec],
  87.                                'recall':[recall]})
  88.     print('Results for {} with a split ratio of {}-{}'.format(
  89.         TAGGED_CORPUS,
  90.         math.ceil(SPLIT_RATIO * 100),
  91.         math.ceil((1-SPLIT_RATIO) * 100)))
  92.     print(tabulate.tabulate(results_df, headers=results_df.columns, showindex=False, tablefmt='github'))
  93.  
  94.     shutil.rmtree(BASE_DIR)
  95.     # print(pred_df[pred_df['tag']!=pred_df['predicted_tag']])
  96. # python generate_report.py ../../apertium-eng/eng.automorf.bin ../../apertium-eng/texts/eng.tagged ../../apertium-eng/eng.automorf.bin ../../apertium-eng
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top