Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import datetime
import math
import os
import pathlib
import shutil
import subprocess
import sys
import time

import git
import pandas as pd
import tabulate
from sklearn.metrics import precision_score, recall_score
from tqdm import tqdm
# Scratch directory for this run's intermediate files (the CSV dumps, the
# training corpus and the weighted binary are all written underneath it).
# The name embeds the current Unix time truncated to whole seconds, so two
# runs started within the same second from the same working directory would
# pick the same name.
BASE_DIR = '.tmp_{}'.format(int(time.time()))
def get_apertium_analyses(token, weighted_bin, lt_proc='../lttoolbox/lt-proc'):
    """Return the analyses that lt-proc prints for a single token.

    Runs ``<lt_proc> <weighted_bin> -N 1`` with the token (plus a trailing
    newline) on its standard input and parses output of the form
    ``^surface/analysis1/analysis2$``.

    Args:
        token: Surface form to analyse.
        weighted_bin: Path of the compiled transducer handed to lt-proc.
        lt_proc: Path of the lt-proc executable to run.

    Returns:
        The list of strings found between the first ``/`` of the output and
        the next ``$``, split on ``/``. The exit status of lt-proc is not
        checked; empty output yields ``['']``.
    """
    # The token goes in through stdin rather than being pasted into a shell
    # command line, so quotes, ``$`` or backticks inside a token are passed
    # through verbatim instead of being interpreted by a shell.
    completed = subprocess.run(
        [lt_proc, weighted_bin, '-N', '1'],
        input='{}\n'.format(token).encode(),
        stdout=subprocess.PIPE)
    output = completed.stdout.decode()

    start = output.find('/') + 1
    # Search for the terminator only after the first separator: a ``$`` that
    # is part of the surface form must not end the analyses early.
    end = output.find('$', start)
    if end == -1:
        # No terminator at all: keep everything except the trailing newline.
        end = len(output.rstrip('\n'))
    return output[start:end].split('/')
def compute_accuracy(testing_corpus, weighted_bin):
    """Pair every evaluated test token with the first analysis from lt-proc.

    Each sample is split at its first ``/``: the token is the text between
    the sample's first character and that ``/``, the tag is the text after
    it minus the sample's last character (samples are presumably of the form
    ``^token/tag$`` -- confirm against the corpus).

    Side effects: writes ``test.csv`` (all samples) and ``temp.csv`` (the
    evaluated samples with their predictions) under ``BASE_DIR``.

    Args:
        testing_corpus: Iterable of sample strings.
        weighted_bin: Path of the transducer passed on to
            ``get_apertium_analyses``.

    Returns:
        A DataFrame with the columns ``token``, ``tag`` and
        ``predicted_tag``, with rows containing missing values dropped.
    """
    tokens = []
    tags = []
    for sample in testing_corpus:
        separator = sample.find('/')
        tokens.append(sample[1:separator])
        tags.append(sample[separator + 1:-1])

    testing_df = pd.DataFrame({'token': tokens, 'tag': tags})
    testing_df.to_csv('{}/test.csv'.format(BASE_DIR), index=False)

    # Samples left out of the evaluation: numbers, sentence-final
    # punctuation, tags containing a space, parenthesis or semicolon, and
    # tags starting with ``*``.
    # The slice above has already removed the sample's final character, so
    # for ``^token/tag$`` samples the tag no longer ends in ``$`` and the
    # former ``<num>$`` / ``<sent>$`` patterns could never match. Both forms
    # are accepted here, anchored at the end of the tag.
    tag_column = testing_df['tag']
    excluded = (tag_column.str.contains(r'[0-9 ,]*<num>\$?$')
                | tag_column.str.contains(r'<sent>\$?$')
                | tag_column.str.contains(r'[( ) ;]')
                | tag_column.str.startswith('*'))
    testing_df = testing_df[~excluded].copy()

    # Registers ``progress_apply`` on pandas objects (progress bar).
    tqdm.pandas()
    testing_df['predicted_tag'] = testing_df['token'].progress_apply(
        lambda token: get_apertium_analyses(token, weighted_bin)[0])
    testing_df.to_csv('{}/temp.csv'.format(BASE_DIR), index=False)
    return testing_df.dropna()
if __name__ == '__main__':
    # Usage:
    #   python generate_report.py BIN_FILE TAGGED_CORPUS LANG_REPO_LOC
    # Example:
    #   python generate_report.py ../../apertium-eng/eng.automorf.bin ../../apertium-eng/texts/eng.tagged ../../apertium-eng
    if len(sys.argv) < 4:
        sys.exit('usage: {} BIN_FILE TAGGED_CORPUS LANG_REPO_LOC'.format(
            sys.argv[0]))

    BIN_FILE = sys.argv[1]
    TAGGED_CORPUS = sys.argv[2]
    LANG_REPO_LOC = sys.argv[3]
    # Fraction of the corpus (taken from the top of the file) used for
    # training; the remainder is used for testing.
    SPLIT_RATIO = 0.8
    TRAINING_CORPUS_LOC = '{}/{}'.format(BASE_DIR, 'training_corpus.tagged')
    WEIGHTED_BIN_FILE = '{}/{}'.format(BASE_DIR, 'weighted.bin')

    # Created outside the try block: if the directory already exists it
    # belongs to something else and must not be removed by the cleanup below.
    os.mkdir(BASE_DIR)
    try:
        # lt-proc output is decoded as UTF-8 in get_apertium_analyses, so the
        # corpus is read and written with the same encoding.
        with open(TAGGED_CORPUS, 'r', encoding='utf-8') as f:
            lines = [l.strip() for l in f]

        split_at = int(len(lines) * SPLIT_RATIO)
        training_corpus = lines[:split_at]
        testing_corpus = lines[split_at:]

        with open(TRAINING_CORPUS_LOC, 'w', encoding='utf-8') as f:
            for line in training_corpus:
                # The lines were stripped on read; put the newline back so
                # every sample stays on its own line in the training file.
                f.write(line + '\n')

        subprocess.run(['./lt-weight', BIN_FILE, TRAINING_CORPUS_LOC,
                        WEIGHTED_BIN_FILE])
        if not os.path.exists(WEIGHTED_BIN_FILE):
            # Without the weighted binary every prediction would be empty
            # and the reported metrics meaningless.
            sys.exit('lt-weight did not produce {}'.format(WEIGHTED_BIN_FILE))

        pred_df = compute_accuracy(testing_corpus, WEIGHTED_BIN_FILE)

        # Both metrics are restricted to the tags that were actually
        # predicted.
        predicted_labels = pred_df['predicted_tag'].unique()
        prec = precision_score(y_true=pred_df['tag'],
                               y_pred=pred_df['predicted_tag'],
                               average='weighted',
                               labels=predicted_labels)
        recall = recall_score(y_true=pred_df['tag'],
                              y_pred=pred_df['predicted_tag'],
                              average='weighted',
                              labels=predicted_labels)

        repo = git.Repo(LANG_REPO_LOC)
        results_df = pd.DataFrame({
            'time': [datetime.datetime.fromtimestamp(time.time()).strftime(
                '%H:%M:%S %d-%m-%Y')],
            'language': [pathlib.Path(LANG_REPO_LOC).stem],
            'head commit in language repository': [repo.head.object.hexsha],
            'precision': [prec],
            'recall': [recall]})

        # round() rather than ceil(): (1 - 0.8) * 100 is 19.999999999999996
        # in floating point, and ceil() would print 31 for a 0.7 split.
        print('Results for {} with a split ratio of {}-{}'.format(
            TAGGED_CORPUS,
            round(SPLIT_RATIO * 100),
            round((1 - SPLIT_RATIO) * 100)))
        print(tabulate.tabulate(results_df, headers=results_df.columns,
                                showindex=False, tablefmt='github'))
        # Debugging aid: list the mispredicted samples.
        # print(pred_df[pred_df['tag']!=pred_df['predicted_tag']])
    finally:
        # Remove the scratch directory even when a step above fails.
        shutil.rmtree(BASE_DIR)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement