Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def load_corpus(dirname, tokens_only=False):
    """Yield one document per ``.txt`` file found in *dirname*.

    Parameters
    ----------
    dirname : str
        Directory holding the corpus text files.
    tokens_only : bool
        When True, yield plain token lists (for evaluation/inference);
        otherwise yield ``TaggedDocument`` objects tagged with the file
        name (for training).
    """
    txt_files = [entry for entry in listdir(dirname) if entry.endswith('.txt')]
    for name in txt_files:
        fname = dirname + "/" + name
        # iso-8859-1 matches how the corpus files were written
        with smart_open.smart_open(fname, encoding="iso-8859-1") as fin:
            tokens = gensim.utils.simple_preprocess(fin.read())
        if tokens_only:
            yield tokens
            continue
        # Training data: tag each document with its file name
        yield gensim.models.doc2vec.TaggedDocument(tokens, [name])
def start_training(hyperparams, train_corpus):
    """Build a Doc2Vec model from *hyperparams*, train it, and return it.

    Parameters
    ----------
    hyperparams : dict
        Must contain 'size', 'min_count', 'iter', 'window', 'alpha',
        'min_alpha' and 'dm' (gensim-3 style keyword names).
    train_corpus : list
        TaggedDocument instances produced by ``load_corpus``.
    """
    model_kwargs = {key: hyperparams[key]
                    for key in ('size', 'min_count', 'iter', 'window',
                                'alpha', 'min_alpha', 'dm')}
    model = gensim.models.doc2vec.Doc2Vec(workers=16, **model_kwargs)
    print("Building vocabulary")
    # Seed the model RNG so vocabulary/vector initialisation is reproducible
    model.random.seed(0)
    model.build_vocab(train_corpus)
    print("Training the model")
    print(model)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
    return model
def get_word_vec(filename):
    """Read a previously preprocessed file and return its tokens.

    The file content is split on whitespace; the resulting list of
    words is returned.
    """
    with open(filename) as data_file:
        return data_file.read().split()
def decay_equation(position, half_life=10):
    """Exponential rank-discount weight for a hit at *position*.

    Returns 1.0 at position 1 and decays so the weight reaches 0.5 at
    ``position == half_life`` (the original code hard-coded 10); lower
    ranks contribute progressively less to the accuracy score.

    Parameters
    ----------
    position : int or float
        1-based rank at which the target document was found.
    half_life : int or float, optional
        Rank at which the weight has decayed to 0.5. Defaults to 10,
        preserving the original behavior.
    """
    return np.exp((np.log(0.5) / (half_life - 1)) * (position - 1))
def eval_model(model, eval_dir, hyperparams):
    """Score *model* against every file in *eval_dir*.

    For each evaluation file, infer a vector, rank all training
    documents by similarity, and credit the model with a rank-decayed
    score when the expected document (the "target ER", extracted from
    the file name) is found.

    Returns
    -------
    tuple
        (accuracy_rate, ranked_eval) where accuracy_rate is a
        percentage and ranked_eval maps file name -> found position.
    """
    # Length of the target-ER suffix, keyed by total file-name length.
    # NOTE(review): encodes the original if/elif chain exactly; names
    # with other lengths get an empty target and never match.
    er_suffix_len = {21: 17, 22: 18, 23: 19, 24: 20, 25: 21, 29: 18}

    ranked_eval = {}
    correct = 0
    eval_files_list = os.listdir(eval_dir)
    for fname in eval_files_list:
        words_vec = get_word_vec(eval_dir + fname)
        # Re-seed before each inference so results are deterministic
        model.random.seed(0)
        inferred_vector = model.infer_vector(
            words_vec,
            alpha=hyperparams['alpha'],
            min_alpha=hyperparams['min_alpha'],
            steps=hyperparams['iter'],
        )
        similars = model.docvecs.most_similar([inferred_vector],
                                              topn=len(model.docvecs))
        suffix = er_suffix_len.get(len(fname))
        target_ER = fname[-suffix:] if suffix is not None else ''
        for rank, sim in enumerate(similars):
            if sim[0] == target_ER:
                print(fname, "found in position", rank)
                ranked_eval[fname] = rank
                correct += decay_equation(rank)
                break
    accuracy_rate = (correct / len(eval_files_list)) * 100
    return accuracy_rate, ranked_eval
# ---- training ----
# NOTE(review): size, min_count, iter, window, alpha, min_alpha, dm,
# train_dir, model_dir, preprocessed_input and eval_dir are defined
# earlier in the file (outside this chunk).
hyperparams = dict(
    size=size,
    min_count=min_count,
    iter=iter,
    window=window,
    alpha=alpha,
    min_alpha=min_alpha,
    dm=dm,
)
print("Training with the hyperparams:")
print(hyperparams, "\n")
print("Loading files")
train_corpus = list(load_corpus(train_dir))
model = start_training(hyperparams, train_corpus)
model_file = model_dir + preprocessed_input + '.model'
model.save(model_file)
print("Trained model saved in:", model_file)

# ---- evaluation ----
accuracy_rate, ranked_eval = eval_model(model, eval_dir, hyperparams)
hyperparams['accuracy_rate'] = accuracy_rate
hyperparams['model_file'] = model_file
print("Accuracy rate: ", accuracy_rate)

# ---- append hyperparams and per-file ranks to the results log ----
results_file = model_dir + 'tests_result.txt'
with open(results_file, "a") as f:
    f.write(f"{hyperparams}")
    f.write(f"{ranked_eval}\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement