from __future__ import division, print_function
from data_parser import *
from data_struct import *
from learners import *
import nltk
import modshogun as sg
import numpy as np

training_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Training/'
development_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Development/'
mini_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/mini_subset/'

seed = 42  # seed used for testing consistency
class Own_director_kernel(sg.DirectorKernel):
    """
    Abstract base class for own kernels
    """
    def __init__(self, train_data, *kernel_args):
        sg.DirectorKernel.__init__(self)
        self.kernel_function_args = kernel_args
        self.own_init(train_data, train_data)

    def true_kernel_function(self, a, b, *args):
        raise NotImplementedError

    def kernel_function(self, idx_a, idx_b):
        if not (self.l and self.r):
            raise Exception("Kernel {} needs to be initialised first!".format(self.get_name()))
        return self.true_kernel_function(self.l[idx_a], self.r[idx_b], *self.kernel_function_args)

    def own_init(self, l, r):
        self.l, self.r = l, r
        l_features = sg.DummyFeatures(len(l))
        # reuse the left-hand features when training on itself (identity check,
        # not element-wise equality)
        r_features = l_features if l is r else sg.DummyFeatures(len(r))
        self.init(l_features, r_features)

    def set_test(self, test_data):
        self.own_init(self.l, test_data)

    def get_name(self):
        return 'Own_director_kernel'

    def get_kernel_type(self):
        return sg.K_UNKNOWN
class Own_director_word_kernel(Own_director_kernel):
    def __init__(self, train_data, prop_list, similarity_dict=None):
        super(Own_director_word_kernel, self).__init__(train_data, prop_list, similarity_dict)

    def true_kernel_function(self, w1, w2, prop_list, similarity_dict):
        similarity_dict = similarity_dict or {}
        result = 0
        for prop in prop_list:
            # fall back to exact matching when no similarity function is given for this property
            similarity = similarity_dict.get(prop, lambda p1, p2: 1 if p1 == p2 else 0)
            result += similarity(w1.properties[prop], w2.properties[prop])
        return result

    def get_name(self):
        return 'Own_director_word_kernel'

    def get_kernel_type(self):
        return sg.K_LINEAR
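
# A minimal sketch (not part of the pipeline) of the property-matching score
# that true_kernel_function computes, with plain dicts standing in for the
# Word objects from data_struct; the 'word'/'pos' keys are purely illustrative:
#
#     def toy_word_similarity(w1, w2, prop_list):
#         # one point per property whose values match exactly
#         return sum(1 if w1[p] == w2[p] else 0 for p in prop_list)
#
#     toy_word_similarity({'word': 'binds', 'pos': 'VBZ'},
#                         {'word': 'binds', 'pos': 'VB'},
#                         ('word', 'pos'))  # -> 1 (only 'word' matches)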
class Director_kernel_action_learner(Learner):
    def __init__(self, sentence_list, svm_class=sg.SVMLight, C=1, kernel=Own_director_word_kernel, kernel_args=()):
        words = []
        for s in sentence_list:
            words.extend(s.word_list)
        if not words:
            raise Exception("Can't train learner on empty dataset!")
        labels = []
        for w in words:
            labels.append(w.label_from_properties(TYPE, ACTION))
        labels = sg.BinaryLabels(np.array(labels, dtype=np.float64))
        k = kernel(words, (WORD, POS), *kernel_args)
        svm = svm_class(C, k, labels)
        svm.train()
        self.kernel, self.svm = k, svm

    def apply(self, s):
        words = s.word_list
        self.kernel.set_test(words)
        result = self.svm.apply()
        for i, w in enumerate(words):
            if result.get_label(i) == 1:
                sentence = w.sentence
                # next free entity index ('T<n>') in this sentence
                entity_index = max([int(t[1:]) for t in sentence.id_dict if t[0] == 'T'] + [0]) + 1
                d = {'T' + str(entity_index): {TYPE: ACTION, LOC: [sentence.span_tokens[i]], TEXT: w[WORD]}}
                sentence._populate_id_dict([d])
        return None  # all the changes are in-place

    def compare(self, sentence_list):
        """
        given a list of fully annotated sentences (test set), strip bare, run tests, and compare.
        returns: two lists: predicted labels, ground truth labels
        """
        test_labels, truth_labels = [], []
        for s in sentence_list:
            s_test = s.copy_for_testing()
            self.apply(s_test)
            for w, w_test in zip(s.word_list, s_test.word_list):
                truth_labels.append(w.label_from_properties(TYPE, ACTION))
                test_labels.append(w_test.label_from_properties(TYPE, ACTION))
            # TODO add test: if both action with different tag nr. => realign
        return test_labels, truth_labels

    @classmethod
    def test(cls, training_list, test_list, *init_args, **init_kwargs):
        """
        run full test: train learner, and compare test results with ground truth
        training_list, test_list: lists of sentence objects
        returns: two lists: predicted labels, ground truth labels
        """
        a_learner = cls(training_list, *init_args, **init_kwargs)
        return a_learner.compare(test_list)
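
# Illustrative use of the classmethod above (assuming train_sentences and
# test_sentences are lists of parsed sentence objects from data_parser):
#
#     predicted, truth = Director_kernel_action_learner.test(train_sentences, test_sentences)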
def test_learner_fraction(data_set, learner, fraction_test=0.1, iterations=25, verbose=False, *learner_args, **learner_kwargs):
    test_labels, truth_labels = [], []
    nr_training = int(len(data_set) * (1 - fraction_test))
    for iteration in range(iterations):
        if verbose:
            print(' -> iteration {}... '.format(iteration), end='')
        shuffled_data = np.random.permutation(data_set)
        training_data, test_data = np.split(shuffled_data, [nr_training])
        new_test, new_truth = learner.test(training_data, test_data, *learner_args, **learner_kwargs)
        test_labels.extend(new_test)
        truth_labels.extend(new_truth)
        if verbose:
            if new_truth.count(1) != 0:
                new_test = sg.BinaryLabels(np.array(new_test, dtype=np.float64))
                new_truth = sg.BinaryLabels(np.array(new_truth, dtype=np.float64))
                interim_result = sg.PrecisionMeasure()
                interim_result.evaluate(new_test, new_truth)
                print('Done. (pr {} -- rc {} -- F1 {})'.format(interim_result.get_precision(), interim_result.get_recall(), interim_result.get_F1()))
            else:
                print('Done. (no positive labels in set)')
    test_labels = sg.BinaryLabels(np.array(test_labels, dtype=np.float64))
    truth_labels = sg.BinaryLabels(np.array(truth_labels, dtype=np.float64))
    result = sg.PrecisionMeasure()
    result.evaluate(test_labels, truth_labels)
    return result
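
# The returned sg.PrecisionMeasure holds the aggregated contingency-table
# statistics; reading them back uses the same getters as the interim report
# above, e.g.:
#
#     m = test_learner_fraction(data, Director_kernel_action_learner, verbose=False)
#     print(m.get_precision(), m.get_recall(), m.get_F1(), m.get_accuracy())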
data = []
for d in (training_dir, development_dir):
# for d in (mini_dir, mini_dir + 'test/'):
    for f in get_dir_namelist(d):
        data.append(parse_file(f, d))

np.random.seed(seed)

results = {"F1": [], "precision": [], "accuracy": [], "recall": []}
for i in range(100):
    m = test_learner_fraction(data, Director_kernel_action_learner, fraction_test=0.05, iterations=25, verbose=True)
    for x in results:
        results[x].append(getattr(m, 'get_' + x)())

print("Averages:")
for x, r in results.items():
    print("{:>11}: {}".format(x, sum(r) / len(r)))

from pylab import *
for x, r in results.items():
    plot(r, '-')
legend(results.keys())
grid(True)
show()