Advertisement
Guest User

Untitled

a guest
Apr 4th, 2013
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.50 KB | None | 0 0
  1. from __future__ import division, print_function
  2. from data_parser import *
  3. from data_struct import *
  4. from learners import *
  5. import nltk
  6. import modshogun as sg
  7. import numpy as np
  8. from main import test_learner_fraction
  9.  
# Corpus locations (absolute paths on the author's machine).
# training_dir and development_dir are the two directories the driver loop
# at the bottom of this file reads; mini_dir is only referenced from a
# commented-out alternative to that loop.
training_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Training/'
development_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Development/'
mini_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/mini_subset/'

seed = 42 # seed used for testing consistency
  15.  
  16. class Own_director_kernel(sg.DirectorKernel):
  17.     """
  18.        Abstract base class for own kernels
  19.    """
  20.     def __init__(self):
  21.         def __init__(self, train_data, *kernel_args):
  22.             sg.DirectorKernel.__init__(self)
  23.             self.kernel_function_args = kernel_args
  24.             self.init(train_data, train_data)
  25.  
  26.     def true_kernel_function(self, a, b, *args):
  27.         raise NotImplementedError
  28.  
  29.     def kernel_function(self, idx_a, idx_b):
  30.         if not (self.l and self.r):
  31.             raise Exception("Kernel {} needs to be initialised first!".format(self.get_name()))
  32.         return self.true_kernel_function(self.l[idx_a], self.r[idx_b], *self.kernel_function_args)
  33.  
  34.     def init(self, l, r):
  35.         self.l, self.r = l, r
  36.         l_features = sg.DummyFeatures(len(l))
  37.         r_features = l_features if l == r else sg.DummyFeatures(len(r))
  38.         sg.DirectorKernel.init(self, l_features, r_features)
  39.  
  40.     def set_test(self, test_data):
  41.         self.init(self.l, test_data)
  42.  
  43.     def get_name(self):
  44.         return 'Own_director_kernel'
  45.  
  46.     def get_kernel_type(self):
  47.         return sg.K_UNKNOWN
  48.  
  49.  
  50. class Own_director_word_kernel(Own_director_kernel):
  51.     def __init__(self, prop_list, similarity_dict = None):
  52.         super(self.__class__, self).__init__()
  53.         self.prop_list = prop_list
  54.         self.similarity_dict = similarity_dict
  55.  
  56.     def true_kernel_function(self, w1, w2, prop_list, similarity_dict):
  57.         similarity_dict = {} if not similarity_dict else similarity_dict
  58.         result = 0
  59.         for prop in prop_list:
  60.             similarity = similarity_dict[prop] if prop in similarity_dict else lambda p1, p2: 1 if p1 == p2 else 0
  61.             result += similarity(w1.properties[prop], w2.properties[prop])
  62.         return result
  63.  
  64.     def get_name(self):
  65.         return 'Own_director_word_kernel'
  66.  
  67.     def get_kernel_type(self):
  68.         return sg.K_LINEAR
  69.  
  70. class Director_kernel_action_learner(Learner):
  71.  
  72.     def __init__(self, sentence_list, svm_class = sg.SVMLight, C = 1, kernel = Own_director_word_kernel, kernel_args = ()):
  73.         words = []
  74.         for s in sentence_list:
  75.             words.extend(s.word_list)
  76.         if not words:
  77.             raise Exception("Can't train learner on empty dataset!")
  78.         labels = []
  79.         for w in words:
  80.             labels.append(w.label_from_properties(TYPE, ACTION))
  81.         labels = sg.BinaryLabels(np.array(labels, dtype = np.float64))
  82.         k = kernel((WORD, POS), *kernel_args)
  83.         k.init(words, words)
  84.         svm = svm_class(C, k, labels)
  85.         svm.train()
  86.         self.kernel, self.svm = k, svm
  87.  
  88.     def apply(self, s):
  89.         words = s.word_list
  90.         self.kernel.set_test(words)
  91.         result = self.svm.apply()
  92.         for i, w in enumerate(words):
  93.             if result.get_label(i) == 1:
  94.                 sentence = w.sentence
  95.                 entity_index = max([int(t[1:]) for t in sentence.id_dict if t[0] == 'T'] + [0]) + 1
  96.                 d = {'T' + str(entity_index): {TYPE: ACTION, LOC: [sentence.span_tokens[i]], TEXT: w[WORD]}}
  97.                 sentence._populate_id_dict([d])
  98.         return None # all the changes are in-place
  99.  
  100.     def compare(self, sentence_list):
  101.         """
  102.            given a list of fully annotated sentences (test set), strip bare, run tests, and compare.
  103.            returns: two lists: predicted labels, ground truth labels
  104.        """
  105.         test_labels, truth_labels = [], []
  106.         for s in sentence_list:
  107.             s_test = s.copy_for_testing()
  108.             self.apply(s_test)
  109.             for w, w_test in zip(s.word_list, s_test.word_list):
  110.                 truth_labels.append(w.label_from_properties(TYPE, ACTION))
  111.                 test_labels.append(w_test.label_from_properties(TYPE, ACTION))
  112.                 # TODO add test: if both action with different tag nr. => realign
  113.         return test_labels, truth_labels
  114.  
  115.     @classmethod
  116.     def test(cls, training_list, test_list, *init_args, **init_kwargs):
  117.         """
  118.            run full test: train learner, and compare test results with ground truth
  119.            training_list, test_list: lists of sentence objects
  120.            returns: two lists: predicted labels, ground truth labels
  121.        """
  122.         a_learner = cls(training_list, *init_args, **init_kwargs)
  123.         return a_learner.compare(test_list)
  124.  
  125. data = []
  126. for d in (training_dir, development_dir):
  127. # for d in (mini_dir, mini_dir + 'test/'):
  128.     for f in get_dir_namelist(d):
  129.         data.append(parse_file(f, d))
  130.  
  131. np.random.seed(seed)
  132. results = {"F1": [], "precision": [], "accuracy": [], "recall": []}
  133. for i in range(100):
  134.     m = test_learner_fraction(data, Director_kernel_action_learner, fraction_test=0.1, iterations=25, verbose=False).get_F1()
  135.     for x in results:
  136.         results[x].append(m.__getattribute__('get_'+x)())
  137.  
  138. print("Averages:")
  139. for x, r in results.itervalues():
  140.     print("{>11}: {}s".format(x, sum(r)/len(r)))
  141. from pylab import *
  142. for x, r in results.itervalues():
  143.     plot(r, '-')
  144. legend(results.keys())
  145. grid(True)
  146. show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement