from __future__ import division, print_function
from data_parser import *
from data_struct import *
from learners import *
import nltk
import modshogun as sg
import numpy as np

training_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Training/'
development_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/Development/'
mini_dir = '/home/thomas/Dropbox/PhD/GRN_Task/Data/mini_subset/'

seed = 42  # seed used for testing consistency

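# Pipeline overview: a custom word-level kernel (built on shogun's DirectorKernel)
# feeds an SVM that tags ACTION tokens; the learner is then evaluated with
# repeated random train/test splits over the parsed GRN corpus (see the driver
# code at the bottom).
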
class Own_director_kernel(sg.DirectorKernel):
    """
    Abstract base class for custom director kernels.

    Subclasses implement true_kernel_function(a, b, *args); shogun calls
    kernel_function(idx_a, idx_b), and the indices are resolved against the
    Python-side data lists self.l and self.r.
    """
    def __init__(self, train_data, *kernel_args):
        sg.DirectorKernel.__init__(self)
        self.kernel_function_args = kernel_args
        self.own_init(train_data, train_data)

    def true_kernel_function(self, a, b, *args):
        raise NotImplementedError

    def kernel_function(self, idx_a, idx_b):
        if not (self.l and self.r):
            raise Exception("Kernel {} needs to be initialised first!".format(self.get_name()))
        return self.true_kernel_function(self.l[idx_a], self.r[idx_b], *self.kernel_function_args)

    def own_init(self, l, r):
        # Keep the actual data on the Python side; shogun only needs to know
        # how many items there are, hence the DummyFeatures placeholders.
        self.l, self.r = l, r
        l_features = sg.DummyFeatures(len(l))
        r_features = l_features if l is r else sg.DummyFeatures(len(r))
        self.init(l_features, r_features)

    def set_test(self, test_data):
        self.own_init(self.l, test_data)

    def get_name(self):
        return 'Own_director_kernel'

    def get_kernel_type(self):
        return sg.K_UNKNOWN

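# Note: overriding kernel_function in Python relies on shogun's SWIG director
# support (the modshogun build typically needs directors enabled). Computing
# each kernel entry through a Python callback is flexible but considerably
# slower than shogun's built-in kernels.
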
class Own_director_word_kernel(Own_director_kernel):
    def __init__(self, train_data, prop_list, similarity_dict=None):
        super(Own_director_word_kernel, self).__init__(train_data, prop_list, similarity_dict)

    def true_kernel_function(self, w1, w2, prop_list, similarity_dict):
        # Sum per-property similarities; properties without an entry in
        # similarity_dict fall back to exact match (1 if equal, else 0).
        similarity_dict = similarity_dict or {}
        result = 0
        for prop in prop_list:
            similarity = similarity_dict.get(prop, lambda p1, p2: 1 if p1 == p2 else 0)
            result += similarity(w1.properties[prop], w2.properties[prop])
        return result

    def get_name(self):
        return 'Own_director_word_kernel'

    def get_kernel_type(self):
        return sg.K_LINEAR

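# Sketch (not used below): a smoother word similarity that could be supplied
# via similarity_dict for the WORD property, using NLTK's WordNet interface.
# Assumes WORD property values are plain strings and that the WordNet corpus
# is installed (nltk.download('wordnet')); the function name is illustrative.
def wordnet_word_similarity(word1, word2):
    from nltk.corpus import wordnet as wn
    synsets1, synsets2 = wn.synsets(word1), wn.synsets(word2)
    if not synsets1 or not synsets2:
        return 1 if word1 == word2 else 0  # fall back to exact string match
    # Best path similarity over all synset pairs (path_similarity may return None).
    scores = [s1.path_similarity(s2) or 0 for s1 in synsets1 for s2 in synsets2]
    return max(scores)
# Possible use:
#   Own_director_word_kernel(words, (WORD, POS), {WORD: wordnet_word_similarity})
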
class Director_kernel_action_learner(Learner):

    def __init__(self, sentence_list, svm_class=sg.SVMLight, C=1, kernel=Own_director_word_kernel, kernel_args=()):
        # Train a word-level SVM that decides whether a token is an ACTION.
        words = []
        for s in sentence_list:
            words.extend(s.word_list)
        if not words:
            raise Exception("Can't train learner on empty dataset!")
        labels = []
        for w in words:
            labels.append(w.label_from_properties(TYPE, ACTION))
        labels = sg.BinaryLabels(np.array(labels, dtype=np.float64))
        k = kernel(words, (WORD, POS), *kernel_args)
        svm = svm_class(C, k, labels)
        svm.train()
        self.kernel, self.svm = k, svm

    def apply(self, s):
        words = s.word_list
        self.kernel.set_test(words)
        result = self.svm.apply()
        for i, w in enumerate(words):
            if result.get_label(i) == 1:
                sentence = w.sentence
                # next free 'T<n>' entity id in this sentence
                entity_index = max([int(t[1:]) for t in sentence.id_dict if t[0] == 'T'] + [0]) + 1
                d = {'T' + str(entity_index): {TYPE: ACTION, LOC: [sentence.span_tokens[i]], TEXT: w[WORD]}}
                sentence._populate_id_dict([d])
        return None  # all the changes are in-place

    def compare(self, sentence_list):
        """
        Given a list of fully annotated sentences (test set), strip the
        annotations, run the learner, and compare against the originals.
        Returns two lists: predicted labels, ground truth labels.
        """
        test_labels, truth_labels = [], []
        for s in sentence_list:
            s_test = s.copy_for_testing()
            self.apply(s_test)
            for w, w_test in zip(s.word_list, s_test.word_list):
                truth_labels.append(w.label_from_properties(TYPE, ACTION))
                test_labels.append(w_test.label_from_properties(TYPE, ACTION))
                # TODO add test: if both action with different tag nr. => realign
        return test_labels, truth_labels

    @classmethod
    def test(cls, training_list, test_list, *init_args, **init_kwargs):
        """
        Run a full test: train a learner and compare test results with the ground truth.
        training_list, test_list: lists of sentence objects.
        Returns two lists: predicted labels, ground truth labels.
        """
        a_learner = cls(training_list, *init_args, **init_kwargs)
        return a_learner.compare(test_list)

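# Usage sketch (train_sentences / test_sentences are illustrative names for
# lists of sentence objects, e.g. as produced by parse_file in the driver below):
#   predicted, truth = Director_kernel_action_learner.test(train_sentences, test_sentences)
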
def test_learner_fraction(data_set, learner, fraction_test=0.1, iterations=25, verbose=False, *learner_args, **learner_kwargs):
    # Repeated random-split evaluation: pool the predictions of all iterations
    # and compute the metrics over the pooled labels.
    test_labels, truth_labels = [], []
    nr_training = int(len(data_set) * (1 - fraction_test))
    for iteration in range(iterations):
        if verbose: print('  -> iteration {}... '.format(iteration), end='')
        shuffled_data = np.random.permutation(data_set)
        training_data, test_data = np.split(shuffled_data, [nr_training])
        new_test, new_truth = learner.test(training_data, test_data, *learner_args, **learner_kwargs)
        test_labels.extend(new_test)
        truth_labels.extend(new_truth)
        if verbose:
            if new_truth.count(1) != 0:
                new_test = sg.BinaryLabels(np.array(new_test, dtype=np.float64))
                new_truth = sg.BinaryLabels(np.array(new_truth, dtype=np.float64))
                interim_result = sg.PrecisionMeasure()
                interim_result.evaluate(new_test, new_truth)
                print('Done. (pr {} -- rc {} -- F1 {})'.format(interim_result.get_precision(), interim_result.get_recall(), interim_result.get_F1()))
            else:
                print('Done. (no positive labels in set)')
    test_labels = sg.BinaryLabels(np.array(test_labels, dtype=np.float64))
    truth_labels = sg.BinaryLabels(np.array(truth_labels, dtype=np.float64))
    result = sg.PrecisionMeasure()
    result.evaluate(test_labels, truth_labels)
    return result

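# Driver: parse the corpus, run repeated random-split evaluations, and report
# the averaged metrics.
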
data = []
for d in (training_dir, development_dir):
# for d in (mini_dir, mini_dir + 'test/'):
    for f in get_dir_namelist(d):
        data.append(parse_file(f, d))

np.random.seed(seed)
results = {"F1": [], "precision": [], "accuracy": [], "recall": []}
for i in range(100):
    m = test_learner_fraction(data, Director_kernel_action_learner, fraction_test=0.05, iterations=25, verbose=True)
    for x in results:
        results[x].append(getattr(m, 'get_' + x)())

print("Averages:")
for x, r in results.items():
    print("{:>11}: {}".format(x, sum(r)/len(r)))

from pylab import *
for x, r in results.items():
    plot(r, '-', label=x)
legend()
grid(True)
show()