SHARE
TWEET

Untitled

a guest Aug 24th, 2019 80 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. #coding=utf-8
  3. from __future__ import print_function
  4. import datetime
  5.  
  6. """
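  7. Before training: use the manually annotated tags to build (expand) the feature space.
  8. During training: compute a score for every possible tag and take the highest-scoring tag as the predicted tag.
  9. LOSS: build the feature vectors of the wrong (predicted) tag and of the correct tag; subtract one from the weight
  10.     of every feature of the wrong tag, and add one to the weight of every feature of the correct tag.
  11. PREDICT: max_tag is the prediction function.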
  12. """
  13.  
  14. class sentence:
  15.     def __init__(self):
  16.         self.word = []
  17.         self.tag = []
  18.         self.wordchars = []
  19.  
  20. class dataset:
  21.     def __init__(self):
  22.         self.sentences = []
  23.         self.name = ""
  24.    
  25.     def open_file(self, inputfile):
  26.         self.inputfile = open(inputfile, mode='r')
  27.         self.name = inputfile.split('.')[0]
  28.  
  29.     def close_file(self):
  30.         self.inputfile.close()
  31.  
  32.     def read_data(self, sentenceLen):
  33.         sentenceCount = 0
  34.         wordCount = 0
  35.         sen = sentence()
  36.         for s in self.inputfile:
  37.             if(s == '\n'):
  38.                 self.sentences.append(sen)
  39.                 sentenceCount += 1
  40.                 sen = sentence()
  41.                 if(sentenceLen !=-1 and sentenceCount >= sentenceLen):
  42.                     break
  43.                 continue
  44.             list_s = s.split('\t')
  45.             str_word = list_s[1].decode('utf-8')
  46.             str_tag = list_s[3]
  47.             list_wordchars = list(str_word)
  48.             sen.word.append(str_word)
  49.             sen.tag.append(str_tag)
  50.             sen.wordchars.append(list_wordchars)
  51.             wordCount += 1
  52.         print(self.name + ".conll contains " + str(sentenceCount) + " sentences")
  53.         print(self.name + ".conll contains " + str(wordCount) + " words")
  54.  
  55. class linear_model:
  56.     def __init__(self):
  57.         self.model = dict()
  58.         self.tags = dict()
  59.         self.train = dataset()
  60.         self.dev = dataset()
  61.  
  62.         self.train.open_file("train.conll")
  63.         self.train.read_data(-1)
  64.         self.train.close_file()
  65.  
  66.         self.dev.open_file("dev.conll")
  67.         self.dev.read_data(-1)
  68.         self.dev.close_file()
  69.    
  70.     def create_feature_with_tag(self, sentence, pos, tag):
  71.         word_count = len(sentence.word)
  72.         wi = sentence.word[pos]
  73.         pos_word_len = len(sentence.word[pos])
  74.         if(pos == 0):
  75.             wim1 = "$$"
  76.             cim1m1 = "$"
  77.         else:
  78.             wim1 = sentence.word[pos-1]
  79.             cim1m1 = sentence.wordchars[pos-1][len(sentence.word[pos-1])-1]
  80.         if(pos == word_count - 1):
  81.             wip1 = "##"
  82.             cip10 = "#"
  83.         else:
  84.             wip1 = sentence.word[pos + 1]
  85.             cip10 = sentence.wordchars[pos + 1][0]
  86.         cim1 = sentence.wordchars[pos][pos_word_len - 1]
  87.         ci0 = sentence.wordchars[pos][0]
  88.         f = []
  89.         f.append("02:" + str(tag) + "*" + wi)
  90.         f.append("03:" + str(tag) + "*" + wim1)
  91.         f.append("04:" + str(tag) + "*" + wip1)
  92.         f.append("05:" + str(tag) + "*" + cim1m1)
  93.         f.append("06:" + str(tag) + "*" + cip10)
  94.         f.append("07:" + str(tag) + "*" + ci0)
  95.         f.append("08:" + str(tag) + "*" + cim1)
  96.         for i in range(1, pos_word_len - 1):
  97.             cik = sentence.wordchars[pos][i]
  98.             f.append("09:" + str(tag) + "*" + cik)
  99.             f.append("10:" + str(tag) + "*" + ci0 + "*" + cik)
  100.             f.append("11:" + str(tag) + "*" + cim1 + "*" + cik)
  101.             cikp1 = sentence.wordchars[pos][i + 1]
  102.             if(cik == cikp1):
  103.                 f.append("13:" + str(tag) + "*" + cik + "*" + "consecutive")
  104.         if(pos_word_len == 1):
  105.             f.append("12:" + str(tag) + "*" + wi + "*" + cim1m1 + "*" + cip10)
  106.         for i in range(0, pos_word_len - 1):
  107.             if(i >= 4):
  108.                 break
  109.             f.append("14:" + str(tag) + "*" + sentence.word[pos][0:(i + 1)])
  110.             f.append("15:" + str(tag) + "*" + sentence.word[pos][-(i + 1)::])
  111.         return f
  112.  
  113.     def create_feature_space(self):
  114.         for s in self.train.sentences:
  115.             for p in range(0, len(s.word)):
  116.                 f = self.create_feature_with_tag(s, p, s.tag[p])
  117.                 for feature in f:
  118.                     self.model[feature] = 0
  119.                 self.tags[s.tag[p]] = self.tags.get(s.tag[p], 0) + 1
  120.         print("the total number of features is " + str(len(self.model)))
  121.         print("the total number of tags is " + str(len(self.tags)))
  122.  
  123.     def dot(self,f):
  124.         score = 0
  125.         for i in f:
  126.             if(i in self.model):
  127.                 score += self.model[i]
  128.         return score
  129.  
  130.     def max_tag(self, sentence, pos):
  131.         maxnum = -1e10
  132.         tempnum = 0
  133.         tag = "NULL"
  134.         for t in self.tags:
  135.             fv = self.create_feature_with_tag(sentence, pos, t)
  136.             tempnum = self.dot(fv)
  137.             if(tempnum > (maxnum + 1e-10)):
  138.                 maxnum = tempnum
  139.                 tag = t
  140.         return tag
  141.  
  142.     def online_training(self):
  143.         max_train_precision = 0
  144.         max_dev_precision = 0
  145.         for iterator in range(0, 20):
  146.             print("iterator " + str(iterator))
  147.             cnt = 1
  148.             for s in self.train.sentences:
  149.                 for p in range(0, len(s.word)):
  150.                     max_tag = self.max_tag(s, p)
  151.                     correcttag = s.tag[p]
  152.                     if(max_tag != correcttag):
  153.                         fmaxtag = self.create_feature_with_tag(s, p, max_tag)
  154.                         fcorrecttag = self.create_feature_with_tag(s, p, correcttag)
  155.                         for i in fmaxtag:
  156.                             if(i in self.model):
  157.                                 self.model[i] -= 1
  158.                         for i in fcorrecttag:
  159.                             if(i in self.model):
  160.                                 self.model[i] += 1
  161.                 if cnt % 50 == 0:
  162.                     print('Process: {}/{}\r'.format(cnt, len(self.train.sentences)), end='')
  163.                 cnt += 1
  164.             print('\n', end='')
  165.             train_iterator, train_c, train_count, train_precision = self.evaluate(self.train, iterator)
  166.             dev_iterator, dev_c, dev_count, dev_precision = self.evaluate(self.dev, iterator)
  167.             self.save_model(iterator)
  168.             if(train_precision > (max_train_precision + 1e-10)):
  169.                 max_train_precision = train_precision
  170.                 max_train_iterator = train_iterator
  171.                 max_train_c = train_c
  172.                 max_train_count = train_count
  173.             if(dev_precision > (max_dev_precision + 1e-10)):
  174.                 max_dev_precision = dev_precision
  175.                 max_dev_iterator = dev_iterator
  176.                 max_dev_c = dev_c
  177.                 max_dev_count  = dev_count
  178.         print("Conclusion:")
  179.         print("\t"+self.train.name + " iterator: " + str(max_train_iterator) + "\t" + str(max_train_c) + " / " + str(max_train_count) + " = " + str(max_train_precision))
  180.         print("\t"+self.dev.name + " iterator: " + str(max_dev_iterator) + "\t" + str(max_dev_c) + " / " + str(max_dev_count) + " = " + str(max_dev_precision))
  181.  
  182.     def save_model(self, iterator):
  183.         fmodel = open("linearmodel.lm" + str(iterator), mode='w')
  184.         for key in self.model:
  185.             fmodel.write(key.encode('utf-8') + "\t" + str(self.model[key]) + '\n')
  186.         fmodel.close()
  187.  
  188.     def evaluate(self, dataset, iterator):
  189.        c = 0
  190.        count = 0
  191.        fout = open(dataset.name + ".out" + str(iterator), mode='w')
  192.        for s in dataset.sentences:
  193.            for p in range(0, len(s.word)):
  194.                count += 1
  195.                max_tag = self.max_tag(s, p)
  196.                correcttag = s.tag[p]
  197.                fout.write(s.word[p].encode('utf-8') + '\t' + str(max_tag) + '\t' + str(correcttag) + '\n')
  198.                if(max_tag != correcttag):
  199.                    pass
  200.                else:
  201.                    c += 1
  202.        print(dataset.name + "\tprecision is " + str(c) + " / " + str(count) + " = " + str(1.0 * c/count))
  203.        fout.close()
  204.        return iterator, c, count, 1.0 * c/count
  205.  
  206.  
  207. ################################ main #####################################
  208. if __name__ == '__main__':
  209.     starttime = datetime.datetime.now()
  210.     lm = linear_model()
  211.     lm.create_feature_space()
  212.     lm.online_training()
  213.     endtime = datetime.datetime.now()
  214.     print("executing time is " + str((endtime - starttime).seconds) + " s")
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top