Advertisement
Guest User

Untitled

a guest
Aug 24th, 2019
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.15 KB | None | 0 0
  1. #!/usr/bin/python
  2. #coding=utf-8
from __future__ import print_function

import datetime
import io
import os
  5.  
  6. """
  7. 训练前:利用人工标注的tag信息,拓展特征空间
  8. 训练时:把每一种可能的tag都分别计算一个分数,取分数最大时的tag作为真正的tag
  9. LOSS:分别求错误tag和正确tag的特征向量,对错的特征向量,在整体的特征向量空间中减一;
  10. 对正确的特征向量加一
  11. PREDICT:max_tag就是预测函数
  12. """
  13.  
  14. class sentence:
  15. def __init__(self):
  16. self.word = []
  17. self.tag = []
  18. self.wordchars = []
  19.  
  20. class dataset:
  21. def __init__(self):
  22. self.sentences = []
  23. self.name = ""
  24.  
  25. def open_file(self, inputfile):
  26. self.inputfile = open(inputfile, mode='r')
  27. self.name = inputfile.split('.')[0]
  28.  
  29. def close_file(self):
  30. self.inputfile.close()
  31.  
  32. def read_data(self, sentenceLen):
  33. sentenceCount = 0
  34. wordCount = 0
  35. sen = sentence()
  36. for s in self.inputfile:
  37. if(s == '\n'):
  38. self.sentences.append(sen)
  39. sentenceCount += 1
  40. sen = sentence()
  41. if(sentenceLen !=-1 and sentenceCount >= sentenceLen):
  42. break
  43. continue
  44. list_s = s.split('\t')
  45. str_word = list_s[1].decode('utf-8')
  46. str_tag = list_s[3]
  47. list_wordchars = list(str_word)
  48. sen.word.append(str_word)
  49. sen.tag.append(str_tag)
  50. sen.wordchars.append(list_wordchars)
  51. wordCount += 1
  52. print(self.name + ".conll contains " + str(sentenceCount) + " sentences")
  53. print(self.name + ".conll contains " + str(wordCount) + " words")
  54.  
  55. class linear_model:
  56. def __init__(self):
  57. self.model = dict()
  58. self.tags = dict()
  59. self.train = dataset()
  60. self.dev = dataset()
  61.  
  62. self.train.open_file("train.conll")
  63. self.train.read_data(-1)
  64. self.train.close_file()
  65.  
  66. self.dev.open_file("dev.conll")
  67. self.dev.read_data(-1)
  68. self.dev.close_file()
  69.  
  70. def create_feature_with_tag(self, sentence, pos, tag):
  71. word_count = len(sentence.word)
  72. wi = sentence.word[pos]
  73. pos_word_len = len(sentence.word[pos])
  74. if(pos == 0):
  75. wim1 = "$$"
  76. cim1m1 = "$"
  77. else:
  78. wim1 = sentence.word[pos-1]
  79. cim1m1 = sentence.wordchars[pos-1][len(sentence.word[pos-1])-1]
  80. if(pos == word_count - 1):
  81. wip1 = "##"
  82. cip10 = "#"
  83. else:
  84. wip1 = sentence.word[pos + 1]
  85. cip10 = sentence.wordchars[pos + 1][0]
  86. cim1 = sentence.wordchars[pos][pos_word_len - 1]
  87. ci0 = sentence.wordchars[pos][0]
  88. f = []
  89. f.append("02:" + str(tag) + "*" + wi)
  90. f.append("03:" + str(tag) + "*" + wim1)
  91. f.append("04:" + str(tag) + "*" + wip1)
  92. f.append("05:" + str(tag) + "*" + cim1m1)
  93. f.append("06:" + str(tag) + "*" + cip10)
  94. f.append("07:" + str(tag) + "*" + ci0)
  95. f.append("08:" + str(tag) + "*" + cim1)
  96. for i in range(1, pos_word_len - 1):
  97. cik = sentence.wordchars[pos][i]
  98. f.append("09:" + str(tag) + "*" + cik)
  99. f.append("10:" + str(tag) + "*" + ci0 + "*" + cik)
  100. f.append("11:" + str(tag) + "*" + cim1 + "*" + cik)
  101. cikp1 = sentence.wordchars[pos][i + 1]
  102. if(cik == cikp1):
  103. f.append("13:" + str(tag) + "*" + cik + "*" + "consecutive")
  104. if(pos_word_len == 1):
  105. f.append("12:" + str(tag) + "*" + wi + "*" + cim1m1 + "*" + cip10)
  106. for i in range(0, pos_word_len - 1):
  107. if(i >= 4):
  108. break
  109. f.append("14:" + str(tag) + "*" + sentence.word[pos][0:(i + 1)])
  110. f.append("15:" + str(tag) + "*" + sentence.word[pos][-(i + 1)::])
  111. return f
  112.  
  113. def create_feature_space(self):
  114. for s in self.train.sentences:
  115. for p in range(0, len(s.word)):
  116. f = self.create_feature_with_tag(s, p, s.tag[p])
  117. for feature in f:
  118. self.model[feature] = 0
  119. self.tags[s.tag[p]] = self.tags.get(s.tag[p], 0) + 1
  120. print("the total number of features is " + str(len(self.model)))
  121. print("the total number of tags is " + str(len(self.tags)))
  122.  
  123. def dot(self,f):
  124. score = 0
  125. for i in f:
  126. if(i in self.model):
  127. score += self.model[i]
  128. return score
  129.  
  130. def max_tag(self, sentence, pos):
  131. maxnum = -1e10
  132. tempnum = 0
  133. tag = "NULL"
  134. for t in self.tags:
  135. fv = self.create_feature_with_tag(sentence, pos, t)
  136. tempnum = self.dot(fv)
  137. if(tempnum > (maxnum + 1e-10)):
  138. maxnum = tempnum
  139. tag = t
  140. return tag
  141.  
  142. def online_training(self):
  143. max_train_precision = 0
  144. max_dev_precision = 0
  145. for iterator in range(0, 20):
  146. print("iterator " + str(iterator))
  147. cnt = 1
  148. for s in self.train.sentences:
  149. for p in range(0, len(s.word)):
  150. max_tag = self.max_tag(s, p)
  151. correcttag = s.tag[p]
  152. if(max_tag != correcttag):
  153. fmaxtag = self.create_feature_with_tag(s, p, max_tag)
  154. fcorrecttag = self.create_feature_with_tag(s, p, correcttag)
  155. for i in fmaxtag:
  156. if(i in self.model):
  157. self.model[i] -= 1
  158. for i in fcorrecttag:
  159. if(i in self.model):
  160. self.model[i] += 1
  161. if cnt % 50 == 0:
  162. print('Process: {}/{}\r'.format(cnt, len(self.train.sentences)), end='')
  163. cnt += 1
  164. print('\n', end='')
  165. train_iterator, train_c, train_count, train_precision = self.evaluate(self.train, iterator)
  166. dev_iterator, dev_c, dev_count, dev_precision = self.evaluate(self.dev, iterator)
  167. self.save_model(iterator)
  168. if(train_precision > (max_train_precision + 1e-10)):
  169. max_train_precision = train_precision
  170. max_train_iterator = train_iterator
  171. max_train_c = train_c
  172. max_train_count = train_count
  173. if(dev_precision > (max_dev_precision + 1e-10)):
  174. max_dev_precision = dev_precision
  175. max_dev_iterator = dev_iterator
  176. max_dev_c = dev_c
  177. max_dev_count = dev_count
  178. print("Conclusion:")
  179. print("\t"+self.train.name + " iterator: " + str(max_train_iterator) + "\t" + str(max_train_c) + " / " + str(max_train_count) + " = " + str(max_train_precision))
  180. print("\t"+self.dev.name + " iterator: " + str(max_dev_iterator) + "\t" + str(max_dev_c) + " / " + str(max_dev_count) + " = " + str(max_dev_precision))
  181.  
  182. def save_model(self, iterator):
  183. fmodel = open("linearmodel.lm" + str(iterator), mode='w')
  184. for key in self.model:
  185. fmodel.write(key.encode('utf-8') + "\t" + str(self.model[key]) + '\n')
  186. fmodel.close()
  187.  
  188. def evaluate(self, dataset, iterator):
  189. c = 0
  190. count = 0
  191. fout = open(dataset.name + ".out" + str(iterator), mode='w')
  192. for s in dataset.sentences:
  193. for p in range(0, len(s.word)):
  194. count += 1
  195. max_tag = self.max_tag(s, p)
  196. correcttag = s.tag[p]
  197. fout.write(s.word[p].encode('utf-8') + '\t' + str(max_tag) + '\t' + str(correcttag) + '\n')
  198. if(max_tag != correcttag):
  199. pass
  200. else:
  201. c += 1
  202. print(dataset.name + "\tprecision is " + str(c) + " / " + str(count) + " = " + str(1.0 * c/count))
  203. fout.close()
  204. return iterator, c, count, 1.0 * c/count
  205.  
  206.  
  207. ################################ main #####################################
  208. if __name__ == '__main__':
  209. starttime = datetime.datetime.now()
  210. lm = linear_model()
  211. lm.create_feature_space()
  212. lm.online_training()
  213. endtime = datetime.datetime.now()
  214. print("executing time is " + str((endtime - starttime).seconds) + " s")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement