Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- #coding=utf-8
from __future__ import print_function

import datetime
import io
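"""
Before training: use the manually annotated tag information to build the
feature space.
During training: compute a score for every possible tag and take the
highest-scoring tag as the predicted tag.
LOSS: build the feature vectors of the wrong tag and of the correct tag;
subtract one from the weight of each feature of the wrong tag in the global
feature space and add one to the weight of each feature of the correct tag.
PREDICT: max_tag is the prediction function.
"""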
class sentence:
    """One sentence stored as three parallel lists, one entry per word."""

    def __init__(self):
        self.word = []       # word strings, in sentence order
        self.tag = []        # tag of each word
        self.wordchars = []  # list of characters of each word


class dataset:
    """A tab-separated, blank-line-delimited corpus loaded into memory.

    Intended call order: ``open_file`` -> ``read_data`` -> ``close_file``.
    Each non-blank line is split on tabs; column 1 is taken as the word and
    column 3 as the tag. Blank lines separate sentences.
    """

    def __init__(self):
        self.sentences = []
        self.name = ""
        self.inputfile = None

    def open_file(self, inputfile):
        """Open ``inputfile`` for reading as UTF-8 text and derive ``name``.

        ``name`` is everything before the first ``.`` of the given path.
        """
        # io.open decodes to text on both Python 2 and Python 3, so words
        # never need a separate .decode() step.
        self.inputfile = io.open(inputfile, mode='r', encoding='utf-8')
        self.name = inputfile.split('.')[0]

    def close_file(self):
        """Close the file opened by ``open_file`` (no-op if none was opened)."""
        if self.inputfile is not None:
            self.inputfile.close()

    def read_data(self, sentenceLen):
        """Read sentences from the open file and append them to ``sentences``.

        Args:
            sentenceLen: maximum number of sentences to read, or ``-1`` for
                no limit.

        Raises:
            IndexError: if a non-blank line has fewer than four tab-separated
                columns.
        """
        sentenceCount = 0
        wordCount = 0
        sen = sentence()
        for line in self.inputfile:
            # Drop the line terminator so it can never leak into the last
            # column, and so '\r\n' blank lines are recognised as separators.
            line = line.rstrip('\r\n')
            if not line.strip():
                # Consecutive blank lines must not create empty sentences.
                if sen.word:
                    self.sentences.append(sen)
                    sentenceCount += 1
                    sen = sentence()
                    if sentenceLen != -1 and sentenceCount >= sentenceLen:
                        break
                continue
            fields = line.split('\t')
            str_word = fields[1]
            str_tag = fields[3]
            sen.word.append(str_word)
            sen.tag.append(str_tag)
            sen.wordchars.append(list(str_word))
            wordCount += 1
        # A file that does not end with a blank line still holds a final
        # sentence; keep it. After a limit-triggered break `sen` is empty.
        if sen.word:
            self.sentences.append(sen)
            sentenceCount += 1
        print(self.name + ".conll contains " + str(sentenceCount) + " sentences")
        print(self.name + ".conll contains " + str(wordCount) + " words")
class linear_model:
    """Linear tagger trained online with +1/-1 feature-weight updates.

    ``model`` maps feature strings to integer weights and ``tags`` maps each
    tag seen in the training data to its occurrence count. A word's tag is
    predicted by scoring every known tag and keeping the best one.
    """

    def __init__(self, train_file="train.conll", dev_file="dev.conll"):
        """Load the training and development corpora.

        Args:
            train_file: path of the training corpus.
            dev_file: path of the development corpus.
        """
        self.model = dict()
        self.tags = dict()
        self.train = self._load_dataset(train_file)
        self.dev = self._load_dataset(dev_file)

    @staticmethod
    def _load_dataset(path):
        """Read the whole corpus at ``path``, always closing the file."""
        data = dataset()
        data.open_file(path)
        try:
            data.read_data(-1)
        finally:
            data.close_file()
        return data

    def create_feature_with_tag(self, sentence, pos, tag):
        """Return the feature strings for word ``pos`` of ``sentence`` under ``tag``.

        Every feature has the form ``"<template id>:<tag>*<context...>"``.

        Args:
            sentence: object exposing parallel ``word`` and ``wordchars`` lists.
            pos: index of the word inside the sentence.
            tag: candidate tag to combine with the context.

        Returns:
            List of feature strings, in template order.

        Raises:
            IndexError: if the word at ``pos`` (or its left neighbour) is empty.
        """
        words = sentence.word
        chars = sentence.wordchars[pos]
        wi = words[pos]
        pos_word_len = len(wi)

        # Left context: previous word and its last character, or padding.
        if pos == 0:
            wim1, cim1m1 = "$$", "$"
        else:
            wim1 = words[pos - 1]
            cim1m1 = sentence.wordchars[pos - 1][len(wim1) - 1]
        # Right context: next word and its first character, or padding.
        if pos == len(words) - 1:
            wip1, cip10 = "##", "#"
        else:
            wip1 = words[pos + 1]
            cip10 = sentence.wordchars[pos + 1][0]
        cim1 = chars[pos_word_len - 1]  # last character of the current word
        ci0 = chars[0]                  # first character of the current word

        head = str(tag) + "*"
        f = [
            "02:" + head + wi,
            "03:" + head + wim1,
            "04:" + head + wip1,
            "05:" + head + cim1m1,
            "06:" + head + cip10,
            "07:" + head + ci0,
            "08:" + head + cim1,
        ]
        # Interior characters only: the first and last are covered by 07/08.
        for i in range(1, pos_word_len - 1):
            cik = chars[i]
            f.append("09:" + head + cik)
            f.append("10:" + head + ci0 + "*" + cik)
            f.append("11:" + head + cim1 + "*" + cik)
            if cik == chars[i + 1]:
                f.append("13:" + head + cik + "*" + "consecutive")
        if pos_word_len == 1:
            f.append("12:" + head + wi + "*" + cim1m1 + "*" + cip10)
        # Prefixes and suffixes of length 1..4, always shorter than the word.
        for i in range(min(pos_word_len - 1, 4)):
            f.append("14:" + head + wi[0:(i + 1)])
            f.append("15:" + head + wi[-(i + 1)::])
        return f

    def create_feature_space(self):
        """Register every gold-tag feature of the training data with weight 0
        and count how often each tag occurs."""
        for s in self.train.sentences:
            for p in range(len(s.word)):
                gold = s.tag[p]
                for feature in self.create_feature_with_tag(s, p, gold):
                    self.model[feature] = 0
                self.tags[gold] = self.tags.get(gold, 0) + 1
        print("the total number of features is " + str(len(self.model)))
        print("the total number of tags is " + str(len(self.tags)))

    def dot(self, f):
        """Return the summed weight of the features in ``f``; features that
        are not in the model contribute nothing."""
        return sum(self.model.get(i, 0) for i in f)

    def max_tag(self, sentence, pos):
        """Return the known tag with the highest score for word ``pos``.

        A later tag replaces the current best only when it wins by more than
        1e-10, so ties keep the tag met first. Returns ``"NULL"`` when no tag
        scores above -1e10 (for instance when no tag is known yet).
        """
        maxnum = -1e10
        tag = "NULL"
        for t in self.tags:
            tempnum = self.dot(self.create_feature_with_tag(sentence, pos, t))
            if tempnum > (maxnum + 1e-10):
                maxnum = tempnum
                tag = t
        return tag

    def online_training(self, iterations=20):
        """Train for ``iterations`` passes over the training data.

        After each pass both corpora are evaluated and the model is saved;
        the best pass for each corpus is reported at the end.

        Args:
            iterations: number of passes over the training sentences.
        """
        # Best (iterator, correct, total, precision) seen so far. Starting
        # from a complete tuple keeps the final summary printable even when
        # precision never rises above 0.
        best_train = (None, 0, 0, 0)
        best_dev = (None, 0, 0, 0)
        total = len(self.train.sentences)
        for iterator in range(iterations):
            print("iterator " + str(iterator))
            for cnt, s in enumerate(self.train.sentences, 1):
                for p in range(len(s.word)):
                    predicted = self.max_tag(s, p)
                    correcttag = s.tag[p]
                    if predicted == correcttag:
                        continue
                    # Penalise the wrongly predicted tag's features and
                    # reward the gold tag's; only known features are touched.
                    for i in self.create_feature_with_tag(s, p, predicted):
                        if i in self.model:
                            self.model[i] -= 1
                    for i in self.create_feature_with_tag(s, p, correcttag):
                        if i in self.model:
                            self.model[i] += 1
                if cnt % 50 == 0:
                    print('Process: {}/{}\r'.format(cnt, total), end='')
            print('\n', end='')
            train_result = self.evaluate(self.train, iterator)
            dev_result = self.evaluate(self.dev, iterator)
            self.save_model(iterator)
            if train_result[3] > (best_train[3] + 1e-10):
                best_train = train_result
            if dev_result[3] > (best_dev[3] + 1e-10):
                best_dev = dev_result
        print("Conclusion:")
        print("\t"+self.train.name + " iterator: " + str(best_train[0]) + "\t" + str(best_train[1]) + " / " + str(best_train[2]) + " = " + str(best_train[3]))
        print("\t"+self.dev.name + " iterator: " + str(best_dev[0]) + "\t" + str(best_dev[1]) + " / " + str(best_dev[2]) + " = " + str(best_dev[3]))

    def save_model(self, iterator):
        """Write every ``feature<TAB>weight`` pair to ``linearmodel.lm<iterator>``
        as UTF-8 text."""
        # io.open + `with`: text I/O that works on Python 2 and 3, and the
        # handle is closed even if a write fails.
        with io.open("linearmodel.lm" + str(iterator), mode='w', encoding='utf-8') as fmodel:
            for key in self.model:
                fmodel.write(u"{}\t{}\n".format(key, self.model[key]))

    def evaluate(self, dataset, iterator):
        """Tag every word of ``dataset`` and measure per-word precision.

        One ``word<TAB>predicted<TAB>gold`` line per word is written to
        ``<dataset.name>.out<iterator>``.

        Args:
            dataset: object exposing ``name`` and ``sentences``.
            iterator: pass number, echoed back and used in the file name.

        Returns:
            Tuple ``(iterator, correct, total, precision)``; precision is
            ``0.0`` when the dataset holds no words.
        """
        c = 0
        count = 0
        with io.open(dataset.name + ".out" + str(iterator), mode='w', encoding='utf-8') as fout:
            for s in dataset.sentences:
                for p in range(len(s.word)):
                    count += 1
                    predicted = self.max_tag(s, p)
                    correcttag = s.tag[p]
                    fout.write(u"{}\t{}\t{}\n".format(s.word[p], predicted, correcttag))
                    if predicted == correcttag:
                        c += 1
        # Guard against an empty dataset instead of dividing by zero.
        precision = 1.0 * c / count if count else 0.0
        print(dataset.name + "\tprecision is " + str(c) + " / " + str(count) + " = " + str(precision))
        return iterator, c, count, precision
################################ main #####################################
if __name__ == '__main__':
    # Train the tagger end to end and report the elapsed wall-clock time.
    starttime = datetime.datetime.now()
    lm = linear_model()
    lm.create_feature_space()
    lm.online_training()
    elapsed = datetime.datetime.now() - starttime
    # total_seconds() covers the whole duration; the .seconds attribute
    # excludes whole days and would under-report a run longer than 24 hours.
    print("executing time is " + str(int(elapsed.total_seconds())) + " s")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement