Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2019
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  # Train a gensim Doc2Vec model on the reviews in dataset/waimai_10k_tw.csv.
  #
  # Pipeline: initialise jieba with the project dictionaries, segment the text
  # column of every CSV row into tokens, wrap each token list in a
  # TaggedDocument tagged with its row index, train, and save the model.
  import csv
  import logging
  import os

  from gensim.models.doc2vec import Doc2Vec, TaggedDocument
  import jieba

  # gensim reports per-epoch training progress through the logging module;
  # this replaces the manual per-iteration print of the old training loop.
  logging.basicConfig(
      format='%(asctime)s : %(levelname)s : %(message)s',
      level=logging.INFO,
  )

  # Initialise jieba: main dictionary, then the user dictionary, then force
  # the (otherwise lazy) dictionary load so it happens before segmentation.
  jieba.set_dictionary('dict/dict.txt.big')
  jieba.load_userdict('dict/my_dict')
  jieba.initialize()

  # Read waimai_10k_tw.csv and segment the text column with jieba.
  sentences = []
  # NOTE(review): assumes the CSV is UTF-8 (with or without BOM) -- confirm.
  # 'utf-8-sig' strips a leading BOM so the header check below still matches,
  # and an explicit encoding avoids depending on the platform locale.
  with open('dataset/waimai_10k_tw.csv', newline='', encoding='utf-8-sig') as f:
      for row in csv.reader(f):
          # csv.reader yields [] for blank lines; skip those and any row
          # without a text column instead of raising IndexError.
          if len(row) < 2:
              continue
          # Skip the header row.
          if row[0] == 'label':
              continue
          line = row[1].strip('\n')
          # cut_all=False selects jieba's accurate (non-exhaustive) mode.
          sentences.append(list(jieba.cut(line, cut_all=False)))

  # Data preparation: one TaggedDocument per sentence, tagged with its index.
  tagged_data = [
      TaggedDocument(sentence, [str(i)])
      for i, sentence in enumerate(sentences)
  ]

  # Training hyper-parameters.
  max_epochs = 50
  vec_size = 200
  alpha = 0.03

  # Create the output directory up front so a missing folder fails fast
  # rather than after the whole training run.
  os.makedirs('d2vmodel', exist_ok=True)

  model = Doc2Vec(
      vector_size=vec_size,
      alpha=alpha,
      min_alpha=0.00025,
      min_count=1,
      dm=1,
      epochs=max_epochs,
  )
  model.build_vocab(tagged_data)

  # Call train() exactly once. gensim performs `epochs` passes over the corpus
  # and linearly decays the learning rate from `alpha` to `min_alpha` itself.
  # Calling train() repeatedly in a loop while adjusting alpha by hand is
  # discouraged by the gensim documentation: every call already runs
  # `model.epochs` passes, and the hand-managed decay overrides `min_alpha`.
  model.train(
      tagged_data,
      total_examples=model.corpus_count,
      epochs=model.epochs,
  )

  model.save('d2vmodel/d2vmodel.model')
  print('finish')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement