Advertisement
Guest User

Untitled

a guest
Feb 20th, 2017
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.29 KB | None | 0 0
  1. # gensim modules
  2. from gensim import utils
  3. from gensim.models.doc2vec import TaggedDocument
  4. from gensim.models import Doc2Vec
  5.  
  6. # random shuffle
  7. from random import shuffle
  8.  
  9. # numpy
  10. import numpy
  11.  
  12. # classifier
  13. from sklearn.linear_model import LogisticRegression
  14.  
  15. import logging
  16. import sys
  17.  
  18. log = logging.getLogger()
  19. log.setLevel(logging.DEBUG)
  20.  
  21. ch = logging.StreamHandler(sys.stdout)
  22. ch.setLevel(logging.DEBUG)
  23. formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  24. ch.setFormatter(formatter)
  25. log.addHandler(ch)
  26.  
  27. class TaggedLineSentence(object):
  28. def __init__(self, sources):
  29. self.sources = sources
  30.  
  31. flipped = {}
  32.  
  33. # make sure that keys are unique
  34. for key, value in sources.items():
  35. if value not in flipped:
  36. flipped[value] = [key]
  37. else:
  38. raise Exception('Non-unique prefix encountered')
  39.  
  40. def __iter__(self):
  41. for source, prefix in self.sources.items():
  42. with utils.smart_open(source) as fin:
  43. for item_no, line in enumerate(fin):
  44. yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
  45.  
  46. def to_array(self):
  47. self.sentences = []
  48. for source, prefix in self.sources.items():
  49. with utils.smart_open(source) as fin:
  50. for item_no, line in enumerate(fin):
  51. self.sentences.append(TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
  52. return self.sentences
  53.  
  54. def sentences_perm(self):
  55. shuffle(self.sentences)
  56. return self.sentences
  57.  
  58.  
# Build the labeled corpus and train a Doc2Vec model over it.
log.info('source load')
# Each file maps to the tag prefix its lines are labeled with.
sources = {'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS', 'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS'}

log.info('TaggedDocument')
sentences = TaggedLineSentence(sources)

log.info('D2V')
# NOTE(review): 'size' is the pre-gensim-4.0 keyword (renamed
# 'vector_size' in 4.0) — this script presumably targets an old gensim;
# confirm the installed version.
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
# to_array() reads all source files and caches the documents.
model.build_vocab(sentences.to_array())

log.info('Epoch')
# 10 manual passes, reshuffling the cached documents before each one.
for epoch in range(10):
    log.info('EPOCH: {}'.format(epoch))
    # NOTE(review): newer gensim requires total_examples= and epochs=
    # arguments to train() — confirm this call matches the gensim version.
    model.train(sentences.sentences_perm())

log.info('Model Save')
# Persist, then reload — round-trips the model through disk.
model.save('./imdb.d2v')
model = Doc2Vec.load('./imdb.d2v')
  77.  
  78. log.info('Sentiment')
  79. train_arrays = numpy.zeros((25000, 100))
  80. train_labels = numpy.zeros(25000)
  81.  
  82. for i in range(12500):
  83. prefix_train_pos = 'TRAIN_POS_' + str(i)
  84. prefix_train_neg = 'TRAIN_NEG_' + str(i)
  85. train_arrays[i] = model.docvecs[prefix_train_pos]
  86. train_arrays[12500 + i] = model.docvecs[prefix_train_neg]
  87. train_labels[i] = 1
  88. train_labels[12500 + i] = 0
  89.  
  90.  
  91. test_arrays = numpy.zeros((25000, 100))
  92. test_labels = numpy.zeros(25000)
  93.  
  94. for i in range(12500):
  95. prefix_test_pos = 'TEST_POS_' + str(i)
  96. prefix_test_neg = 'TEST_NEG_' + str(i)
  97. test_arrays[i] = model.docvecs[prefix_test_pos]
  98. test_arrays[12500 + i] = model.docvecs[prefix_test_neg]
  99. test_labels[i] = 1
  100. test_labels[12500 + i] = 0
  101.  
  102. log.info('Fitting')
  103. classifier = LogisticRegression()
  104. classifier.fit(train_arrays, train_labels)
  105.  
  106. LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
  107. intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
  108.  
  109. print(classifier.score(test_arrays, test_labels))
  110. # classify input text
  111. text = input("Enter Your text:")
  112. print(classifier.predict(text.split()))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement