Advertisement
Guest User

Untitled

a guest
Jul 31st, 2015
179
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.39 KB | None | 0 0
  # Bag-of-words representations for the three sets: unlabelled, train and test.
#
# Pipeline: count-vectorise the text, fit an LDA topic model on the unlabelled
# documents, project the labelled train/test documents into the topic space,
# then fit a logistic-regression classifier on those topic features and print
# its accuracy on the test set.
#
# Presumably column 2 of each array holds the raw text and column 0 the
# integer class label -- confirm against the code that builds these arrays.
vectorizer = CountVectorizer(max_features=3000, stop_words='english')

# The vocabulary is learned from the unlabelled set only; train and test are
# mapped onto that same vocabulary so all three matrices share their columns.
# NOTE: despite the "tfidf" in the names, these matrices hold raw term counts.
corpus_tfidf_unsuper = vectorizer.fit_transform(train_data_unsupervised[:, 2])
corpus_tfidf_train = vectorizer.transform(train_ds[:, 2])
corpus_tfidf_test = vectorizer.transform(test_ds[:, 2])

# Transform to gensim-acceptable objects.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when upgrading.
vocab = vectorizer.get_feature_names()
id2word_unsuper = dict(enumerate(vocab))
# Sparse2Corpus treats columns as documents by default, while the vectorizer
# emits one row per document -- hence the transposes.
corpus_vect_gensim_unsuper = matutils.Sparse2Corpus(corpus_tfidf_unsuper.T)
corpus_vect_gensim_train = matutils.Sparse2Corpus(corpus_tfidf_train.T)
corpus_vect_gensim_test = matutils.Sparse2Corpus(corpus_tfidf_test.T)

# Fit the model to the unlabelled data.
lda = models.LdaModel(
    corpus_vect_gensim_unsuper,
    id2word=id2word_unsuper,
    num_topics=10,
    passes=1,
)

# Transform the train and test sets to the latent topic space.
docTopicProbMat_train = lda[corpus_vect_gensim_train]
docTopicProbMat_test = lda[corpus_vect_gensim_test]

# Transform to sparse matrices of shape (num_topics, num_docs).
# num_terms must be given explicitly: the model omits low-probability topics
# from each document vector, so without it corpus2csc infers the row count
# from the highest topic id it happens to see, and train/test can end up with
# different numbers of features.
train_lda = matutils.corpus2csc(docTopicProbMat_train, num_terms=lda.num_topics)
test_lda = matutils.corpus2csc(docTopicProbMat_test, num_terms=lda.num_topics)

# Fit the classifier and print the accuracy.
# The matrices are transposed back to (num_docs, num_topics) for scikit-learn.
clf = LogisticRegression()
clf.fit(train_lda.transpose(), np.array(train_ds[:, 0]).astype(int))
ypred = clf.predict(test_lda.transpose())
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(accuracy_score(test_ds[:, 0].astype(int), ypred))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement