Guest User

Untitled

a guest
Oct 21st, 2017
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.25 KB | None | 0 0
  1. import pickle
  2. import numpy as np
  3. from sklearn.feature_extraction.text import TfidfVectorizer
  4. from sklearn.feature_extraction.text import CountVectorizer
  5. from sklearn.feature_extraction.text import TfidfTransformer
  6. import os
  7. import time
  8. import scipy.sparse as sparse
  9. from sklearn.base import clone
  10.  
  11. corpus = pickle.load(open('raw_text_dataset.pickle','r'))
  12.  
  13. vectorizer = TfidfVectorizer(
  14. stop_words='english',
  15. norm='l2',
  16. use_idf=True,
  17. analyzer='word',
  18. token_pattern='(?u)\b[a-zA-Z]\w+\b',
  19. )
  20.  
  21. X_corpus_docterm = vectorizer.fit_transform(corpus[0])
  22.  
  23. def doc_vec(doc, orig_vectorizer):
  24. """
  25. doc : String argument of text we would like to analyze
  26.  
  27. orig_vectorizer : our tfidf vectorizer from sklearn fit to our corpus from reuters
  28.  
  29. """
  30. doc_vectorizer = clone(orig_vectorizer)
  31.  
  32. # new_term_freq_matrix - should be our L2 according to
  33. # https://stanford.edu/~rjweiss/public_html/IRiSS2013/text2/notebooks/tfidf.html
  34. # Please note - helpful article and borrowed a bit from:
  35. new_term_freq_matrix = vectorizer.transform([doc]).todense()
  36.  
  37. tf_vectorizer = CountVectorizer(
  38. stop_words='english')
  39. tf = tf_vectorizer.fit_transform([doc])
  40.  
  41. # doc_features - the features after tokenization and pre-processing
  42. vocab = tf_vectorizer.vocabulary_
  43.  
  44. # doc_counts
  45. word_counts = {}
  46. for v in vocab:
  47. word_counts[v] = tf.toarray()[0][vocab[v]]
  48.  
  49. return {
  50. 'vec':new_term_freq_matrix,
  51. 'doc_features': vocab.keys(),
  52. 'doc_counts':word_counts,
  53. }
  54.  
  55. doc_list = [
  56. "foo bar cadabra abra",
  57. "red pig green cow",
  58. "other test words that i may be curious about",
  59. ]
  60.  
  61. for d in doc_list:
  62. output = doc_vec(d,vectorizer)
  63. print output
  64.  
  65. {'doc_features': [u'cadabra', u'foo', u'bar', u'abra'], 'doc_counts': {u'cadabra': 1, u'foo': 1, u'bar': 1, u'abra': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
  66. {'doc_features': [u'green', u'red', u'cow', u'pig'], 'doc_counts': {u'green': 1, u'cow': 1, u'red': 1, u'pig': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
  67. {'doc_features': [u'test', u'curious', u'words'], 'doc_counts': {u'test': 1, u'curious': 1, u'words': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
Add Comment
Please, Sign In to add comment