Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pickle
- import numpy as np
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.feature_extraction.text import TfidfTransformer
- import os
- import time
- import scipy.sparse as sparse
- from sklearn.base import clone
# Load the raw text corpus. Pickle streams are binary, so the file is opened
# in 'rb' mode and closed deterministically by the context manager.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('raw_text_dataset.pickle', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Tf-idf vectorizer with English stop words removed and L2-normalised rows.
vectorizer = TfidfVectorizer(
    stop_words='english',
    norm='l2',
    use_idf=True,
    analyzer='word',
    # Raw string is required: in a plain literal '\b' is the backspace
    # character (0x08), not the regex word-boundary assertion.
    # Tokens must start with an ASCII letter and be at least 2 characters.
    token_pattern=r'(?u)\b[a-zA-Z]\w+\b',
)

# Learn vocabulary / idf weights and build the document-term matrix.
# Presumably corpus[0] is the iterable of raw document strings -- confirm
# against whatever produced the pickle.
X_corpus_docterm = vectorizer.fit_transform(corpus[0])
def doc_vec(doc, orig_vectorizer):
    """
    Describe one document: its tf-idf row under an already-fitted vectorizer,
    plus its own tokens and raw token counts.

    doc : String argument of text we would like to analyze
    orig_vectorizer : our tfidf vectorizer from sklearn fit to our corpus from reuters

    Returns a dict with three keys:
      'vec'          : dense matrix with one row, the result of
                       orig_vectorizer.transform([doc]).todense()
      'doc_features' : list of the tokens a stop-word-filtered CountVectorizer
                       finds in doc
      'doc_counts'   : dict mapping each of those tokens to its count in doc

    NOTE(review): CountVectorizer.fit_transform is expected to raise
    ValueError when doc contains no tokens after stop-word removal -- confirm
    whether callers need that case handled.
    """
    # Transform with the vectorizer that was passed in (it must already be
    # fitted). This row should be the L2-normalised tf-idf vector; background:
    # https://stanford.edu/~rjweiss/public_html/IRiSS2013/text2/notebooks/tfidf.html
    # (helpful article, a bit of this was borrowed from it).
    new_term_freq_matrix = orig_vectorizer.transform([doc]).todense()

    # A separate count vectorizer fit on this document alone gives the
    # document's own tokens and their raw frequencies.
    # NOTE(review): this uses CountVectorizer's default token pattern, which
    # is not the custom pattern of the tf-idf vectorizer built in this file,
    # so the two may tokenize slightly differently.
    tf_vectorizer = CountVectorizer(
        stop_words='english')
    tf = tf_vectorizer.fit_transform([doc])

    # doc_features - the features after tokenization and pre-processing
    vocab = tf_vectorizer.vocabulary_

    # doc_counts - densify the single count row once, then look up each
    # token's column index in it.
    counts_row = tf.toarray()[0]
    word_counts = {term: counts_row[col] for term, col in vocab.items()}

    return {
        'vec': new_term_freq_matrix,
        # list(...) keeps this a real list on Python 3 as well as Python 2.
        'doc_features': list(vocab),
        'doc_counts': word_counts,
    }
# Sample documents used to exercise doc_vec against the fitted vectorizer.
doc_list = [
    "foo bar cadabra abra",
    "red pig green cow",
    "other test words that i may be curious about",
]

for d in doc_list:
    output = doc_vec(d, vectorizer)
    # Function-call form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare `print output` statement is not).
    print(output)
- {'doc_features': [u'cadabra', u'foo', u'bar', u'abra'], 'doc_counts': {u'cadabra': 1, u'foo': 1, u'bar': 1, u'abra': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
- {'doc_features': [u'green', u'red', u'cow', u'pig'], 'doc_counts': {u'green': 1, u'cow': 1, u'red': 1, u'pig': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
- {'doc_features': [u'test', u'curious', u'words'], 'doc_counts': {u'test': 1, u'curious': 1, u'words': 1}, 'vec': matrix([[ 0., 0., 0., ..., 0., 0., 0.]])}
Add Comment
Please, Sign In to add comment