Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Demo: build raw term-count vectors for a tiny train/test corpus, then
# apply TF-IDF weighting to both.
# Fixes vs. the original snippet: ported from the Python 2 `print`
# statement to the Python 3 `print()` function; locals renamed to
# snake_case; stray "- " paste artifacts removed.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

train_set = ["The sky is blue.", "The sun is bright."]  # documents
test_set = ["The sun in the sky is bright."]            # query

stop_words = stopwords.words('english')
vectorizer = CountVectorizer(stop_words=stop_words)
transformer = TfidfTransformer()

# Learn the vocabulary from the training documents, then project the
# query onto that same vocabulary (transform only — no refit).
train_counts = vectorizer.fit_transform(train_set).toarray()
test_counts = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', train_counts)
print('Transform Vectorizer to test set', test_counts)

transformer.fit(train_counts)
print()
print(transformer.transform(train_counts).toarray())

# NOTE(review): re-fitting the transformer on the query derives IDF from
# the test set alone, which is statistically questionable; kept as-is to
# reproduce the original demo's printed output.
transformer.fit(test_counts)
print()
tfidf = transformer.transform(test_counts)
print(tfidf.todense())
- Fit Vectorizer to train set [[1 0 1 0]
- [0 1 0 1]]
- Transform Vectorizer to test set [[0 1 1 1]]
- [[ 0.70710678 0. 0.70710678 0. ]
- [ 0. 0.70710678 0. 0.70710678]]
- [[ 0. 0.57735027 0.57735027 0.57735027]]
- >>> from sklearn.feature_extraction.text import TfidfVectorizer
- >>> from sklearn.datasets import fetch_20newsgroups
- >>> twenty = fetch_20newsgroups()
- >>> tfidf = TfidfVectorizer().fit_transform(twenty.data)
- >>> tfidf
- <11314x130088 sparse matrix of type '<type 'numpy.float64'>'
- with 1787553 stored elements in Compressed Sparse Row format>
- >>> tfidf[0:1]
- <1x130088 sparse matrix of type '<type 'numpy.float64'>'
- with 89 stored elements in Compressed Sparse Row format>
- >>> from sklearn.metrics.pairwise import linear_kernel
- >>> cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
- >>> cosine_similarities
- array([ 1. , 0.04405952, 0.11016969, ..., 0.04433602,
- 0.04457106, 0.03293218])
- >>> related_docs_indices = cosine_similarities.argsort()[:-5:-1]
- >>> related_docs_indices
- array([ 0, 958, 10576, 3277])
- >>> cosine_similarities[related_docs_indices]
- array([ 1. , 0.54967926, 0.32902194, 0.2825788 ])
- >>> print twenty.data[0]
- From: lerxst@wam.umd.edu (where's my thing)
- Subject: WHAT car is this!?
- Nntp-Posting-Host: rac3.wam.umd.edu
- Organization: University of Maryland, College Park
- Lines: 15
- I was wondering if anyone out there could enlighten me on this car I saw
- the other day. It was a 2-door sports car, looked to be from the late 60s/
- early 70s. It was called a Bricklin. The doors were really small. In addition,
- the front bumper was separate from the rest of the body. This is
- all I know. If anyone can tellme a model name, engine specs, years
- of production, where this car is made, history, or whatever info you
- have on this funky looking car, please e-mail.
- Thanks,
- - IL
- ---- brought to you by your neighborhood Lerxst ----
- >>> print twenty.data[958]
- From: rseymour@reed.edu (Robert Seymour)
- Subject: Re: WHAT car is this!?
- Article-I.D.: reed.1993Apr21.032905.29286
- Reply-To: rseymour@reed.edu
- Organization: Reed College, Portland, OR
- Lines: 26
- In article <1993Apr20.174246.14375@wam.umd.edu> lerxst@wam.umd.edu (where's my
- thing) writes:
- >
- > I was wondering if anyone out there could enlighten me on this car I saw
- > the other day. It was a 2-door sports car, looked to be from the late 60s/
- > early 70s. It was called a Bricklin. The doors were really small. In
- addition,
- > the front bumper was separate from the rest of the body. This is
- > all I know. If anyone can tellme a model name, engine specs, years
- > of production, where this car is made, history, or whatever info you
- > have on this funky looking car, please e-mail.
- Bricklins were manufactured in the 70s with engines from Ford. They are rather
- odd looking with the encased front bumper. There aren't a lot of them around,
- but Hemmings (Motor News) ususally has ten or so listed. Basically, they are a
- performance Ford with new styling slapped on top.
- > ---- brought to you by your neighborhood Lerxst ----
- Rush fan?
- --
- Robert Seymour rseymour@reed.edu
- Physics and Philosophy, Reed College (NeXTmail accepted)
- Artificial Life Project Reed College
- Reed Solar Energy Project (SolTrain) Portland, OR
# Score three documents against the query "president of India" using
# TF-IDF cosine similarity.
# Fixes vs. the original snippet: the Python-2-only `str.decode(...)`
# call is replaced by opening the files in text mode with an explicit
# encoding, the file handles are closed via context managers (the
# original leaked all three), and the `print` statement is ported to
# Python 3.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


def _read_doc(path):
    """Return the file's text, silently dropping undecodable UTF-8 bytes."""
    with open(path, encoding="utf-8", errors="ignore") as f:
        return f.read()


doc1 = _read_doc("/root/Myfolder/scoringDocuments/doc1")
doc2 = _read_doc("/root/Myfolder/scoringDocuments/doc2")
doc3 = _read_doc("/root/Myfolder/scoringDocuments/doc3")

# Row 0 is the query; rows 1-3 are the documents being scored.
train_set = ["president of India", doc1, doc2, doc3]
tfidf_vectorizer = TfidfVectorizer()
# TF-IDF with L2 normalisation, so dot products are cosine similarities.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)
# The first row (the query) is matched against all four rows.
print("cosine scores ==> ",
      cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))
- [[ 1. 0.07102631 0.02731343 0.06348799]]
- cosine_function = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
# Demo: count-vectorize a two-document training set and a one-document
# query, print a raw-count cosine similarity for every (train, query)
# pair, then apply TF-IDF weighting to both sets.
# Fixes vs. the original snippet: Python 2 `print` statements ported to
# `print()`, the cosine lambda replaced by a named helper (PEP 8 E731),
# locals renamed to snake_case.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # documents
test_set = ["The sun in the sky is bright."]            # query

stop_words = stopwords.words('english')
vectorizer = CountVectorizer(stop_words=stop_words)
transformer = TfidfTransformer()

# Vocabulary is learned on the train set only; the query is projected
# onto that vocabulary.
train_counts = vectorizer.fit_transform(train_set).toarray()
test_counts = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', train_counts)
print('Transform Vectorizer to test set', test_counts)


def _cosine(a, b):
    """Cosine similarity of *a* and *b*, rounded to 3 decimals."""
    return round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3)


# Pairwise raw-count cosine of each training vector against the query.
for train_vector in train_counts:
    print(train_vector)
    for query_vector in test_counts:
        print(query_vector)
        print(_cosine(train_vector, query_vector))

transformer.fit(train_counts)
print()
print(transformer.transform(train_counts).toarray())

# NOTE(review): re-fitting on the query derives IDF from the test set
# alone; kept as-is to reproduce the original demo's printed output.
transformer.fit(test_counts)
print()
tfidf = transformer.transform(test_counts)
print(tfidf.todense())
- Fit Vectorizer to train set [[1 0 1 0]
- [0 1 0 1]]
- Transform Vectorizer to test set [[0 1 1 1]]
- [1 0 1 0]
- [0 1 1 1]
- 0.408
- [0 1 0 1]
- [0 1 1 1]
- 0.816
- [[ 0.70710678 0. 0.70710678 0. ]
- [ 0. 0.70710678 0. 0.70710678]]
- [[ 0. 0.57735027 0.57735027 0.57735027]]
# For each document, print its cosine *distance* to the LAST document
# after punctuation stripping, stop-word removal and Porter stemming,
# tracking the closest match.
# Fixes vs. the original snippet:
#   * `d.translate(None, string.punctuation)` is Python-2-only; Python 3
#     requires a `str.maketrans` table.
#   * `xrange` -> `range`, `print` statement -> `print()`.
#   * `tf_idf[l + 1]` indexed one row past the end of the matrix (rows
#     run 0..l); the last document is row `l`.
#   * `minimum` was re-initialised on every iteration, so the running
#     minimum never accumulated; it is now initialised once before the loop.
# NOTE(review): `documents` is assumed to be a list of strings defined
# elsewhere in the file — confirm against the caller.
from nltk.corpus import stopwords
import string
from nltk.tokenize import wordpunct_tokenize as tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
_strip_punct = str.maketrans('', '', string.punctuation)

# Tokenise, drop punctuation and stop words, and stem every document.
modified_arr = [
    [porter.stem(tok.lower())
     for tok in tokenize(doc.translate(_strip_punct))
     if tok.lower() not in stop_words]
    for doc in documents
]
# Re-join the token lists: TfidfVectorizer consumes strings, not lists.
modified_doc = [' '.join(tokens) for tokens in modified_arr]
tf_idf = TfidfVectorizer().fit_transform(modified_doc)

last = len(documents) - 1   # index of the document everything is scored against
minimum = (1, None)         # (smallest cosine distance seen, its doc index)
for i in range(last):
    minimum = min((cosine(tf_idf[i].todense(), tf_idf[last].todense()), i),
                  minimum)
    print(minimum)
# Build a TF-IDF matrix for `train_set` and score its last row against
# every row with cosine similarity.
# Fix vs. the original snippet: Python 2 `print` statements ported to
# the Python 3 `print()` function.
# NOTE(review): `train_set` (list of strings) and `length` (its row
# count) are assumed to be defined elsewhere in the file — confirm.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_set)
print(tfidf_matrix)
# Row `length - 1` (the last document) is compared against all rows.
cosine = cosine_similarity(tfidf_matrix[length - 1], tfidf_matrix)
print(cosine)
- [[ 0.34949812 0.81649658 1. ]]
Add Comment
Please, Sign In to add comment