from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # Documents
test_set = ["The sun in the sky is bright."]  # Query
stopWords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopWords)
# print(vectorizer)
transformer = TfidfTransformer()
# print(transformer)

trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

# Refitting on the test counts recomputes IDF from the query alone; it
# reproduces the output below, but in practice you would keep the
# transformer fitted on the training counts.
transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())

Fit Vectorizer to train set [[1 0 1 0]
 [0 1 0 1]]
Transform Vectorizer to test set [[0 1 1 1]]

[[ 0.70710678  0.          0.70710678  0.        ]
 [ 0.          0.70710678  0.          0.70710678]]

[[ 0.          0.57735027  0.57735027  0.57735027]]
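
The snippet stops at the tf-idf vectors; a minimal sketch of the remaining step (my own, assuming the printed vectors above), computing the cosine between the query vector and each training vector:

import numpy as np
import numpy.linalg as LA

def cosine_sim(a, b):
    # cos(theta) = a . b / (|a| * |b|)
    return np.inner(a, b) / (LA.norm(a) * LA.norm(b))

# tf-idf vectors taken from the printed output above
train_tfidf = np.array([[0.70710678, 0.0, 0.70710678, 0.0],
                        [0.0, 0.70710678, 0.0, 0.70710678]])
query_tfidf = np.array([0.0, 0.57735027, 0.57735027, 0.57735027])

for row in train_tfidf:
    print(cosine_sim(row, query_tfidf))  # ~0.408, then ~0.816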

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> from sklearn.datasets import fetch_20newsgroups
>>> twenty = fetch_20newsgroups()

>>> tfidf = TfidfVectorizer().fit_transform(twenty.data)
>>> tfidf
<11314x130088 sparse matrix of type '<class 'numpy.float64'>'
    with 1787553 stored elements in Compressed Sparse Row format>

>>> tfidf[0:1]
<1x130088 sparse matrix of type '<class 'numpy.float64'>'
    with 89 stored elements in Compressed Sparse Row format>

>>> from sklearn.metrics.pairwise import linear_kernel
>>> cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
>>> cosine_similarities
array([ 1.        ,  0.04405952,  0.11016969, ...,  0.04433602,
        0.04457106,  0.03293218])

>>> related_docs_indices = cosine_similarities.argsort()[:-5:-1]
>>> related_docs_indices
array([    0,   958, 10576,  3277])
>>> cosine_similarities[related_docs_indices]
array([ 1.        ,  0.54967926,  0.32902194,  0.2825788 ])
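
As a hedged follow-up (not part of the original paste), the argsort trick above can be wrapped in a small helper; the function name and signature are my own. Because TfidfVectorizer L2-normalizes its rows, linear_kernel here equals cosine similarity:

from sklearn.metrics.pairwise import linear_kernel

def most_similar(tfidf, index, top_n=3):
    # Cosine similarity of document `index` against every row of `tfidf`.
    sims = linear_kernel(tfidf[index:index + 1], tfidf).flatten()
    # Take the top_n + 1 hits in descending order, then drop the document itself.
    best = sims.argsort()[:-(top_n + 2):-1]
    return [(i, sims[i]) for i in best if i != index][:top_n]

# e.g. most_similar(tfidf, 0) -> [(958, 0.549...), (10576, 0.329...), (3277, 0.282...)]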

>>> print(twenty.data[0])
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
---- brought to you by your neighborhood Lerxst ----

>>> print(twenty.data[958])
From: rseymour@reed.edu (Robert Seymour)
Subject: Re: WHAT car is this!?
Article-I.D.: reed.1993Apr21.032905.29286
Reply-To: rseymour@reed.edu
Organization: Reed College, Portland, OR
Lines: 26

In article <1993Apr20.174246.14375@wam.umd.edu> lerxst@wam.umd.edu (where's my
thing) writes:
>
> I was wondering if anyone out there could enlighten me on this car I saw
> the other day. It was a 2-door sports car, looked to be from the late 60s/
> early 70s. It was called a Bricklin. The doors were really small. In
addition,
> the front bumper was separate from the rest of the body. This is
> all I know. If anyone can tellme a model name, engine specs, years
> of production, where this car is made, history, or whatever info you
> have on this funky looking car, please e-mail.

Bricklins were manufactured in the 70s with engines from Ford. They are rather
odd looking with the encased front bumper. There aren't a lot of them around,
but Hemmings (Motor News) ususally has ten or so listed. Basically, they are a
performance Ford with new styling slapped on top.

> ---- brought to you by your neighborhood Lerxst ----

Rush fan?

--
Robert Seymour                              rseymour@reed.edu
Physics and Philosophy, Reed College        (NeXTmail accepted)
Artificial Life Project                     Reed College
Reed Solar Energy Project (SolTrain)        Portland, OR

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

with open("/root/Myfolder/scoringDocuments/doc1", encoding="utf-8", errors="ignore") as f:
    doc1 = f.read()
with open("/root/Myfolder/scoringDocuments/doc2", encoding="utf-8", errors="ignore") as f:
    doc2 = f.read()
with open("/root/Myfolder/scoringDocuments/doc3", encoding="utf-8", errors="ignore") as f:
    doc3 = f.read()

train_set = ["president of India", doc1, doc2, doc3]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)  # tf-idf scores, L2-normalized
# The first row (the query) is compared against all four rows, itself included.
print("cosine scores ==> ", cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))

[[ 1.          0.07102631  0.02731343  0.06348799]]
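
A short follow-up sketch (mine, not in the original) for picking the closest document from those scores; index 0 is the query itself, so it is skipped:

scores = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train).flatten()
best = scores[1:].argmax() + 1   # skip index 0, the query itself
print(best, scores[best])        # here: 1 0.07102631 (doc1 is closest)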

# Cosine similarity by hand: cos(a, b) = a . b / (|a| * |b|)
cosine_function = lambda a, b: round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # Documents
test_set = ["The sun in the sky is bright."]  # Query
stopWords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)
cx = lambda a, b: round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3)

# Cosine between each raw count vector in the train set and the query vector
for vector in trainVectorizerArray:
    print(vector)
    for testV in testVectorizerArray:
        print(testV)
        cosine = cx(vector, testV)
        print(cosine)

transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())

Fit Vectorizer to train set [[1 0 1 0]
 [0 1 0 1]]
Transform Vectorizer to test set [[0 1 1 1]]
[1 0 1 0]
[0 1 1 1]
0.408
[0 1 0 1]
[0 1 1 1]
0.816

[[ 0.70710678  0.          0.70710678  0.        ]
 [ 0.          0.70710678  0.          0.70710678]]

[[ 0.          0.57735027  0.57735027  0.57735027]]
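
As a sanity check (my addition, not part of the original paste), the 0.408 above is just the count-vector cosine worked by hand:

# [1 0 1 0] . [0 1 1 1] = 1
# |[1 0 1 0]| = sqrt(2),  |[0 1 1 1]| = sqrt(3)
# cos = 1 / (sqrt(2) * sqrt(3)) = 1 / sqrt(6) ~ 0.408
# Likewise [0 1 0 1] . [0 1 1 1] = 2, giving 2 / sqrt(6) ~ 0.816.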

from nltk.corpus import stopwords
import string
from nltk.tokenize import wordpunct_tokenize as tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine  # NB: a distance, i.e. 1 - similarity

porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Assumes `documents` is a list of raw strings, with the query as its last element.
# Strip punctuation, lowercase, drop stopwords, and stem each remaining token.
remove_punct = str.maketrans('', '', string.punctuation)
modified_arr = [[porter.stem(i.lower())
                 for i in tokenize(d.translate(remove_punct))
                 if i.lower() not in stop_words]
                for d in documents]

modified_doc = [' '.join(i) for i in modified_arr]  # rejoin token lists into the strings the vectorizer expects
tf_idf = TfidfVectorizer().fit_transform(modified_doc)

# Find the document closest to the last one (the query): track the minimum
# cosine *distance* over all earlier documents.
l = len(documents) - 1
minimum = (1, None)
for i in range(l):
    minimum = min((cosine(tf_idf[i].toarray().ravel(), tf_idf[l].toarray().ravel()), i), minimum)
print(minimum)
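
For a concrete run of the snippet above, a hypothetical `documents` list (my own, for illustration) could be:

documents = ["The sky is blue.",
             "The sun is bright.",
             "The sun in the sky is bright."]  # last entry is the query
# prints a (distance, index) tuple for the document closest to the query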

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assumes `train_set` is a list of document strings, with the query last,
# and `length = len(train_set)`.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_set)
print(tfidf_matrix)
cosine = cosine_similarity(tfidf_matrix[length-1], tfidf_matrix)
print(cosine)

[[ 0.34949812  0.81649658  1.        ]]
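
Finally, if you want the angle itself rather than its cosine, a hedged one-liner (my addition):

import math
angle = math.degrees(math.acos(0.81649658))  # second score from the output above
print(angle)  # ~35.26 degrees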