from __future__ import division
import string
import math

# Naive tokenizer: lowercase the document and split on single spaces.
# Punctuation stays attached to tokens (e.g. "economy." is not "economy").
tokenize = lambda doc: doc.lower().split(" ")

document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

def jaccard_similarity(query, document):
    # Jaccard similarity: |intersection| / |union| of the two token sets.
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection) / len(union)

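# Example: Jaccard similarity between the two China-related documents, e.g.
#   jaccard_similarity(tokenize(document_0), tokenize(document_1))
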
def term_frequency(term, tokenized_document):
    # Raw term frequency: number of times the term occurs in the document.
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    # Sublinear (logarithmic) term frequency: 1 + log(tf), or 0 if the term is absent.
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    # Augmented term frequency: 0.5 + 0.5 * tf / max_tf, normalized by the
    # most frequent term in the document to dampen the effect of document length.
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document)) / max_count))

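# Quick comparison of the three weighting schemes on one document, e.g.
#   doc = tokenize(document_6)
#   term_frequency("putin", doc)            # raw count (2 in this document)
#   sublinear_term_frequency("putin", doc)  # 1 + log(2)
#   augmented_term_frequency("putin", doc)  # scaled against the most frequent token
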
def inverse_document_frequencies(tokenized_documents):
    # IDF with the formula 1 + log(N / df), where N is the number of documents
    # and df is the number of documents that contain the token.
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents) / sum(contains_token))
    return idf_values

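# A token that appears in most documents gets a lower IDF than a rare one, e.g.
#   idfs = inverse_document_frequencies([tokenize(d) for d in all_documents])
#   idfs["the"] < idfs["cuba"]
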
def tfidf(documents):
    # Build a TF-IDF vector for every document using sublinear term frequency.
    # Every vector is ordered by the keys of the shared IDF dictionary, so all
    # documents share the same dimensions.
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

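# Hand-rolled TF-IDF representation of the corpus: one weight vector per document.
tfidf_representation = tfidf(all_documents)
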
# The same idea in scikit-learn: L2-normalized vectors, sublinear TF,
# un-smoothed IDF, and the naive whitespace tokenizer defined above.
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
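
# Inspecting the result (assuming scikit-learn >= 1.0 for get_feature_names_out):
#   sklearn_representation.toarray()        # dense (7 x vocabulary_size) matrix
#   sklearn_tfidf.get_feature_names_out()   # the vocabulary, one entry per column
# The scikit-learn weights differ from the hand-rolled ones mainly because the
# vectorizer L2-normalizes each document vector.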