Advertisement
Guest User

Untitled

a guest
May 22nd, 2015
207
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.58 KB | None | 0 0
  1. import pymongo
  2. import json
  3. from pprint import pprint
  4. import numpy
  5. import scipy
  6. from tokenizer import Tokenizer
  7. import datetime
  8.  
  9. client = pymongo.MongoClient("localhost", 27017)
  10. db = client.yelp
  11. users_collection = db.users
  12. reviews_collection = db.reviews
  13. businesses_collection = db.businesses
  14. restaurant_review_collection = db.restaurant_reviews
  15. food_review_collection = db.food_reviews
  16. restaurants_collection = db.restaurants
  17. culinary_businesses_collection = db.culinary_businesses
  18. print db.name
  19.  
  20. # GET USER, TEXT PAIRS
  21. user_ids = list()
  22. user_texts = list()
  23. business_ids = list()
  24. business_texts = list()
  25.  
  26. print "#####################"
  27. print "# CALCULATING USERS #"
  28. print "#####################"
  29.  
  30. t1 = datetime.datetime.now()
  31. for user in users_collection.find().limit(10):
  32. u_id = user["user_id"]
  33. #print u_id
  34. reviews_current = restaurant_review_collection.find({"user_id":u_id})
  35. text = u_id # since some users have no reviews with text
  36. for review in reviews_current:
  37. text = text + " " + review["text"]
  38. user_ids.append(u_id)
  39. user_texts.append(text)
  40. #print "Review text length: " + str(len(text))
  41. t2 = datetime.datetime.now()
  42.  
  43. print "Execution time: %s" % (t2-t1)
  44.  
  45. print "##########################"
  46. print "# CALCULATING BUSINESSES #"
  47. print "##########################"
  48.  
  49. t1 = datetime.datetime.now()
  50. for restaurant in restaurants_collection.find().limit(100):
  51. b_id = restaurant["business_id"]
  52. print b_id
  53. reviews_current = restaurant_review_collection.find({"business_id":b_id})
  54. text = b_id # since some users have no reviews with text
  55. for review in reviews_current:
  56. text = text + " " + review["text"]
  57. text = text[0: min(len(text), 100000)]
  58. business_ids.append(b_id)
  59. business_texts.append(text)
  60. print "Review text length: " + str(len(text))
  61.  
  62. myTokenizer = Tokenizer(business_ids, business_texts)
  63. t2 = datetime.datetime.now()
  64.  
  65. print "Execution time: %s" % (t2-t1)
  66.  
  67. #myTokenizer = Tokenizer(user_ids, user_texts)
  68. #myTokenizer = Tokenizer(business_ids, business_texts)
  69.  
  70. #results = myTokenizer.getSimilarDocuments("KXJbnHT4PDS1JZNCFKdmMg", 5)
  71. results = myTokenizer.getSimilarDocuments("qw5gR8vW7mSOK4VROSwdMA", 5)
  72.  
  73. #print results
  74. print "USER 1-----------------------------------------"
  75. print "qw5gR8vW7mSOK4VROSwdMA"
  76. #print user_texts[user_ids.index("KXJbnHT4PDS1JZNCFKdmMg")]
  77. #print business_texts[business_ids.index("qw5gR8vW7mSOK4VROSwdMA")]
  78. print "USER 2-----------------------------------------"
  79. #print user_texts[user_ids.index(results[0])]
  80. #print business_texts[business_ids.index(results[0])]
  81. print results[0]
  82. print "USER 3-----------------------------------------"
  83. print results[1]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement