# NOTE(review): Pastebin site boilerplate removed so this file parses as Python.
import pymongo
import json
from pprint import pprint
import numpy
import scipy
from tokenizer import Tokenizer
import datetime

# Connect to the local MongoDB instance and bind the Yelp collections used
# throughout this script.
client = pymongo.MongoClient("localhost", 27017)
db = client.yelp
users_collection = db.users
reviews_collection = db.reviews
businesses_collection = db.businesses
restaurant_review_collection = db.restaurant_reviews
food_review_collection = db.food_reviews
restaurants_collection = db.restaurants
culinary_businesses_collection = db.culinary_businesses

# Single-argument print(...) call form is valid under both Python 2 and 3
# (the original Python 2 `print x` statements fail to parse on Python 3).
print(db.name)
# GET USER, TEXT PAIRS
# For each sampled user, build one "document": the user id followed by the
# concatenated text of all of that user's restaurant reviews.
user_ids = list()
user_texts = list()
business_ids = list()
business_texts = list()

print("#####################")
print("# CALCULATING USERS #")
print("#####################")
t1 = datetime.datetime.now()
for user in users_collection.find().limit(10):
    u_id = user["user_id"]
    # Seed with the user id so users with no reviews still yield a
    # non-empty document.
    parts = [u_id]
    for review in restaurant_review_collection.find({"user_id": u_id}):
        parts.append(review["text"])
    user_ids.append(u_id)
    # " ".join is linear in total length; the original repeated
    # `text = text + " " + ...` rebuild is quadratic.
    user_texts.append(" ".join(parts))
t2 = datetime.datetime.now()
print("Execution time: %s" % (t2 - t1))
print("##########################")
print("# CALCULATING BUSINESSES #")
print("##########################")
t1 = datetime.datetime.now()
for restaurant in restaurants_collection.find().limit(100):
    b_id = restaurant["business_id"]
    print(b_id)
    # Seed with the business id so businesses with no reviews still yield
    # a non-empty document.
    parts = [b_id]
    for review in restaurant_review_collection.find({"business_id": b_id}):
        parts.append(review["text"])
    # Cap each document at 100000 chars. Truncating once after joining is
    # equivalent to the original per-iteration
    # `text = text[0:min(len(text), 100000)]`: once capped, the string is a
    # fixed prefix of the cumulative concatenation. join() also avoids the
    # quadratic repeated-`+` rebuilds.
    text = " ".join(parts)[:100000]
    business_ids.append(b_id)
    business_texts.append(text)
    print("Review text length: " + str(len(text)))
# Build the tokenizer once over the complete business corpus.
# NOTE(review): indentation was lost in the paste; this assumes the Tokenizer
# was constructed after the loop, not rebuilt per-iteration — confirm.
myTokenizer = Tokenizer(business_ids, business_texts)
t2 = datetime.datetime.now()
print("Execution time: %s" % (t2 - t1))
# Rank the 5 documents most similar to the query business id and print the
# query plus the top two matches. (An earlier user-mode run queried
# Tokenizer(user_ids, user_texts) instead; see version history.)
results = myTokenizer.getSimilarDocuments("qw5gR8vW7mSOK4VROSwdMA", 5)

print("USER 1-----------------------------------------")
print("qw5gR8vW7mSOK4VROSwdMA")
print("USER 2-----------------------------------------")
print(results[0])
print("USER 3-----------------------------------------")
print(results[1])
# NOTE(review): trailing Pastebin boilerplate removed so this file parses as Python.