Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Q6: for each user, output each user's useful votes
- # OUTPUT: userid, number of useful votes
- # SECONDARY OUTPUT: the average of total useful votes
- import numpy as np
- from pandas import *
- # for each user, store the "useful votes" (field #10) they have
- usefulVotesCount = dict()
- n = 0
- f = open('yelp_reviews.txt', 'r')
- for line in f:
- # ignore first row
- if (n > 0):
- # split into a list on the pipe delimiter
- currentRow = line.split("|")
- currentUserId = currentRow[6]
- currentUsefulVotes = float(currentRow[9])
- # save userIDs into dict.
- # if it's not there, create an entry
- if currentUserId not in usefulVotesCount:
- usefulVotesCount[currentUserId] = 0
- usefulVotesCount[currentUserId ] += currentUsefulVotes
- # increment what row we're on
- n += 1
- # end loop to build dict
- # put this dict into a series for easy analysis
- series = Series(usefulVotesCount)
- ########### OUTPUT
- # first, for each userid, output total number of reviews they had
- numberUsers = series.shape[0] # this is the length of series
- pandas.set_option("display.max_rows", numberUsers)
- print series
- # secondly, find the AVERAGE number of useful votes per user
- print series.mean() # -> 6.78323294735
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement