# dat q6

# Q6: for each user, output that user's total useful votes
# OUTPUT: userid, total number of useful votes
# SECONDARY OUTPUT: the average of the per-user useful-vote totals
import numpy as np
from pandas import *

# For each user, accumulate the "useful votes" (field #10, index 9) over all
# of that user's rows. Keys are the user IDs (field #7, index 6) as strings;
# values are float totals.
usefulVotesCount = dict()

# `with` guarantees the file handle is closed, even if a row fails to parse
with open('yelp_reviews.txt', 'r') as f:

    # ignore first row (it is skipped without being parsed); the default
    # argument keeps an empty file from raising StopIteration
    next(f, None)

    for line in f:

        # tolerate blank lines (e.g. a trailing newline at end of file),
        # which would otherwise fail with IndexError below
        if not line.strip():
            continue

        # split into a list on the pipe delimiter
        currentRow = line.split("|")
        currentUserId = currentRow[6]
        # float() ignores surrounding whitespace
        currentUsefulVotes = float(currentRow[9])

        # add to this user's running total, starting from 0 the first time
        # the user ID is seen
        usefulVotesCount[currentUserId] = (
            usefulVotesCount.get(currentUserId, 0) + currentUsefulVotes
        )

# end loop to build dict
# put this dict into a series for easy analysis
series = Series(usefulVotesCount)

########### OUTPUT
# first, for each userid, output the total number of useful votes they had
numberUsers = series.shape[0]  # this is the length of series

# Raise the display limit to the number of users so the whole series is shown.
# set_option is called by its bare name because `from pandas import *` does
# not bind the name `pandas` itself.
set_option("display.max_rows", numberUsers)

# single-argument print(...) behaves the same on Python 2 and Python 3
print(series)

# secondly, find the AVERAGE number of useful votes per user
print(series.mean())  # -> 6.78323294735 (result noted in the original script)