Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Exploratory analysis of the MovieLens ratings files with pandas.
#
# Loads the users, ratings and movies tables, joins them into one frame, and
# then computes per-title mean ratings by gender, the titles with the largest
# male/female rating gap, and the titles with the highest rating spread.
# Bare expressions (e.g. ``data.iloc[0]``) are kept for interactive use, where
# the REPL/notebook displays their value; they have no effect in a batch run.
import pandas as pd

# Minimum number of ratings a title needs to be included in the rankings.
MIN_RATINGS = 250

# The .dat files have no header row and use '::' as the field delimiter.
# A multi-character separator requires the Python parsing engine, so it is
# requested explicitly instead of relying on pandas' fallback warning.
# Paths use forward slashes: a backslash path such as 'ch02\movielens\users.dat'
# contains '\u', which is an invalid escape sequence in a Python 3 string.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ch02/movielens/users.dat', sep='::', header=None,
                      names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ch02/movielens/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

# merge() joins on the columns the frames share: users/ratings on 'user_id',
# then the result with movies on 'movie_id'.
data = pd.merge(pd.merge(users, ratings), movies)
data.iloc[0]  # first row of the merged table

# Mean rating per title, with one column per gender value.
mean_ratings = data.pivot_table('rating', index='title', columns='gender',
                                aggfunc='mean')

# Next, keep only the movies that received at least MIN_RATINGS ratings.
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= MIN_RATINGS]
mean_ratings = mean_ratings.loc[active_titles]

# Titles ordered by mean rating in the 'F' column, highest first.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# To find the movies that are most divisive between male and female viewers,
# rank titles by the difference of the 'M' and 'F' mean ratings.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sort_by_diff = mean_ratings.sort_values(by='diff')
sort_by_diff[::-1][:15]  # reverse order of rows, take first 15 rows

# Standard deviation of ratings per title, restricted to the active titles;
# the ten largest values are the titles with the most rating disagreement.
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.sort_values(ascending=False)[:10]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement