daily pastebin goal
2%
SHARE
TWEET

ssqdsd

a guest Apr 21st, 2017 44 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # ALS movie-recommendation script (list-number prefixes from the paste removed).
import pyspark
import pyspark.mllib.recommendation as reco

# Reuse a SparkContext that is already running (e.g. in the pyspark shell or a
# notebook); otherwise start a local one using all cores. This replaces a bare
# `except:` fallback that hid every construction error and raised NameError
# when no `sc` had been predefined.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]'))


def _to_rating(fields):
    """Build an MLlib Rating from a (user, product, rating) triple.

    Each element is passed through int(), so both the "::"-split text
    fields and plain Python numbers are accepted.
    """
    return reco.Rating(user=int(fields[0]),
                       product=int(fields[1]),
                       rating=int(fields[2]))


# Read the ratings file as "::"-separated lines of text, assuming the data
# contains no malformed rows.
lines = sc.textFile("ml-1m/ratings.dat")
ratings = lines.map(lambda l: l.split("::")).map(_to_rating)

# Movies become (id, second field, third field split on '|').
# Presumably the second field is the title and the third the genre list --
# TODO confirm against the dataset README.
lines = sc.textFile("ml-1m/movies.dat")
movies = lines.map(lambda l: l.split("::")) \
              .map(lambda p: (int(p[0]), p[1], p[2].split('|')))

# Keep the first 20 movies when ordered by the split third field, and show them.
_movies = sc.parallelize(movies.sortBy(lambda a: a[2]).take(20))
for m in _movies.collect():
    print(m)

my_user_id = 0

# Hand-entered ratings for the extra user: (user id, movie id, rating).
my_ratings = [
    (my_user_id, 9, 5),
    (my_user_id, 20, 5),
    (my_user_id, 71, 4),
    (my_user_id, 145, 5),
    (my_user_id, 204, 4),
    (my_user_id, 227, 5),
    (my_user_id, 251, 5),
    (my_user_id, 315, 5),
    (my_user_id, 384, 4),
    (my_user_id, 393, 4),
]

new_ratings = sc.parallelize(my_ratings).map(_to_rating)
ratings = ratings.union(new_ratings)

# Count how many ratings each movie has.
moviecounts = ratings.map(lambda r: (r[1], 1)).reduceByKey(lambda a, b: a + b)

# Movies with fewer than 30 ratings, reduced to their IDs on the driver.
too_few_ratings = moviecounts.filter(lambda x: x[1] < 30)
too_few_ratings = too_few_ratings.map(lambda x: x[0]).collect()

# ALS hyper-parameters.
iterations = 5
rank = 8
lambd = 0.3

# Model calculation; keyword arguments make the positional order explicit.
model = reco.ALS.train(ratings, rank, iterations=iterations, lambda_=lambd,
                       nonnegative=True, seed=10)

# Build the exclusion sets once so each filter test is an O(1) lookup instead
# of rebuilding and scanning a list for every candidate movie.
my_rated_ids = frozenset(row[1] for row in my_ratings)
too_few_ids = frozenset(too_few_ratings)

# NOTE(review): candidates come from the 20-movie sample `_movies`, not from
# the full `movies` RDD -- confirm this is intended.
my_not_rated = _movies.filter(lambda x: x[0] not in my_rated_ids)  # remove my rated
my_not_rated = my_not_rated.filter(lambda x: x[0] not in too_few_ids)  # remove with too few ratings
my_not_rated_with_user_id = my_not_rated.map(lambda p: (my_user_id, p[0]))  # add userid

# (movie id, predicted rating) pairs for the remaining candidates.
predictions = model.predictAll(my_not_rated_with_user_id).map(lambda r: (r[1], r[2]))
RAW Paste Data
Top