# Train an ALS recommender on the ml-1m ratings and predict ratings for a custom user.

import pyspark
import pyspark.mllib.recommendation as reco

# Reuse the SparkContext that is already running (for example the `sc` that a
# notebook or the pyspark shell provides); otherwise start a local one that
# uses every available core. getOrCreate handles both cases explicitly, so a
# genuine startup failure surfaces with its real error instead of being hidden.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]')
)

# Load the ratings file as text and turn every line into a Rating record.
# The data is assumed to be well-formed (no error handling on parsing).
def _to_rating(line):
    """Build a Rating from the first three "::"-separated fields of a line."""
    fields = line.split("::")
    return reco.Rating(user=int(fields[0]), product=int(fields[1]), rating=int(fields[2]))


lines = sc.textFile("ml-1m/ratings.dat")
ratings = lines.map(_to_rating)

def _parse_movie(line):
    """Split a "::"-separated line into (int id, second field, third field split on '|')."""
    fields = line.split("::")
    return (int(fields[0]), fields[1], fields[2].split('|'))


lines = sc.textFile("ml-1m/movies.dat")
movies = lines.map(_parse_movie)

# Keep only the first 20 records when ordered by the '|'-split third field,
# turn them back into an RDD, and print each one.
first_twenty = movies.sortBy(lambda rec: rec[2]).take(20)
_movies = sc.parallelize(first_twenty)
for m in _movies.collect():
    print(m)
# User id under which the hand-entered ratings are recorded.
my_user_id = 0

# Hand-entered (movie id, rating) pairs; expanded below into
# (user id, movie id, rating) triples.
_my_picks = (
    (9, 5),
    (20, 5),
    (71, 4),
    (145, 5),
    (204, 4),
    (227, 5),
    (251, 5),
    (315, 5),
    (384, 4),
    (393, 4),
)

my_ratings = [(my_user_id, movie_id, score) for movie_id, score in _my_picks]

# Convert the hand-entered triples into Rating records and append them to the
# ratings loaded from file.
def _triple_to_rating(triple):
    """Build a Rating from a (user id, movie id, rating) triple."""
    user_id, movie_id, score = triple[0], triple[1], triple[2]
    return reco.Rating(user=int(user_id), product=int(movie_id), rating=int(score))


new_ratings = sc.parallelize(my_ratings).map(_triple_to_rating)
ratings = ratings.union(new_ratings)

# Count the ratings of each movie: emit (movie id, 1) per rating, then sum per key.
moviecounts = (
    ratings
    .map(lambda rec: (rec[1], 1))
    .reduceByKey(lambda left, right: left + right)
)

# Ids of the movies with fewer than 30 ratings, collected into a plain list.
too_few_ratings = (
    moviecounts
    .filter(lambda pair: pair[1] < 30)
    .keys()
    .collect()
)

# Hyper-parameters passed to ALS.train below.
iterations = 5
rank = 8
lambd = 0.3

# Model calculation: factors are constrained to be non-negative and the seed
# is fixed at 10.
model = reco.ALS.train(
    ratings,
    rank,
    iterations=iterations,
    lambda_=lambd,
    nonnegative=True,
    seed=10,
)

# Candidate movies for the custom user: start from `_movies` and drop every
# movie that the user already rated or that has too few ratings.
# The excluded ids are gathered into one set, built once up front, so each
# record costs a constant-time lookup instead of rebuilding and scanning a
# list inside the filter function.
excluded_movie_ids = {row[1] for row in my_ratings} | set(too_few_ratings)

my_not_rated = _movies.filter(lambda x: x[0] not in excluded_movie_ids)
my_not_rated_with_user_id = my_not_rated.map(lambda p: (my_user_id, p[0]))  # add userid

# Ask the model for a rating of every (user id, movie id) pair and keep only
# the second and third fields of each result (movie id, predicted rating).
predictions = model.predictAll(my_not_rated_with_user_id).map(lambda r: (r[1], r[2]))