Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from math import sqrt
- import matplotlib.pyplot as plt
- import csv
# Sentinel rating value: the loaders skip any rating equal to it.
DATA_POINT_NULL = 99

# Colour codes handed to the scatter plot in visualize():
# GFX_COLOR_1 for negative ratings, GFX_COLOR_2 for all others.
GFX_COLOR_1 = 1
GFX_COLOR_2 = 2
def load_data_user_based(path):
    """Load a ratings file into a ``{user_id: {joke_id: rating}}`` mapping.

    The file is read as space-delimited text with one row per user.  Decimal
    commas are converted to decimal points before parsing.  Column 0 of each
    row is ignored (presumably a per-user rating count -- confirm against the
    data set); columns 1..n become joke ids 1..n.  Ratings equal to
    ``DATA_POINT_NULL`` are left out.

    Args:
        path: Path of the ratings file to read.

    Returns:
        dict mapping the 1-based row number (used as the user id) to a dict of
        ``joke_id -> float rating``.  A user without any rating still gets an
        empty dict.
    """
    ratings_by_user = {}
    with open(path, 'r') as handle:
        rows = csv.reader(handle, delimiter=' ', quotechar='"')
        for user_id, row in enumerate(rows, 1):
            values = [float(cell.replace(',', '.')) for cell in row]
            # values[1:] drops column 0, so numbering restarts at joke id 1.
            ratings_by_user[user_id] = {
                joke_id: value
                for joke_id, value in enumerate(values[1:], 1)
                if value != DATA_POINT_NULL
            }
    return ratings_by_user
def load_data_item_based(path):
    """Load a ratings file into a ``{joke_id: {user_id: rating}}`` mapping.

    The file is read as space-delimited text with one row per user.  Decimal
    commas are converted to decimal points before parsing.  Column 0 of each
    row is ignored (presumably a per-user rating count -- confirm against the
    data set); columns 1..n become joke ids 1..n.  Ratings equal to
    ``DATA_POINT_NULL`` are left out.

    Args:
        path: Path of the ratings file to read.

    Returns:
        dict mapping each joke id to a dict of ``user_id -> float rating``,
        where the user id is the 1-based row number.  Every joke column seen
        in the file gets an entry, even if nobody rated it.
    """
    ratings_by_joke = {}
    with open(path, 'r') as handle:
        rows = csv.reader(handle, delimiter=' ', quotechar='"')
        for user_id, row in enumerate(rows, 1):
            values = [float(cell.replace(',', '.')) for cell in row]
            # values[1:] drops column 0, so numbering restarts at joke id 1.
            for joke_id, value in enumerate(values[1:], 1):
                # Register the joke before the null check so unrated jokes
                # still appear as keys.
                raters = ratings_by_joke.setdefault(joke_id, {})
                if value != DATA_POINT_NULL:
                    raters[user_id] = value
    return ratings_by_joke
def visualize(prefs):
    """Show a scatter plot of which (outer key, inner key) pairs have ratings.

    Each stored rating becomes one point: x is the outer key of ``prefs``,
    y is the inner key.  Negative ratings are coloured with ``GFX_COLOR_1``,
    all others with ``GFX_COLOR_2``.  Ratings equal to ``DATA_POINT_NULL``
    are not plotted.  Both axes are fixed to the range 0..100.

    Args:
        prefs: Nested mapping ``{outer_id: {inner_id: rating}}``.
    """
    xs, ys, colours = [], [], []
    for outer_id in prefs:
        for inner_id, raw_rating in prefs[outer_id].items():
            rating = float(raw_rating)
            if rating == DATA_POINT_NULL:
                continue
            xs.append(int(outer_id))
            ys.append(int(inner_id))
            colours.append(GFX_COLOR_1 if rating < 0 else GFX_COLOR_2)

    plt.xlim([0, 100])
    plt.ylim([0, 100])
    plt.scatter(xs, ys, c=colours)
    plt.show()
- #homer
def sim_distance_1(prefs, person1, person2):
    """Return the overlap coefficient of the key sets of two entries.

    The score is ``|A & B| / min(|A|, |B|)``, where ``A`` and ``B`` are the
    inner keys (rated ids) stored for ``person1`` and ``person2``.  Only the
    presence of a rating matters, not its value.

    Args:
        prefs: Nested mapping ``{entry_id: {rated_id: rating}}``.
        person1: Id of the first entry (converted with ``int``).
        person2: Id of the second entry (converted with ``int``).

    Returns:
        float in ``[0.0, 1.0]``; ``0.0`` when either entry has no ratings.

    Raises:
        KeyError: If either id is missing from ``prefs``.
    """
    rated_1 = set(prefs[int(person1)])
    rated_2 = set(prefs[int(person2)])

    smaller = min(len(rated_1), len(rated_2))
    if smaller == 0:
        # An entry without ratings shares nothing; avoid dividing by zero.
        return 0.0

    # The sets hold ids, not rating values, so the "no rating" sentinel must
    # not be removed here: doing so would silently drop the id that happens
    # to equal the sentinel from the overlap.
    return float(len(rated_1 & rated_2)) / float(smaller)
def sim_distance_2(prefs, person1, person2):
    """Return a Euclidean-distance based similarity between two entries.

    The score is ``1 / (1 + sum of squared rating differences)``, taken over
    the ids rated by both entries.

    Args:
        prefs: Nested mapping ``{entry_id: {rated_id: rating}}``.
        person1: Id of the first entry (converted with ``int``).
        person2: Id of the second entry (converted with ``int``).

    Returns:
        float in ``(0.0, 1.0]`` when the entries share at least one rated id
        (``1.0`` means identical ratings), otherwise ``0.0``.

    Raises:
        KeyError: If either id is missing from ``prefs``.
    """
    ratings_1 = prefs[int(person1)]
    ratings_2 = prefs[int(person2)]

    shared = [item for item in ratings_1 if item in ratings_2]
    if not shared:
        return 0.0

    sum_of_squares = sum(
        (ratings_1[item] - ratings_2[item]) ** 2 for item in shared
    )
    # Float literals force true division even for integer ratings on
    # Python 2, where 1 / (1 + n) would otherwise truncate to 0.
    return 1.0 / (1.0 + sum_of_squares)
def topMatches(prefs_learn_data, person, object_id, k=5, similarity=sim_distance_1):
    """Return the ``k`` entries most similar to ``person`` that rated ``object_id``.

    Only entries whose ratings contain ``object_id`` are considered, and
    ``person`` itself is excluded from the result.  Entries whose similarity
    is not strictly positive are discarded.

    Args:
        prefs_learn_data: Nested mapping ``{entry_id: {rated_id: rating}}``.
        person: Id of the entry to find neighbours for.
        object_id: Id every neighbour must have rated.
        k: Maximum number of neighbours to return.
        similarity: Callable ``(prefs, id_a, id_b) -> score``.

    Returns:
        List of at most ``k`` tuples
        ``(similarity, neighbour's rating of object_id, neighbour_id)``,
        sorted in descending tuple order.

    Raises:
        KeyError: If ``person`` is missing from ``prefs_learn_data``.
    """
    candidates = {
        int(entry): prefs_learn_data[int(entry)]
        for entry in prefs_learn_data
        if object_id in prefs_learn_data[int(entry)]
    }
    # The similarity function needs the target's own ratings as well.
    candidates[int(person)] = prefs_learn_data[int(person)]

    scored = []
    for other in candidates:
        if other == person:
            continue
        score = similarity(candidates, person, other)
        neighbour_rating = prefs_learn_data[int(other)][int(object_id)]
        scored.append((score, neighbour_rating, other))

    ranked = sorted(scored)[::-1]
    positive = [entry for entry in ranked if entry[0] > 0]
    return positive[:k]
def get_rating(prefs_learn_data, person, object_id, similarity=sim_distance_1):
    """Predict ``person``'s rating of ``object_id`` from its nearest neighbours.

    The prediction is the similarity-weighted average of the ratings that the
    neighbours returned by ``topMatches`` gave to ``object_id``.

    Args:
        prefs_learn_data: Nested mapping ``{entry_id: {rated_id: rating}}``.
        person: Id of the entry to predict for.
        object_id: Id whose rating is predicted.
        similarity: Callable ``(prefs, id_a, id_b) -> score``.

    Returns:
        The weighted average rating, or ``0`` when no neighbour is found.
    """
    neighbours = topMatches(prefs_learn_data, person, object_id, similarity=similarity)
    if not neighbours:
        return 0

    weighted_total = 0
    weight_total = 0
    for weight, rating, _neighbour in neighbours:
        weighted_total += weight * rating
        weight_total += weight
    return weighted_total / weight_total
def calculate_error(rating_real, rating_predict):
    """Return the Euclidean distance between real and predicted ratings.

    This is the square root of the sum of squared differences over *all*
    rating pairs, including the last one.

    Args:
        rating_real: Sequence of actual ratings.
        rating_predict: Sequence of predicted ratings, aligned by position
            with ``rating_real``.  If the lengths differ, the surplus
            elements of the longer sequence are ignored.

    Returns:
        float: ``sqrt(sum((real - predicted) ** 2))``; ``0.0`` for empty input.
    """
    squared_error = sum(
        (real - predicted) ** 2
        for real, predicted in zip(rating_real, rating_predict)
    )
    return sqrt(squared_error)
def user_based_party(visualize=False, sim=sim_distance_1):
    """Run the user-based evaluation and print predictions and the error.

    Loads the learning and test files with ``load_data_user_based``, predicts
    every rating found in the test data from the learning data, then prints
    the predicted ratings, the real ratings and ``calculate_error`` of both.
    A running counter is printed for each test user as a progress indicator.

    Args:
        visualize: When true, plot the learning data before evaluating.
        sim: Similarity callable ``(prefs, id_a, id_b) -> score`` passed on
            to ``get_rating``.
    """
    learn_data = load_data_user_based(path='jester-data-small-supper-80.csv')
    test_data = load_data_user_based(path='jester-data-small-supper-20.csv')

    if visualize:
        # The ``visualize`` parameter shadows the module-level plotting
        # function of the same name; calling the local name would try to
        # call the boolean flag.  Fetch the function from module globals.
        globals()['visualize'](learn_data)

    rating_real = []
    rating_predict = []
    for progress, user in enumerate(test_data, 1):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(progress)
        for joke, rating in test_data[user].items():
            rating_real.append(rating)
            rating_predict.append(
                get_rating(learn_data, int(user), int(joke), similarity=sim)
            )

    print("========== USER BASED PARTY GOES HERE ==========")
    print(rating_predict)
    print(rating_real)
    print(calculate_error(rating_real, rating_predict))
def item_based_party(visualize=False, sim=sim_distance_1):
    """Run the item-based evaluation and print predictions and the error.

    Loads the learning and test files with ``load_data_item_based`` (layout
    ``{joke_id: {user_id: rating}}``), predicts every rating found in the
    test data from the learning data, then prints the predicted ratings, the
    real ratings and ``calculate_error`` of both.  A running counter is
    printed for each test joke as a progress indicator.

    Args:
        visualize: When true, plot the learning data before evaluating.
        sim: Similarity callable ``(prefs, id_a, id_b) -> score`` passed on
            to ``get_rating``.
    """
    learn_data = load_data_item_based(path='jester-data-small-supper-80.csv')
    test_data = load_data_item_based(path='jester-data-small-supper-20.csv')

    if visualize:
        # The ``visualize`` parameter shadows the module-level plotting
        # function of the same name; calling the local name would try to
        # call the boolean flag.  Fetch the function from module globals.
        globals()['visualize'](learn_data)

    rating_real = []
    rating_predict = []
    # The outer keys of item-based data are joke ids, the inner keys user ids.
    for progress, joke in enumerate(test_data, 1):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(progress)
        for user, rating in test_data[joke].items():
            rating_real.append(rating)
            # get_rating looks its second argument up as an outer key and its
            # third as an inner key, so the joke comes first and the user
            # second for this layout.
            rating_predict.append(
                get_rating(learn_data, int(joke), int(user), similarity=sim)
            )

    print("========== ITEM BASED PARTY GOES HERE ==========")
    print(rating_predict)
    print(rating_real)
    print(calculate_error(rating_real, rating_predict))
def main():
    """Run the user-based evaluation, then the item-based one.

    Both runs use ``sim_distance_1`` and skip the plot.
    """
    for party in (user_based_party, item_based_party):
        party(visualize=False, sim=sim_distance_1)


main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement