Advertisement
Guest User

Untitled

a guest
Oct 28th, 2016
60
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.75 KB | None | 0 0
  1. from math import sqrt
  2. import matplotlib.pyplot as plt
  3. import csv
  4.  
  5. DATA_POINT_NULL = 99
  6. GFX_COLOR_1 = 1
  7. GFX_COLOR_2 = 2
  8.  
  9. def load_data_user_based(path):
  10. prefs = {}
  11. i = 0;
  12. with open(path, 'r') as csvfile:
  13. spamreader = csv.reader(csvfile, delimiter=' ', quotechar='"')
  14. for user in spamreader:
  15. i +=1
  16. prefs.setdefault(int(i), {})
  17. array2d = [float(digit.replace(',', '.')) for digit in user]
  18. for joke in range(1, len(array2d)):
  19. if(float(array2d[joke]) == DATA_POINT_NULL):
  20. continue
  21. prefs[int(i)][int(joke)]=float(array2d[joke])
  22. return prefs
  23.  
  24. def load_data_item_based(path):
  25. prefs = {}
  26. i = 0;
  27. with open(path, 'r') as csvfile:
  28. spamreader = csv.reader(csvfile, delimiter=' ', quotechar='"')
  29. for user in spamreader:
  30. i += 1
  31. array2d = [float(digit.replace(',', '.')) for digit in user]
  32. for joke in range(1, len(array2d)):
  33. prefs.setdefault(int(joke) , {})
  34. if(float(array2d[joke]) == DATA_POINT_NULL):
  35. continue
  36. prefs[int(joke)][int(i)]=float(array2d[joke])
  37. return prefs
  38.  
  39.  
  40. def visualize(prefs):
  41. x = []
  42. y = []
  43. z = []
  44. for user in prefs:
  45. for joke in prefs[user]:
  46. if(float(prefs[user][joke]) == DATA_POINT_NULL):
  47. continue
  48. x.append(int(user))
  49. y.append(int(joke))
  50. if(float(prefs[user][joke]) < 0):
  51. z.append(GFX_COLOR_1)
  52. else:
  53. z.append(GFX_COLOR_2)
  54.  
  55. plt.xlim([0,100])
  56. plt.ylim([0,100])
  57. plt.scatter(x, y, c = z)
  58. plt.show()
  59.  
  60. #homer
  61. def sim_distance_1(prefs, person1, person2):
  62. X = set(prefs[int(person1)])
  63. Y = set(prefs[int(person2)])
  64. X_len = len(X)
  65. Y_len = len(Y)
  66.  
  67. min_len = min(X_len, Y_len)
  68. Z = list(X & Y)
  69. if DATA_POINT_NULL in Z:
  70. Z.remove(DATA_POINT_NULL)
  71. Z_len = len(Z)
  72.  
  73. return (float(Z_len) / float(min_len))
  74.  
  75. def sim_distance_2(prefs, person1, person2):
  76. si = {}
  77.  
  78. for item in prefs[int(person1)]:
  79. if item in prefs[int(person2)]:
  80. si[item] = 1
  81.  
  82. if len(si) == 0:
  83. return 0
  84.  
  85. sum_of_squares = sum([pow(prefs[int(person1)][int(item)]-prefs[int(person2)][int(item)],2)
  86. for item in prefs[int(person1)] if item in prefs[int(person2)]])
  87.  
  88. return 1/(1+sum_of_squares)
  89.  
  90. def topMatches(prefs_learn_data, person, object_id, k=5, similarity=sim_distance_1):
  91.  
  92. prefs_with_object_id = {}
  93. for user in prefs_learn_data:
  94. if object_id in prefs_learn_data[int(user)]:
  95. prefs_with_object_id[int(user)] = prefs_learn_data[int(user)]
  96.  
  97. prefs_with_object_id[int(person)] = prefs_learn_data[int(person)]
  98.  
  99. scores = [(similarity(prefs_with_object_id, person, other), prefs_learn_data[int(other)][int(object_id)], other)
  100. for other in prefs_with_object_id if other!=person]
  101. scores.sort()
  102. scores.reverse()
  103.  
  104. result_scores = [score for score in scores if score[0] > 0]
  105. return result_scores[0:k]
  106.  
  107. def get_rating(prefs_learn_data, person, object_id, similarity=sim_distance_1):
  108. scores = topMatches(prefs_learn_data, person, object_id, similarity=similarity)
  109. if len(scores) == 0: return 0
  110. sum_sim_score = sum(score[0]*score[1] for score in scores)
  111. sum_sims = sum(score[0] for score in scores)
  112. rating = sum_sim_score/sum_sims
  113. return rating
  114.  
  115. def calculate_error(rating_real, rating_predict):
  116. sum = 0
  117. for i in range(len(rating_real)-1):
  118. sum += pow(rating_real[i]-rating_predict[i], 2)
  119. return sqrt(sum)
  120.  
  121. def user_based_party(visualize = False, sim = sim_distance_1):
  122. learn_data = load_data_user_based(path='jester-data-small-supper-80.csv')
  123. test_data = load_data_user_based(path='jester-data-small-supper-20.csv')
  124.  
  125. if visualize == True:
  126. visualize(learn_data)
  127.  
  128. rating_real = []
  129. rating_predict = []
  130. i = 0;
  131. for user in test_data:
  132. i += 1
  133. print i
  134. for joke in test_data[user]:
  135. rating_real.append(test_data[user][joke])
  136. rating_predict.append(get_rating(learn_data, int(user), int(joke), similarity = sim))
  137.  
  138. print "========== USER BASED PARTY GOES HERE =========="
  139. print rating_predict
  140. print rating_real
  141. print calculate_error(rating_real, rating_predict)
  142.  
  143. def item_based_party(visualize = False, sim = sim_distance_1):
  144. learn_data = load_data_item_based(path='jester-data-small-supper-80.csv')
  145. test_data = load_data_item_based(path='jester-data-small-supper-20.csv')
  146.  
  147. if visualize == True:
  148. visualize(learn_data)
  149.  
  150. rating_real = []
  151. rating_predict = []
  152.  
  153. i = 0
  154.  
  155. for user in test_data:
  156. i += 1
  157. print i
  158. for joke in test_data[int(user)]:
  159. rating_real.append(test_data[user][joke])
  160. rating_predict.append(get_rating(learn_data, int(joke), int(user), similarity = sim))
  161.  
  162. print "========== ITEM BASED PARTY GOES HERE =========="
  163. print rating_predict
  164. print rating_real
  165. print calculate_error(rating_real, rating_predict)
  166.  
  167. def main():
  168. user_based_party(visualize = False, sim = sim_distance_1)
  169. item_based_party(visualize = False, sim = sim_distance_1)
  170.  
  171.  
  172. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement