Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas
- import operator
- import math
- import sys
# Ratings table shared by the functions in this module. It stays None until
# define_dataframe() loads a CSV file into it.
dataframe = None
def __avg__(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Raises:
        ZeroDivisionError: when ``lst`` is empty.
    """
    # float() forces true division, so a list of ints is not floored when the
    # module runs on Python 2.
    return float(sum(lst)) / len(lst)
def define_dataframe(csv_file):
    """Load ``csv_file`` into the module-level ``dataframe``.

    ``csv_file`` is passed unchanged to ``pandas.read_csv``. On failure the
    previous value of ``dataframe`` is left untouched.

    Raises:
        Exception: with the message 'Not a valid file.' when pandas reports
            that the file holds no data to parse.
    """
    global dataframe
    try:
        dataframe = pandas.read_csv(csv_file)
    # pandas.errors is the documented public location of EmptyDataError;
    # pandas.io.common is a private module and not a stable place to find it.
    except pandas.errors.EmptyDataError:
        raise Exception('Not a valid file.')
def print_dataframe():
    """Print the module-level ``dataframe`` framed by blank lines.

    When no dataframe has been loaded yet, prints
    'Dataframe was not created.' instead.
    """
    # Explicit check instead of assert/except: asserts are stripped under
    # ``python -O``, and ``except AssertionError, e`` only parses on Python 2.
    if dataframe is None:
        print('Dataframe was not created.')
        return
    print('\n')
    print(dataframe)
    print('\n')
def rate(user, item):
    """Return the rating stored at row ``user``, column ``item`` as a float.

    Returns None when the cell does not hold a number: either text that
    ``float()`` rejects, or NaN (what pandas stores for an empty CSV cell).

    Raises:
        Exception: with the message 'Not a valid user or item.' when the row
            label or the column label is not in ``dataframe``.
    """
    try:
        # .at is the label-based scalar accessor; DataFrame.get_value no
        # longer exists in current pandas releases.
        value = float(dataframe.at[user, item])
    except ValueError:
        return None
    except KeyError:
        raise Exception('Not a valid user or item.')
    # A NaN rating carries no information; report it as missing.
    if math.isnan(value):
        return None
    return value
def item_rates(item, **kwargs):
    """Return the ratings in column ``item`` as a list of floats.

    Cells that do not hold a number (text that ``float()`` rejects, or NaN)
    are dropped by default.

    Keyword arguments:
        complete: when truthy, keep one entry per row and put None in the
            place of every missing rating. Defaults to False.
    """
    complete = kwargs.get('complete', False)
    rates = []
    # The loop variable is deliberately not called ``rate`` so it does not
    # shadow the module-level rate() function.
    for cell in dataframe[item]:
        try:
            value = float(cell)
        except ValueError:
            value = None
        else:
            # NaN is what pandas stores for an empty cell; treat it as missing.
            if math.isnan(value):
                value = None
        if value is not None:
            rates.append(value)
        elif complete:
            rates.append(None)
    return rates
def user_rates(user, **kwargs):
    """Return the ratings in row ``user`` as a list of floats.

    Cells that do not hold a number (text that ``float()`` rejects, or NaN)
    are dropped by default.

    Keyword arguments:
        complete: when truthy, keep one entry per column and put None in the
            place of every missing rating. Defaults to False.
    """
    complete = kwargs.get('complete', False)
    rates = []
    # .loc is the label-based row accessor; the .ix indexer no longer exists
    # in current pandas releases. The loop variable is not called ``rate`` so
    # it does not shadow the module-level rate() function.
    for cell in dataframe.loc[user]:
        try:
            value = float(cell)
        except ValueError:
            value = None
        else:
            # NaN is what pandas stores for an empty cell; treat it as missing.
            if math.isnan(value):
                value = None
        if value is not None:
            rates.append(value)
        elif complete:
            rates.append(None)
    return rates
def sim(user_a, user_b):
    """Return a Pearson-style similarity between two users' ratings.

    Only positions where both users have a rating enter the sums, but each
    user's mean is taken over all of that user's own ratings.

    Returns 0.0 when the similarity is undefined: a user has no ratings at
    all, the users share no rated position, or one user's shared ratings all
    equal that user's mean (zero variance).
    """
    user_a_rates = user_rates(user_a, complete=True)
    user_b_rates = user_rates(user_b, complete=True)

    # The non-None entries of the complete row are exactly what
    # user_rates(user) returns, so the row does not need to be read twice.
    rated_a = [value for value in user_a_rates if value is not None]
    rated_b = [value for value in user_b_rates if value is not None]
    if not rated_a or not rated_b:
        return 0.0
    user_a_avg = __avg__(rated_a)
    user_b_avg = __avg__(rated_b)

    numerator = 0.0
    sq_dev_a = 0.0
    sq_dev_b = 0.0
    for element_a, element_b in zip(user_a_rates, user_b_rates):
        if element_a is None or element_b is None:
            continue
        dev_a = element_a - user_a_avg
        dev_b = element_b - user_b_avg
        numerator += dev_a * dev_b
        sq_dev_a += dev_a ** 2
        sq_dev_b += dev_b ** 2

    denominator = math.sqrt(sq_dev_a) * math.sqrt(sq_dev_b)
    # Guard the division: with no overlap or no variance the ratio is 0/0.
    if denominator == 0:
        return 0.0
    return numerator / denominator
def __user_pred__(similarities, curr_user, curr_item):
    """Predict ``curr_user``'s rating of ``curr_item`` from similar users.

    ``similarities`` maps each neighbouring user to its similarity with
    ``curr_user``. The prediction is the user's own mean rating plus the
    similarity-weighted average of how far each neighbour's rating of the
    item lies from that neighbour's own mean.

    Neighbours without a rating for ``curr_item`` are ignored. When no
    neighbour contributes, or the contributing similarities sum to zero, the
    user's own mean rating is returned unchanged.
    """
    numerator = 0.0
    denominator = 0.0
    for data_user, sim_ratio in similarities.items():
        neighbour_rate = rate(data_user, curr_item)
        if neighbour_rate is None:
            # rate() returns None for a missing rating; subtracting a mean
            # from None would raise TypeError.
            continue
        deviation = neighbour_rate - __avg__(user_rates(data_user))
        numerator += sim_ratio * deviation
        denominator += sim_ratio

    baseline = __avg__(user_rates(curr_user))
    # Guard the division: there is nothing to add without usable weights.
    if denominator == 0:
        return baseline
    return baseline + (numerator / denominator)
def __item_pred__(similarities, curr_user, curr_item):
    """Return the item-based prediction slot of pred()'s result.

    No item-based prediction is computed here: all three arguments are
    ignored and the constant -1.0 is always returned.
    """
    return -1.0
def pred(user, item, **kwargs):
    """Predict ``user``'s rating of ``item``.

    Every other row label in ``dataframe`` is scored against ``user`` with
    sim(); the highest-scoring neighbours are then handed to the two
    predictors.

    Keyword arguments:
        N: how many of the most similar users to keep (default 2). When N is
            passed and exceeds the number of other users, the message
            'N cannot be larger than the number of rows.' is printed and the
            process exits through ``sys.exit(0)``.

    Returns:
        A tuple ``(item_based, user_based)`` holding the results of
        ``__item_pred__`` and ``__user_pred__`` in that order.
    """
    similarities = {
        data_user: sim(user, data_user)
        for data_user in dataframe.index.values
        if data_user != user
    }

    neighbours = kwargs.get('N', 2)
    # Explicit check instead of assert/except: asserts are stripped under
    # ``python -O``, and ``except AssertionError, e`` only parses on Python 2.
    if 'N' in kwargs and neighbours > len(similarities):
        print('N cannot be larger than the number of rows.')
        # NOTE(review): exit status 0 signals success on an error path; left
        # unchanged here because existing callers may depend on it.
        sys.exit(0)

    ranked = sorted(similarities.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    similarities = dict(ranked[:neighbours])
    return (__item_pred__(similarities, user, item),
            __user_pred__(similarities, user, item))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement