Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Density:0.0571693682775\n",
- "(943L, 1680L)\n"
- ]
- }
- ],
- "source": [
- "# Read the tab-separated training split and pivot it into a users x movies table.\n",
- "train = pd.read_csv('ua.base', sep='\\t', header=None,\n",
- "                    names=['uid', 'mid', 'rating', 't'])\n",
- "ratings = train.pivot_table(index='uid', columns='mid', values='rating')\n",
- "\n",
- "# Density = number of training rows divided by the number of cells in the table.\n",
- "n_cells = ratings.shape[0] * ratings.shape[1]\n",
- "print(\"Density:\" + str(train.shape[0] / float(n_cells)))\n",
- "\n",
- "# Map raw user / movie ids to their row / column position in the matrix.\n",
- "upos = dict((uid, pos) for pos, uid in enumerate(ratings.index.values))\n",
- "mpos = dict((mid, pos) for pos, mid in enumerate(ratings.columns))\n",
- "\n",
- "# Unrated cells become 0; from here on `ratings` is a plain NumPy array.\n",
- "ratings = ratings.fillna(0).values\n",
- "print(ratings.shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Size:9430\n"
- ]
- }
- ],
- "source": [
- "# Read the held-out test split (same four tab-separated columns as the training file).\n",
- "test = pd.read_csv('ua.test', sep='\\t', header=None,\n",
- "                   names=['uid', 'mid', 'rating', 't'])\n",
- "print(\"Size:\" + str(len(test)))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Classic / Simple recommendations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Build the user-user cosine similarity matrix S.\n",
- "# sim(i, j) = <r_i, r_j> / (|r_i| * |r_j|), computed for all pairs with one matrix\n",
- "# product instead of a double Python loop over a hard-coded 943 users.\n",
- "norms = np.linalg.norm(ratings, axis=1)\n",
- "# A user with no ratings has norm 0; divide by 1 instead so the similarity comes\n",
- "# out as 0 rather than NaN (the dot products with that user are 0 anyway).\n",
- "safe_norms = np.where(norms > 0, norms, 1.0)\n",
- "S = ratings.dot(ratings.T) / np.outer(safe_norms, safe_norms)\n",
- "# Keep the diagonal at 0, as the original loop did (it only filled i != j).\n",
- "np.fill_diagonal(S, 0.0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Predict each test rating as the similarity-weighted average of the ratings\n",
- "# that other users gave to the same movie.\n",
- "predictions = []\n",
- "for _, entry in test.iterrows():\n",
- "    user = upos[entry['uid']]\n",
- "    mid = entry['mid']\n",
- "    if mid not in mpos:\n",
- "        # Movie has no column in the training matrix: fall back to 2.5.\n",
- "        predictions.append(2.5)\n",
- "        continue\n",
- "    movie = mpos[mid]\n",
- "    raters = np.where(ratings[:, movie] > 0)[0]  # users who rated \"movie\"\n",
- "    weightedRatingSum = 0\n",
- "    sumSimilarities = 0\n",
- "    for rater in raters:\n",
- "        weight = S[rater, user]\n",
- "        weightedRatingSum += weight * ratings[rater, movie]\n",
- "        sumSimilarities += weight\n",
- "    if sumSimilarities != 0:\n",
- "        predictions.append(weightedRatingSum / sumSimilarities)\n",
- "    else:\n",
- "        # No usable neighbour weights: same 2.5 fallback.\n",
- "        predictions.append(2.5)\n",
- "test['pred'] = predictions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0.829277938933\n",
- "1.07049302232\n"
- ]
- }
- ],
- "source": [
- "# Compute mean absolute error and mean squared error of test['pred'].\n",
- "# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
- "errors = (test['rating'] - test['pred']).values\n",
- "mae = np.abs(errors).mean()\n",
- "print(mae)\n",
- "mse = np.square(errors).mean()\n",
- "print(mse)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Predict test ratings from rating deviations: each neighbour contributes how far\n",
- "# their rating of the movie is from their own mean rating, and the weighted\n",
- "# average deviation is added to the target user's mean rating.\n",
- "# `S` must be the 2-D user-user similarity matrix when this cell runs.\n",
- "\n",
- "# Mean of each user's positive (i.e. actually given) ratings, computed once\n",
- "# instead of being recomputed for every neighbour of every test row.\n",
- "rated = ratings > 0\n",
- "ratedCounts = np.maximum(rated.sum(axis=1), 1)  # avoid 0/0 for an empty row\n",
- "userMeans = np.where(rated, ratings, 0.0).sum(axis=1) / ratedCounts\n",
- "\n",
- "predictions = []\n",
- "for _, entry in test.iterrows():\n",
- "    user = upos[entry['uid']]\n",
- "    if entry['mid'] not in mpos:\n",
- "        # Movie has no column in the training matrix: fall back to 2.5.\n",
- "        predictions.append(2.5)\n",
- "        continue\n",
- "    movie = mpos[entry['mid']]\n",
- "    raters = np.where(ratings[:, movie] > 0)[0]  # users who rated \"movie\"\n",
- "    weights = S[raters, user]\n",
- "    sumSimilarities = weights.sum()\n",
- "    predictedRating = 2.5\n",
- "    if sumSimilarities != 0:\n",
- "        deviations = ratings[raters, movie] - userMeans[raters]\n",
- "        # Parenthesised so the expression can span lines; the original ended a line\n",
- "        # with a dangling '+', which is a SyntaxError.\n",
- "        predictedRating = (userMeans[user]\n",
- "                           + weights.dot(deviations) / sumSimilarities)\n",
- "    predictions.append(predictedRating)\n",
- "test['pred'] = predictions\n",
- "\n",
- "# Compute mean absolute error and mean squared error.\n",
- "errors = (test['rating'] - test['pred']).values\n",
- "mae = np.abs(errors).mean()\n",
- "print(mae)\n",
- "mse = np.square(errors).mean()\n",
- "print(mse)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Use SVD"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "from scipy.sparse.linalg import svds"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Rank-5 SVD reconstruction of the rating matrix.\n",
- "# Zeros in `ratings` mean \"not rated\", so each movie's mean is taken over its\n",
- "# rated entries only, and unrated cells are set to 0 *after* centring (i.e. they\n",
- "# are imputed with the movie mean) instead of being treated as real 0 ratings.\n",
- "rated = ratings > 0\n",
- "ratedCounts = np.maximum(rated.sum(axis=0, keepdims=True), 1)  # avoid 0/0\n",
- "movieMeans = np.where(rated, ratings, 0.0).sum(axis=0, keepdims=True) / ratedCounts\n",
- "centered = np.where(rated, ratings - movieMeans, 0.0)\n",
- "# The singular values are named `sigma`, not `S`, so this cell does not overwrite\n",
- "# the user-user similarity matrix `S` used by the neighbourhood predictions.\n",
- "U, sigma, Vt = svds(centered, k=5)\n",
- "newRatings = U.dot(np.diag(sigma)).dot(Vt) + movieMeans"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2.63406231903\n",
- "8.39674374306\n"
- ]
- }
- ],
- "source": [
- "# Look up the reconstructed rating for every (user, movie) pair in the test set.\n",
- "predictions = []\n",
- "for _, entry in test.iterrows():\n",
- "    user = upos[entry['uid']]\n",
- "    if entry['mid'] not in mpos:\n",
- "        # Movie has no column in the training matrix: fall back to 2.5.\n",
- "        predictions.append(2.5)\n",
- "        continue\n",
- "    movie = mpos[entry['mid']]\n",
- "    predictions.append(newRatings[user, movie])\n",
- "test['pred'] = predictions\n",
- "\n",
- "# Compute mean absolute error and mean squared error.\n",
- "errors = (test['rating'] - test['pred']).values\n",
- "mae = np.abs(errors).mean()\n",
- "print(mae)\n",
- "mse = np.square(errors).mean()\n",
- "print(mse)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Gradient Descent"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "#define cost function\n",
- "def costFunction(params, Y, rank):\n",
- "    \"\"\"Sum of squared errors of the factorization A.dot(B.T) on rated entries.\n",
- "\n",
- "    params: 1-D array holding A (numUsers x rank) followed by B (numMovies x rank),\n",
- "            each flattened in column-major ('F') order.\n",
- "    Y:      numUsers x numMovies array; only entries with Y > 0 are scored.\n",
- "    rank:   number of latent features per user / movie.\n",
- "\n",
- "    Returns the scalar sum over scored entries of (A.dot(B.T) - Y) ** 2.\n",
- "    \"\"\"\n",
- "    numMovies = Y.shape[1]\n",
- "    numUsers = Y.shape[0]\n",
- "\n",
- "    # Unfold the A and B matrices from params\n",
- "    A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
- "    B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
- "\n",
- "    # np.sum reduces in compiled code; the builtin sum() would iterate over the\n",
- "    # selected entries one Python object at a time on every optimizer call.\n",
- "    diff = A.dot(B.T) - Y\n",
- "    return np.sum(diff[Y > 0] ** 2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#define gradient function\n",
- "def gradient(params, Y, rank):\n",
- "    \"\"\"Gradient of the sum-of-squared-errors cost w.r.t. the flattened parameters.\n",
- "\n",
- "    params: 1-D array holding A (numUsers x rank) followed by B (numMovies x rank),\n",
- "            each flattened in column-major ('F') order.\n",
- "    Y:      numUsers x numMovies array; entries equal to 0 are treated as unrated.\n",
- "    rank:   number of latent features per user / movie.\n",
- "\n",
- "    Returns a 1-D array laid out like params: dA then dB, each flattened in 'F' order.\n",
- "    \"\"\"\n",
- "    numMovies = Y.shape[1]\n",
- "    numUsers = Y.shape[0]\n",
- "    # Unfold the A and B matrices from params\n",
- "    A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
- "    B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
- "    # Residuals on rated entries only; unrated entries contribute nothing.\n",
- "    residual = np.where(Y != 0, A.dot(B.T) - Y, 0.0)\n",
- "    # Derivative of sum(residual ** 2). The factor 2 comes from differentiating the\n",
- "    # square; without it the result is only half the gradient of the cost.\n",
- "    dA = 2.0 * residual.dot(B)\n",
- "    dB = 2.0 * residual.T.dot(A)\n",
- "    return np.concatenate((dA.flatten('F'), dB.flatten('F')))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Warning: Maximum number of iterations has been exceeded.\n",
- " Current function value: 62012.337751\n",
- " Iterations: 50\n",
- " Function evaluations: 175\n",
- " Gradient evaluations: 175\n",
- "0.796250804199\n",
- "1.07159269579\n"
- ]
- }
- ],
- "source": [
- "from scipy import optimize\n",
- "numMovies = ratings.shape[1]\n",
- "numUsers = ratings.shape[0]\n",
- "rank = 5\n",
- "\n",
- "# Set Initial Parameters A, B\n",
- "# A fixed seed makes the random start, and therefore the reported errors,\n",
- "# reproducible from one run to the next.\n",
- "rng = np.random.RandomState(0)\n",
- "A = rng.randn(numUsers, rank)\n",
- "B = rng.randn(numMovies, rank)\n",
- "initial_parameters = np.concatenate((A.flatten('F'), B.flatten('F')))\n",
- "\n",
- "# Minimize the cost with nonlinear conjugate gradient, capped at 50 iterations.\n",
- "params = optimize.fmin_cg(costFunction, initial_parameters,\n",
- "                          fprime=gradient, args=(ratings, rank), maxiter=50)\n",
- "\n",
- "# Unfold the A and B matrices from params\n",
- "A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
- "B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
- "\n",
- "# Compute the predictions matrix\n",
- "newRatings = A.dot(B.T)\n",
- "\n",
- "# Look up the predicted rating for every (user, movie) pair in the test set.\n",
- "predictions = []\n",
- "for _, entry in test.iterrows():\n",
- "    user = upos[entry['uid']]\n",
- "    if entry['mid'] not in mpos:\n",
- "        # Movie has no column in the training matrix: fall back to 2.5.\n",
- "        predictions.append(2.5)\n",
- "        continue\n",
- "    movie = mpos[entry['mid']]\n",
- "    predictions.append(newRatings[user, movie])\n",
- "test['pred'] = predictions\n",
- "\n",
- "# Compute mean absolute error and mean squared error.\n",
- "errors = (test['rating'] - test['pred']).values\n",
- "mae = np.abs(errors).mean()\n",
- "print(mae)\n",
- "mse = np.square(errors).mean()\n",
- "print(mse)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement