{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Density:0.0571693682775\n",
"(943L, 1680L)\n"
]
}
],
"source": [
"#load training data and transform it into a user-by-movie rating matrix\n",
"train=pd.read_csv('ua.base',sep='\\t',header=None, names=['uid', 'mid', 'rating', 't'])\n",
"#print(train.head())\n",
"ratings= train.pivot_table(index='uid',columns='mid',values='rating')\n",
"#print(ratings)\n",
"print (\"Density:\"+str(train.shape[0]/float(ratings.shape[0]*ratings.shape[1])))\n",
"#map user ids and movie ids to row/column positions in the matrix\n",
"upos={v:i for i,v in enumerate(ratings.index.values)}\n",
"mpos={v:i for i,v in enumerate(ratings.columns)}\n",
"ratings=ratings.fillna(0).values\n",
"print(ratings.shape)"
]
},
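{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check on the density printed above (assuming the standard MovieLens 100K ua split, where ua.base holds 90,570 ratings): density = number of ratings / (users x movies) = 90570 / (943 x 1680) = 90570 / 1584240, i.e. roughly 5.7% of the user-movie cells are observed."
]
},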
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size:9430\n"
]
}
],
"source": [
"#load test data\n",
"test=pd.read_csv('ua.test',sep='\\t',header=None, names=['uid', 'mid', 'rating', 't'])\n",
"#truth= test.pivot_table(index='uid',columns='mid',values='rating')\n",
"print (\"Size:\"+str(test.shape[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Classic / simple recommendations"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Build the user-user similarity matrix (cosine similarity)\n",
"S = np.zeros((943,943))\n",
"#loop over all user pairs\n",
"#Remember: sim(i,j)=sim(j,i)\n",
"for i in range(943):\n",
"    for j in range(i+1,943):\n",
"        product=np.dot(ratings[i,:],ratings[j,:])\n",
"        norm=np.linalg.norm(ratings[i,:])*np.linalg.norm(ratings[j,:])\n",
"        similarity=product/norm\n",
"        S[i,j]=similarity\n",
"        S[j,i]=similarity"
]
},
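{
"cell_type": "markdown",
"metadata": {},
"source": [
"The double loop above is easy to read but slow in pure Python. As a sketch of an equivalent vectorized computation (the names norms, R_norm and S_vec are illustrative, not part of the original cells): normalize each user's rating vector, then one matrix product gives all pairwise cosine similarities at once."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#vectorized cosine similarity between all users (sketch)\n",
"norms=np.linalg.norm(ratings,axis=1,keepdims=True) #length of each user's rating vector\n",
"R_norm=ratings/norms #every user has rated at least one movie, so norms are non-zero\n",
"S_vec=R_norm.dot(R_norm.T) #S_vec[i,j] is the cosine similarity of users i and j\n",
"np.fill_diagonal(S_vec,0) #match the loop version, which leaves the diagonal at 0\n",
"#S_vec should agree with S up to floating point error"
]
},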
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#produce ratings for the test data\n",
"predictions=[]\n",
"#for every (user, movie) pair compute the similarity-weighted average rating\n",
"for row in test.iterrows():\n",
"    user=upos[row[1]['uid']]\n",
"    if row[1]['mid'] not in mpos:\n",
"        predictions.append(2.5)\n",
"        continue\n",
"    movie=mpos[row[1]['mid']]\n",
"    non_zero_users=np.where(ratings[:,movie] >0 )#users who rated \"movie\"\n",
"    weightedRatingSum=0\n",
"    sumSimilarities=0\n",
"    for index in non_zero_users[0]:\n",
"        weightedRatingSum+=S[index,user]* ratings[index,movie]\n",
"        sumSimilarities+=S[index,user]\n",
"    predictedRating=2.5\n",
"    if sumSimilarities !=0 :\n",
"        predictedRating=weightedRatingSum/sumSimilarities\n",
"    predictions.append(predictedRating)\n",
"test['pred']=predictions"
]
},
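{
"cell_type": "markdown",
"metadata": {},
"source": [
"The prediction rule used above is a similarity-weighted average over the users $v$ who rated movie $m$:\n",
"$$\\hat{r}_{u,m}=\\frac{\\sum_{v:\\,r_{v,m}>0}\\mathrm{sim}(u,v)\\,r_{v,m}}{\\sum_{v:\\,r_{v,m}>0}\\mathrm{sim}(u,v)},$$\n",
"falling back to 2.5 when the movie does not appear in the training set or no weight is available."
]
},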
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.829277938933\n",
"1.07049302232\n"
]
}
],
"source": [
"#compute the mean absolute error and the mean squared error\n",
"mae=(test['rating']-test['pred']).apply(np.abs).values.mean()\n",
"print mae\n",
"mse=(test['rating']-test['pred']).apply(np.square).values.mean()\n",
"print mse"
]
},
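{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, over the $N=9430$ test ratings these metrics are $\\mathrm{MAE}=\\frac{1}{N}\\sum_k|r_k-\\hat{r}_k|$ and $\\mathrm{MSE}=\\frac{1}{N}\\sum_k(r_k-\\hat{r}_k)^2$; the same two lines are reused after each of the models below."
]
},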
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#produce ratings for the test data based on deviations from each user's mean rating\n",
"predictions=[]\n",
"#for every (user, movie) pair compute the similarity-weighted average deviation\n",
"for row in test.iterrows():\n",
"    user=upos[row[1]['uid']]\n",
"    if row[1]['mid'] not in mpos:\n",
"        predictions.append(2.5)\n",
"        continue\n",
"    movie=mpos[row[1]['mid']]\n",
"    non_zero_users=np.where(ratings[:,movie] >0 )#users who rated \"movie\"\n",
"    weightedRatingSum=0\n",
"    sumSimilarities=0\n",
"    for index in non_zero_users[0]:\n",
"        meanIndexuser=ratings[index,np.where(ratings[index,:] >0 )].mean()\n",
"        weightedRatingSum+=S[index,user]* (ratings[index,movie]-meanIndexuser)\n",
"        sumSimilarities+=S[index,user]\n",
"    predictedRating=2.5\n",
"    if sumSimilarities !=0 :\n",
"        predictedRating=(ratings[user,np.where(ratings[user,:] >0 )].mean()\n",
"                         + weightedRatingSum/sumSimilarities)\n",
"    predictions.append(predictedRating)\n",
"test['pred']=predictions\n",
"#compute the mean absolute error and the mean squared error\n",
"mae=(test['rating']-test['pred']).apply(np.abs).values.mean()\n",
"print mae\n",
"mse=(test['rating']-test['pred']).apply(np.square).values.mean()\n",
"print mse"
]
},
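{
"cell_type": "markdown",
"metadata": {},
"source": [
"This variant predicts the active user's own mean rating plus a similarity-weighted average of the other users' deviations from their means:\n",
"$$\\hat{r}_{u,m}=\\bar{r}_u+\\frac{\\sum_{v:\\,r_{v,m}>0}\\mathrm{sim}(u,v)\\,(r_{v,m}-\\bar{r}_v)}{\\sum_{v:\\,r_{v,m}>0}\\mathrm{sim}(u,v)},$$\n",
"where $\\bar{r}_u$ is the mean of user $u$'s non-zero ratings."
]
},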
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use SVD"
]
},
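{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cells fit a rank-5 model: the per-movie column mean is subtracted, a truncated SVD of the centred matrix is taken, and predictions come from the low-rank reconstruction\n",
"$$\\hat{R}=U_5\\,\\Sigma_5\\,V_5^T+\\bar{R},$$\n",
"where $\\bar{R}$ broadcasts the column means of the zero-filled rating matrix. Note that unrated cells were filled with 0 before centring, so the reconstruction treats them as observed data."
]
},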
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from scipy.sparse.linalg import svds"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#compute a rank-5 truncated SVD of the mean-centred rating matrix\n",
"#(named sigma/Vt here so the user-user similarity matrix S defined above is not overwritten)\n",
"U,sigma,Vt=svds(ratings-ratings.mean(axis=0,keepdims=True),k=5)\n",
"newRatings=U.dot(np.diag(sigma)).dot(Vt)+ratings.mean(axis=0,keepdims=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.63406231903\n",
"8.39674374306\n"
]
}
],
"source": [
"#score the test data with the reconstructed rating matrix\n",
"predictions=[]\n",
"for row in test.iterrows():\n",
"    user=upos[row[1]['uid']]\n",
"    if row[1]['mid'] not in mpos:\n",
"        predictions.append(2.5)\n",
"        continue\n",
"    movie=mpos[row[1]['mid']]\n",
"    predictions.append(newRatings[user,movie])\n",
"test['pred']=predictions\n",
"mae=(test['rating']-test['pred']).apply(np.abs).values.mean()\n",
"print mae\n",
"mse=(test['rating']-test['pred']).apply(np.square).values.mean()\n",
"print mse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradient Descent"
]
},
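{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cells below factor the rating matrix as $Y\\approx AB^T$ (user factors $A$, movie factors $B$, rank 5), minimizing the squared error over the observed entries only:\n",
"$$J(A,B)=\\sum_{(i,j):\\,Y_{ij}>0}\\left(A_iB_j^T-Y_{ij}\\right)^2,$$\n",
"using scipy's conjugate-gradient optimizer (fmin_cg) with an analytic gradient."
]
},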
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#define the cost function\n",
"def costFunction(params, Y, rank):\n",
"\n",
"    numMovies = Y.shape[1]\n",
"    numUsers = Y.shape[0]\n",
"\n",
"    # Unfold the A and B matrices from params\n",
"    A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
"    B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
"\n",
"    # squared error over the observed (non-zero) ratings only\n",
"    diff=A.dot(B.T)-Y\n",
"    error=sum(diff[np.where(Y>0)]**2)\n",
"    return error"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#define the gradient function\n",
"def gradient(params, Y, rank):\n",
"    numMovies = Y.shape[1]\n",
"    numUsers = Y.shape[0]\n",
"    # Unfold the A and B matrices from params\n",
"    A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
"    B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
"    dA=np.zeros(A.shape)\n",
"    for i in range(numUsers):\n",
"        #Ynz: ratings by user i (non-zero entries only)\n",
"        Ynz=Y[i,Y[i,:]!=0]\n",
"        #Bnz: features of the movies rated by user i\n",
"        Bnz=B[Y[i,:]!=0,:]\n",
"        #gradient for row i of the user features\n",
"        dA[i,:]=(A[i,:].dot(Bnz.T)-Ynz).dot(Bnz)\n",
"    dB=np.zeros(B.shape)\n",
"    for i in range(numMovies):\n",
"        Ynz=Y[Y[:,i]!=0,i]\n",
"        Anz=A[Y[:,i]!=0,:]\n",
"        dB[i,:]=(Anz.dot(B[i,:].T)-Ynz).T.dot(Anz)\n",
"    #note: this is the gradient of 0.5*costFunction; the descent direction is unchanged\n",
"    return np.concatenate((dA.flatten('F'),dB.flatten('F')))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Maximum number of iterations has been exceeded.\n",
" Current function value: 62012.337751\n",
" Iterations: 50\n",
" Function evaluations: 175\n",
" Gradient evaluations: 175\n",
"0.796250804199\n",
"1.07159269579\n"
]
}
],
"source": [
"from scipy import optimize\n",
"numMovies = ratings.shape[1]\n",
"numUsers = ratings.shape[0]\n",
"rank = 5\n",
"\n",
"# Set initial parameters A, B at random\n",
"A = np.random.randn(numUsers, rank)\n",
"B = np.random.randn(numMovies, rank)\n",
"initial_parameters = np.concatenate((A.flatten('F'),B.flatten('F')))\n",
"\n",
"params = optimize.fmin_cg(costFunction, initial_parameters,\n",
"                          fprime=gradient, args=(ratings, rank),maxiter=50)\n",
"\n",
"# Unfold the A and B matrices from params\n",
"A = np.reshape(params[:numUsers*rank], (numUsers, rank), order='F')\n",
"B = np.reshape(params[numUsers*rank:], (numMovies, rank), order='F')\n",
"\n",
"# Compute the predictions matrix and score the test data\n",
"newRatings=A.dot(B.T)\n",
"predictions=[]\n",
"for row in test.iterrows():\n",
"    user=upos[row[1]['uid']]\n",
"    if row[1]['mid'] not in mpos:\n",
"        predictions.append(2.5)\n",
"        continue\n",
"    movie=mpos[row[1]['mid']]\n",
"    predictions.append(newRatings[user,movie])\n",
"test['pred']=predictions\n",
"mae=(test['rating']-test['pred']).apply(np.abs).values.mean()\n",
"print mae\n",
"mse=(test['rating']-test['pred']).apply(np.square).values.mean()\n",
"print mse"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}