Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2019
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.13 KB | None | 0 0
  1. import urllib2
  2. import requests
  3. import json
  4. import imdb
  5. import time
  6. import itertools
  7. import wget
  8. import os
  9. import tmdbsimple as tmdb
  10. import numpy as np
  11. import random
  12. import matplotlib
  13. import matplotlib.pyplot as plt
  14. %matplotlib inline
  15. import seaborn as sns
  16. import pickle
  17. from sklearn.feature_extraction.text import CountVectorizer
  18. import re
  19. from sklearn.preprocessing import MultiLabelBinarizer
  20. from sklearn.feature_extraction.text import TfidfTransformer
  21. from sklearn.multiclass import OneVsRestClassifier
  22. from sklearn.svm import SVC
  23. from sklearn.model_selection import GridSearchCV
  24. from sklearn.metrics import f1_score
  25. from sklearn.metrics import make_scorer
  26. from sklearn.metrics import classification_report
  27. import pickle
  28. from sklearn.naive_bayes import MultinomialNB
  29.  
  30. api_key = ''
  31. tmdb.API_KEY = api_key #This sets the API key setting for the tmdb object
  32. search = tmdb.Search() #this instantiates a tmdb "search" object which allows your to search for the movie
  33.  
  34. # Done before, reading from pickle file now to maintain consistency of data!
  35. # We now sample 100 movies per genre. Problem is that the sorting is by popular movies, so they will overlap.
  36. # Need to exclude movies that were already sampled.
  37. movies = []
  38. baseyear = 2017
  39.  
  40. print('Starting pulling movies from TMDB. If you want to debug, uncomment the print command. This will take a while, please wait...')
  41. done_ids=[]
  42. for g_id in nr_ids:
  43. #print('Pulling movies for genre ID '+g_id)
  44. baseyear -= 1
  45. for page in xrange(1,6,1):
  46. time.sleep(0.5)
  47.  
  48. url = 'https://api.themoviedb.org/3/discover/movie?api_key=' + api_key
  49. url += '&language=en-US&sort_by=popularity.desc&year=' + str(baseyear)
  50. url += '&with_genres=' + str(g_id) + '&page=' + str(page)
  51.  
  52. data = urllib2.urlopen(url).read()
  53.  
  54. dataDict = json.loads(data)
  55. movies.extend(dataDict["results"])
  56. done_ids.append(str(g_id))
  57. print("Pulled movies for genres - "+','.join(done_ids))
  58.  
  59. f6=open("movies_for_posters",'rb')
  60. movies=pickle.load(f6)
  61. f6.close()
  62.  
  63. movie_ids = [m['id'] for m in movies]
  64. print "originally we had ",len(movie_ids)," movies"
  65. movie_ids=np.unique(movie_ids)
  66. print len(movie_ids)
  67. seen_before=[]
  68. no_duplicate_movies=[]
  69. for i in range(len(movies)):
  70. movie=movies[i]
  71. id=movie['id']
  72. if id in seen_before:
  73. continue
  74. # print "Seen before"
  75. else:
  76. seen_before.append(id)
  77. no_duplicate_movies.append(movie)
  78. print "After removing duplicates we have ",len(no_duplicate_movies), " movies"
  79.  
  80. poster_movies=[]
  81. counter=0
  82. movies_no_poster=[]
  83. print("Total movies : ",len(movies))
  84. print("Started downloading posters...")
  85. for movie in movies:
  86. id=movie['id']
  87. title=movie['title']
  88. if counter==1:
  89. print('Downloaded first. Code is working fine. Please wait, this will take quite some time...')
  90. if counter%300==0 and counter!=0:
  91. print "Done with ",counter," movies!"
  92. print "Trying to get poster for ",title
  93. try:
  94. grab_poster_tmdb(title)
  95. poster_movies.append(movie)
  96. except:
  97. try:
  98. time.sleep(7)
  99. grab_poster_tmdb(title)
  100. poster_movies.append(movie)
  101. except:
  102. movies_no_poster.append(movie)
  103. counter+=1
  104. print("Done with all the posters!")
  105.  
  106. print len(movies_no_poster)
  107. print len(poster_movies)
  108.  
  109. f=open('poster_movies.pckl','r')
  110. poster_movies=pickle.load(f)
  111. f.close()
  112.  
  113. f=open('no_poster_movies.pckl','r')
  114. movies_no_poster=pickle.load(f)
  115. f.close()
  116.  
  117. movies_with_overviews=[]
  118. for i in range(len(no_duplicate_movies)):
  119. movie=no_duplicate_movies[i]
  120. id=movie['id']
  121. overview=movie['overview']
  122.  
  123. if len(overview)==0:
  124. continue
  125. else:
  126. movies_with_overviews.append(movie)
  127.  
  128. len(movies_with_overviews)
  129.  
  130. # genres=np.zeros((len(top1000_movies),3))
  131. genres=[]
  132. all_ids=[]
  133. for i in range(len(movies_with_overviews)):
  134. movie=movies_with_overviews[i]
  135. id=movie['id']
  136. genre_ids=movie['genre_ids']
  137. genres.append(genre_ids)
  138. all_ids.extend(genre_ids)
  139.  
  140. mlb=MultiLabelBinarizer()
  141. Y=mlb.fit_transform(genres)
  142.  
  143. print Y.shape
  144. print np.sum(Y, axis=0)
  145.  
  146. len(list_of_genres)
  147.  
  148. # Create a tmdb genre object!
  149. genres=tmdb.Genres()
  150. # the list() method of the Genres() class returns a listing of all genres in the form of a dictionary.
  151. list_of_genres=genres.list()['genres']
  152. Genre_ID_to_name={}
  153. for i in range(len(list_of_genres)):
  154. genre_id=list_of_genres[i]['id']
  155. genre_name=list_of_genres[i]['name']
  156. Genre_ID_to_name[genre_id]=genre_name
  157. for i in set(all_ids):
  158. if i not in Genre_ID_to_name.keys():
  159. print i
  160.  
  161. Genre_ID_to_name[10769]="Foreign" #Adding it to the dictionary
  162.  
  163. len(Genre_ID_to_name.keys())
  164.  
  165. sample_movie=movies_with_overviews[5]
  166. sample_overview=sample_movie['overview']
  167. sample_title=sample_movie['title']
  168. print "The overview for the movie",sample_title," is - \n\n"
  169. print sample_overview
  170.  
  171. content=[]
  172. for i in range(len(movies_with_overviews)):
  173. movie=movies_with_overviews[i]
  174. id=movie['id']
  175. overview=movie['overview']
  176. overview=overview.replace(',','')
  177. overview=overview.replace('.','')
  178. content.append(overview)
  179.  
  180. print content[0]
  181. print len(content)
  182.  
  183. # The min_df paramter makes sure we exclude words that only occur very rarely
  184. # The default also is to exclude any words that occur in every movie description
  185. vectorize=CountVectorizer(max_df=0.95, min_df=0.005)
  186. X=vectorize.fit_transform(content)
  187.  
  188. X.shape
  189.  
  190. f4=open('X.pckl','wb')
  191. f5=open('Y.pckl','wb')
  192. pickle.dump(X,f4)
  193. pickle.dump(Y,f5)
  194. f6=open('Genredict.pckl','wb')
  195. pickle.dump(Genre_ID_to_name,f6)
  196. f4.close()
  197. f5.close()
  198. f6.close()
  199.  
  200. tfidf_transformer = TfidfTransformer()
  201. X_tfidf = tfidf_transformer.fit_transform(X)
  202. X_tfidf.shape
  203.  
  204. msk = np.random.rand(X_tfidf.shape[0]) < 0.8
  205. X_train_tfidf=X_tfidf[msk]
  206. X_test_tfidf=X_tfidf[~msk]
  207. Y_train=Y[msk]
  208. Y_test=Y[~msk]
  209. positions=range(len(movies_with_overviews))
  210. # print positions
  211. test_movies=np.asarray(positions)[~msk]
  212. # test_movies
  213.  
  214. parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0]}
  215. gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
  216. classif = OneVsRestClassifier(gridCV)
  217.  
  218. classif.fit(X_train_tfidf, Y_train)
  219.  
  220. predstfidf=classif.predict(X_test_tfidf)
  221.  
  222. print classification_report(Y_test, predstfidf, target_names=genre_names)
  223.  
  224. genre_list=sorted(list(Genre_ID_to_name.keys()))
  225. predictions=[]
  226. for i in range(X_test_tfidf.shape[0]):
  227. pred_genres=[]
  228. movie_label_scores=predstfidf[i]
  229. # print movie_label_scores
  230. for j in range(20):
  231. #print j
  232. if movie_label_scores[j]!=0:
  233. genre=Genre_ID_to_name[genre_list[j]]
  234. pred_genres.append(genre)
  235. predictions.append(pred_genres)
  236.  
  237. f=open('classifer_svc','wb')
  238. pickle.dump(classif,f)
  239. f.close()
  240. for i in range(X_test_tfidf.shape[0]):
  241. if i%50==0 and i!=0:
  242. print 'MOVIE: ',movies_with_overviews[i]['title'],'\tPREDICTION: ',','.join(predictions[i])
  243.  
  244. classifnb = OneVsRestClassifier(MultinomialNB())
  245. classifnb.fit(X[msk].toarray(), Y_train)
  246. predsnb=classifnb.predict(X[~msk].toarray())
  247. f2=open('classifer_nb','wb')
  248. pickle.dump(classifnb,f2)
  249. f2.close()
  250.  
  251. predictionsnb=[]
  252. for i in range(X_test_tfidf.shape[0]):
  253. pred_genres=[]
  254. movie_label_scores=predsnb[i]
  255. for j in range(20):
  256. #print j
  257. if movie_label_scores[j]!=0:
  258. genre=Genre_ID_to_name[genre_list[j]]
  259. pred_genres.append(genre)
  260. predictionsnb.append(pred_genres)
  261. for i in range(X_test_tfidf.shape[0]):
  262. if i%50==0 and i!=0:
  263. print 'MOVIE: ',movies_with_overviews[i]['title'],'\tPREDICTION: ',','.join(predictionsnb[i])
  264.  
  265. def precision_recall(gt,preds):
  266. TP=0
  267. FP=0
  268. FN=0
  269. for t in gt:
  270. if t in preds:
  271. TP+=1
  272. else:
  273. FN+=1
  274. for p in preds:
  275. if p not in gt:
  276. FP+=1
  277. if TP+FP==0:
  278. precision=0
  279. else:
  280. precision=TP/float(TP+FP)
  281. if TP+FN==0:
  282. recall=0
  283. else:
  284. recall=TP/float(TP+FN)
  285. return precision,recall
  286. precs=[]
  287. recs=[]
  288. for i in range(len(test_movies)):
  289. if i%1==0:
  290. pos=test_movies[i]
  291. test_movie=movies_with_overviews[pos]
  292. gtids=test_movie['genre_ids']
  293. gt=[]
  294. for g in gtids:
  295. g_name=Genre_ID_to_name[g]
  296. gt.append(g_name)
  297. # print predictions[i],movies_with_overviews[i]['title'],gt
  298. a,b=precision_recall(gt,predictions[i])
  299. precs.append(a)
  300. recs.append(b)
  301.  
  302. print np.mean(np.asarray(precs)),np.mean(np.asarray(recs))
  303.  
  304. precs=[]
  305. recs=[]
  306. for i in range(len(test_movies)):
  307. if i%1==0:
  308. pos=test_movies[i]
  309. test_movie=movies_with_overviews[pos]
  310. gtids=test_movie['genre_ids']
  311. gt=[]
  312. for g in gtids:
  313. g_name=Genre_ID_to_name[g]
  314. gt.append(g_name)
  315. # print predictions[i],movies_with_overviews[i]['title'],gt
  316. a,b=precision_recall(gt,predictionsnb[i])
  317. precs.append(a)
  318. recs.append(b)
  319.  
  320. print np.mean(np.asarray(precs)),np.mean(np.asarray(recs))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement