Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib2
- import requests
- import json
- import imdb
- import time
- import itertools
- import wget
- import os
- import tmdbsimple as tmdb
- import numpy as np
- import random
- import matplotlib
- import matplotlib.pyplot as plt
- %matplotlib inline
- import seaborn as sns
- import pickle
- from sklearn.feature_extraction.text import CountVectorizer
- import re
- from sklearn.preprocessing import MultiLabelBinarizer
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.multiclass import OneVsRestClassifier
- from sklearn.svm import SVC
- from sklearn.model_selection import GridSearchCV
- from sklearn.metrics import f1_score
- from sklearn.metrics import make_scorer
- from sklearn.metrics import classification_report
- import pickle
- from sklearn.naive_bayes import MultinomialNB
# --- TMDB API setup -------------------------------------------------------
# NOTE(review): the API key is blank; it must be filled in before any
# network call below can succeed.
api_key = ''
tmdb.API_KEY = api_key  # sets the API key setting for the tmdb module
search = tmdb.Search()  # tmdb "search" object for looking up movies

# Done before, reading from pickle file later to maintain consistency of data!
# We sample up to 100 popular movies per genre; popular movies overlap
# across genres, so duplicates are removed in a later step.
movies = []
baseyear = 2017
print('Starting pulling movies from TMDB. If you want to debug, uncomment the print command. This will take a while, please wait...')
done_ids = []
# NOTE(review): `nr_ids` is not defined in this chunk; presumably it is the
# list of TMDB genre ids to sample -- confirm against the cell that builds it.
for g_id in nr_ids:
    # print('Pulling movies for genre ID ' + str(g_id))
    baseyear -= 1  # step back one year per genre to diversify the sample
    for page in range(1, 6):  # first 5 pages = up to 100 movies per genre
        time.sleep(0.5)  # stay under TMDB's rate limit
        url = (
            'https://api.themoviedb.org/3/discover/movie?api_key=' + api_key
            + '&language=en-US&sort_by=popularity.desc&year=' + str(baseyear)
            + '&with_genres=' + str(g_id) + '&page=' + str(page)
        )
        # `requests` is already imported at the top of the file; it replaces
        # the Python-2-only urllib2 call and decodes the JSON body for us.
        response = requests.get(url)
        response.raise_for_status()  # fail loudly on HTTP errors
        movies.extend(response.json()["results"])
    done_ids.append(str(g_id))
print("Pulled movies for genres - " + ','.join(done_ids))
# Reload the previously-pulled movie list from disk so the dataset stays
# consistent across runs (the network pull above may be skipped).
with open("movies_for_posters", 'rb') as f6:
    movies = pickle.load(f6)

movie_ids = [m['id'] for m in movies]
print("originally we had ", len(movie_ids), " movies")
movie_ids = np.unique(movie_ids)
print(len(movie_ids))

# Drop duplicate movies (same TMDB id). Using a set gives O(1) membership
# tests instead of the original's O(n) list scan, turning the whole pass
# from O(n^2) into O(n).
seen_before = set()
no_duplicate_movies = []
for movie in movies:
    movie_id = movie['id']  # renamed: `id` shadows the builtin
    if movie_id in seen_before:
        continue  # seen before
    seen_before.add(movie_id)
    no_duplicate_movies.append(movie)
print("After removing duplicates we have ", len(no_duplicate_movies), " movies")
# Download a poster image for every movie via the helper grab_poster_tmdb
# (defined elsewhere in the notebook); bookkeep successes and failures.
poster_movies = []     # movies whose poster downloaded successfully
counter = 0
movies_no_poster = []  # movies whose poster could not be fetched
print("Total movies : ", len(movies))
print("Started downloading posters...")
for movie in movies:
    title = movie['title']
    if counter == 1:
        print('Downloaded first. Code is working fine. Please wait, this will take quite some time...')
    if counter % 300 == 0 and counter != 0:
        # periodic progress report every 300 movies
        print("Done with ", counter, " movies!")
        print("Trying to get poster for ", title)
    # Best-effort: any failure -> wait and retry once, then give up on this
    # movie. Catching Exception (not a bare except) keeps Ctrl-C working.
    try:
        grab_poster_tmdb(title)
        poster_movies.append(movie)
    except Exception:
        try:
            time.sleep(7)  # back off in case we hit the rate limit
            grab_poster_tmdb(title)
            poster_movies.append(movie)
        except Exception:
            movies_no_poster.append(movie)
    counter += 1
print("Done with all the posters!")
print(len(movies_no_poster))
print(len(poster_movies))
# Reload the poster bookkeeping from disk. BUG FIX: pickle files are binary,
# so they must be opened with 'rb' -- the original's text-mode 'r' fails on
# Python 3 and corrupts data on Windows even under Python 2.
with open('poster_movies.pckl', 'rb') as f:
    poster_movies = pickle.load(f)
with open('no_poster_movies.pckl', 'rb') as f:
    movies_no_poster = pickle.load(f)
# Keep only movies that actually have a non-empty overview text -- the
# overview is the feature source, so empty ones are useless.
movies_with_overviews = [m for m in no_duplicate_movies
                         if len(m['overview']) != 0]

# Collect the per-movie genre-id lists (multi-label target) plus a flat pool
# of every genre id seen, used below to sanity-check the genre lookup table.
genres = []
all_ids = []
for movie in movies_with_overviews:
    genre_ids = movie['genre_ids']
    genres.append(genre_ids)
    all_ids.extend(genre_ids)

# Binarize the multi-label target: Y[i, j] == 1 iff movie i carries the j-th
# genre, with columns in sorted genre-id order (MultiLabelBinarizer default).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(genres)
print(Y.shape)
print(np.sum(Y, axis=0))
# BUG FIX: the original evaluated len(list_of_genres) here, but
# list_of_genres is defined only in a later cell -- that expression raised
# a NameError and has been removed.
# Create a tmdb genre object!
# NOTE: this rebinds `genres` (previously the per-movie label lists); the
# original did the same, and the label lists are no longer needed here.
genres = tmdb.Genres()
# the list() method of the Genres() class returns a listing of all genres
# in the form of a dictionary: {'genres': [{'id': ..., 'name': ...}, ...]}
list_of_genres = genres.list()['genres']
Genre_ID_to_name = {g['id']: g['name'] for g in list_of_genres}

# Report any genre id present in the data but missing from the TMDB table.
for gid in set(all_ids):
    if gid not in Genre_ID_to_name:
        print(gid)
# TMDB dropped 10769 ("Foreign") from its listing, but older movies still
# carry it, so add it to the dictionary by hand.
Genre_ID_to_name[10769] = "Foreign"
len(Genre_ID_to_name)

# Peek at one movie's overview as a sanity check.
sample_movie = movies_with_overviews[5]
sample_overview = sample_movie['overview']
sample_title = sample_movie['title']
print("The overview for the movie", sample_title, " is - \n\n")
print(sample_overview)
# Build the text corpus: one overview string per movie, with commas and
# periods stripped (crude punctuation removal, as in the original).
content = []
for movie in movies_with_overviews:
    overview = movie['overview'].replace(',', '').replace('.', '')
    content.append(overview)
print(content[0])
print(len(content))

# Bag-of-words features over the overviews.
# min_df=0.005 excludes words that occur in <0.5% of overviews (too rare);
# max_df=0.95 excludes words that occur in >95% of them (near-stopwords).
vectorize = CountVectorizer(max_df=0.95, min_df=0.005)
X = vectorize.fit_transform(content)
X.shape
# Persist the feature matrix, label matrix and genre lookup so later runs
# can skip all the preprocessing above. Context managers guarantee the
# files are flushed and closed even if a dump raises.
with open('X.pckl', 'wb') as f4:
    pickle.dump(X, f4)
with open('Y.pckl', 'wb') as f5:
    pickle.dump(Y, f5)
with open('Genredict.pckl', 'wb') as f6:
    pickle.dump(Genre_ID_to_name, f6)
# Reweight raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)
X_tfidf.shape

# Random ~80/20 train/test split via a boolean row mask.
# NOTE(review): no RNG seed is set, so the split differs on every run.
msk = np.random.rand(X_tfidf.shape[0]) < 0.8
X_train_tfidf = X_tfidf[msk]
X_test_tfidf = X_tfidf[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

# Remember which movies (by position in movies_with_overviews) landed in
# the test split, so predictions can be mapped back to titles/genres later.
positions = range(len(movies_with_overviews))
test_movies = np.asarray(positions)[~msk]
# One-vs-rest SVM: one binary linear SVM per genre, each tuned over C by
# grid search, scored with micro-averaged F1 (suited to multi-label data).
parameters = {'kernel': ['linear'], 'C': [0.01, 0.1, 1.0]}
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters,
                      scoring=make_scorer(f1_score, average='micro'))
classif = OneVsRestClassifier(gridCV)
classif.fit(X_train_tfidf, Y_train)
predstfidf = classif.predict(X_test_tfidf)
# BUG FIX: the original passed an undefined name `genre_names`. Y's columns
# follow sorted-genre-id order (MultiLabelBinarizer default), so derive the
# readable names in that same order.
# NOTE(review): assumes the ids seen in the data match the keys of
# Genre_ID_to_name; verify, otherwise the name list length may mismatch.
genre_names = [Genre_ID_to_name[gid] for gid in sorted(Genre_ID_to_name)]
print(classification_report(Y_test, predstfidf, target_names=genre_names))
# Decode each binary prediction row into a list of genre-name strings.
genre_list = sorted(Genre_ID_to_name.keys())
predictions = []
for row in predstfidf:
    # BUG FIX: the original iterated a hard-coded range(20) of columns;
    # zip pairs every available label column with its genre id and stops
    # safely at the shorter of the two sequences.
    pred_genres = [Genre_ID_to_name[gid]
                   for gid, flag in zip(genre_list, row) if flag != 0]
    predictions.append(pred_genres)

# Persist the trained SVM. The file name keeps the original's spelling
# ('classifer_svc') so other cells that read it still work.
with open('classifer_svc', 'wb') as f:
    pickle.dump(classif, f)
# Show every 50th test-set prediction next to its movie title.
# BUG FIX: the original printed movies_with_overviews[i], pairing test row i
# with the i-th movie of the FULL list; the correct movie for test row i is
# movies_with_overviews[test_movies[i]].
for i in range(X_test_tfidf.shape[0]):
    if i % 50 == 0 and i != 0:
        print('MOVIE: ', movies_with_overviews[test_movies[i]]['title'],
              '\tPREDICTION: ', ','.join(predictions[i]))
# Multinomial Naive Bayes baseline. NB expects non-negative counts, so it is
# fit on the raw bag-of-words X (densified) rather than the TF-IDF matrix.
classifnb = OneVsRestClassifier(MultinomialNB())
classifnb.fit(X[msk].toarray(), Y_train)
predsnb = classifnb.predict(X[~msk].toarray())

# Save the fitted NB model (file name spelling kept as-is for other cells).
with open('classifer_nb', 'wb') as f2:
    pickle.dump(classifnb, f2)

# Decode NB predictions into genre-name lists. BUG FIX: zip against the
# sorted genre-id list instead of the original's hard-coded range(20).
predictionsnb = []
for row in predsnb:
    pred_genres = [Genre_ID_to_name[gid]
                   for gid, flag in zip(genre_list, row) if flag != 0]
    predictionsnb.append(pred_genres)

# Show every 50th test-set NB prediction. BUG FIX: the original indexed the
# full movie list with the test-row index; map through test_movies instead.
for i in range(X_test_tfidf.shape[0]):
    if i % 50 == 0 and i != 0:
        print('MOVIE: ', movies_with_overviews[test_movies[i]]['title'],
              '\tPREDICTION: ', ','.join(predictionsnb[i]))
def precision_recall(gt, preds):
    """Return (precision, recall) for one predicted label list.

    gt    -- list of ground-truth labels.
    preds -- list of predicted labels.

    Each element of gt found in preds is a true positive, each element of
    gt missing from preds a false negative, and each element of preds
    absent from gt a false positive. Either metric is 0 when its
    denominator would be 0.
    """
    true_pos = len([label for label in gt if label in preds])
    false_neg = len(gt) - true_pos
    false_pos = len([label for label in preds if label not in gt])

    precision = true_pos / float(true_pos + false_pos) if true_pos + false_pos else 0
    recall = true_pos / float(true_pos + false_neg) if true_pos + false_neg else 0
    return precision, recall
# Mean per-movie precision/recall of the SVM predictions over the test set.
# (The original guarded the body with `if i % 1 == 0`, which is always true
# and has been removed; enumerate pairs each test row with its position in
# movies_with_overviews.)
precs = []
recs = []
for i, pos in enumerate(test_movies):
    test_movie = movies_with_overviews[pos]
    # Translate the ground-truth genre ids into names for comparison.
    gt = [Genre_ID_to_name[g] for g in test_movie['genre_ids']]
    # print(predictions[i], test_movie['title'], gt)
    a, b = precision_recall(gt, predictions[i])
    precs.append(a)
    recs.append(b)
print(np.mean(np.asarray(precs)), np.mean(np.asarray(recs)))
# Mean per-movie precision/recall of the Naive Bayes predictions over the
# test set. (The original's always-true `if i % 1 == 0` guard is removed;
# enumerate pairs each test row with its source-movie position.)
precs = []
recs = []
for i, pos in enumerate(test_movies):
    test_movie = movies_with_overviews[pos]
    # Translate the ground-truth genre ids into names for comparison.
    gt = [Genre_ID_to_name[g] for g in test_movie['genre_ids']]
    # print(predictionsnb[i], test_movie['title'], gt)
    a, b = precision_recall(gt, predictionsnb[i])
    precs.append(a)
    recs.append(b)
print(np.mean(np.asarray(precs)), np.mean(np.asarray(recs)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement