Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- print(actors[['primaryName', 'knownForTitles']].head())
- primaryName knownForTitles
- 0 Rowan Atkinson tt0109831,tt0118689,tt0110357,tt0274166
- 1 Bill Paxton tt0112384,tt0117998,tt0264616,tt0090605
- 2 Juliette Binoche tt1219827,tt0108394,tt0116209,tt0241303
- 3 Linda Fiorentino tt0110308,tt0119654,tt0088680,tt0120655
- 4 Richard Linklater tt0243017,tt1065073,tt2209418,tt0405296
- print(movies[['tconst', 'primaryTitle']].head())
- tconst primaryTitle
- 0 tt0001604 The Fatal Wedding
- 1 tt0002467 Romani, the Brigand
- 2 tt0003037 Fantomas: The Man in Black
- 3 tt0003593 Across America by Motor Car
- 4 tt0003830 Detective Craig's Coup
def add_cast(movie_df, actor_df):
    """Return a copy of ``movie_df`` with a ``cast`` column added.

    Each row's ``cast`` is the list of ``primaryName`` values of the actors
    whose comma-separated ``knownForTitles`` field contains that row's
    ``tconst`` as a whole id. Rows with no matching actor keep the empty
    string ``""``. Names appear in ``actor_df`` row order.

    Args:
        movie_df: DataFrame with a ``tconst`` column of title ids.
        actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
            columns; ``knownForTitles`` holds comma-separated title ids.

    Returns:
        A new DataFrame; ``movie_df`` itself is not modified.
    """
    results = movie_df.copy()
    length = len(results)

    # Build a title id -> [actor names] lookup in a single pass over the
    # actors, instead of scanning every actor once per movie. Splitting on
    # commas also gives exact id matching: a substring test would let a
    # short id match inside a longer one.
    cast_by_title = {}
    for name, titles in zip(actor_df['primaryName'], actor_df['knownForTitles']):
        # Missing values (NaN / None) are not strings and carry no titles.
        if not isinstance(titles, str):
            continue
        # dict.fromkeys de-duplicates while keeping order, so an actor who
        # lists the same title twice is still added only once.
        for title in dict.fromkeys(part.strip() for part in titles.split(',')):
            if title:
                cast_by_title.setdefault(title, []).append(name)

    # Collect the column in plain Python and assign it once; writing through
    # ``results.loc[index]['cast']`` only modifies a temporary copy.
    cast_column = []
    for position, value in enumerate(results['tconst']):
        names = cast_by_title.get(value)
        # Copy the list so rows sharing a title id do not share one object.
        cast_column.append(list(names) if names else "")
        # Progress logging by row position, so it works for any index type.
        if position % 1000 == 0:
            logging.warning(f'Results location: {position} out of {length}')

    results['cast'] = cast_column
    return results
def actors_loop(movie_df, actor_df):
    """Return a copy of ``movie_df`` with a ``cast`` column, built actor-first.

    Walks the actors once, splits each ``knownForTitles`` value on commas,
    and records the actor's ``primaryName`` against every listed title id
    that occurs in ``movie_df['tconst']``. Each movie's ``cast`` is the list
    of recorded names in ``actor_df`` row order; movies with no recorded
    actor keep the empty string ``""``.

    Args:
        movie_df: DataFrame with a ``tconst`` column of title ids.
        actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
            columns; ``knownForTitles`` holds comma-separated title ids.

    Returns:
        A new DataFrame; ``movie_df`` itself is not modified.
    """
    results = movie_df.copy()
    length = len(actor_df)

    # Only titles that actually occur in the movie table need to be tracked.
    known_titles = set(results['tconst'])
    cast_by_title = {}

    # Pair the columns positionally, so repeated index labels cannot turn a
    # name lookup into a Series.
    rows = zip(actor_df.index, actor_df['primaryName'], actor_df['knownForTitles'])
    for position, (index, name, value) in enumerate(rows):
        # Skip empties: non-string values (NaN / None) and the missing-value
        # marker. NOTE(review): the original compared against "N"; this looks
        # like the "\N" null marker with its backslash lost, so both
        # spellings are accepted -- confirm against the data source.
        if not isinstance(value, str) or value in ("N", "\\N"):
            logging.warning(f'skipping: {index} with a value of {value}')
            continue

        # Every distinct title this actor is known for, in listed order.
        filmography = dict.fromkeys(part.strip() for part in value.split(','))
        for movie in filmography:
            # Ignore titles that are not present in the movie table.
            if movie in known_titles:
                cast_by_title.setdefault(movie, []).append(name)

        # Progress logging by row position, so it works for any index type.
        if position % 1000 == 0:
            logging.warning(f'Results location: {position} out of {length}')

    # Assign the finished column in one step; writing through
    # ``results[mask]['cast']`` only modifies a temporary copy.
    results['cast'] = [
        list(cast_by_title[title]) if title in cast_by_title else ""
        for title in results['tconst']
    ]
    return results
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement