Advertisement
Guest User

Untitled

a guest
Jun 24th, 2019
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.77 KB | None | 0 0
  1. print(actors[['primaryName', 'knownForTitles']].head())
  2. primaryName knownForTitles
  3. 0 Rowan Atkinson tt0109831,tt0118689,tt0110357,tt0274166
  4. 1 Bill Paxton tt0112384,tt0117998,tt0264616,tt0090605
  5. 2 Juliette Binoche tt1219827,tt0108394,tt0116209,tt0241303
  6. 3 Linda Fiorentino tt0110308,tt0119654,tt0088680,tt0120655
  7. 4 Richard Linklater tt0243017,tt1065073,tt2209418,tt0405296
  8.  
  9. print(movies[['tconst', 'primaryTitle']].head())
  10. tconst primaryTitle
  11. 0 tt0001604 The Fatal Wedding
  12. 1 tt0002467 Romani, the Brigand
  13. 2 tt0003037 Fantomas: The Man in Black
  14. 3 tt0003593 Across America by Motor Car
  15. 4 tt0003830 Detective Craig's Coup
  16.  
  def add_cast(movie_df, actor_df):
      """Return a copy of ``movie_df`` with a ``cast`` column of actor names.

      For every row of ``movie_df`` the ``tconst`` identifier is looked up
      among the comma-separated identifiers stored in
      ``actor_df['knownForTitles']``. The new ``cast`` cell holds the list of
      matching ``actor_df['primaryName']`` values (in ``actor_df`` row order),
      or the empty string ``""`` when no actor lists that title.

      Args:
          movie_df: DataFrame with a ``tconst`` column. It is not modified.
          actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
              columns.

      Returns:
          A copy of ``movie_df`` with the extra ``cast`` column.
      """
      results = movie_df.copy()
      length = len(results)

      # Build an exact-match lookup (title id -> actor names) in a single pass
      # over the actors. Substring/regex matching would wrongly pair an id with
      # any longer id that merely contains it, and scanning every actor for
      # every movie is quadratic.
      cast_by_title = {}
      for name, titles in zip(actor_df['primaryName'], actor_df['knownForTitles']):
          # Missing values (NaN/None) are not strings and carry no titles.
          if not isinstance(titles, str):
              continue
          # dict.fromkeys de-duplicates while preserving order, so an actor who
          # lists the same title twice is only credited once.
          for title in dict.fromkeys(part.strip() for part in titles.split(',')):
              cast_by_title.setdefault(title, []).append(name)

      cast_column = []
      for position, title in enumerate(results['tconst']):
          names = cast_by_title.get(title)
          # Copy the list so rows sharing a tconst do not alias one object.
          cast_column.append(list(names) if names else "")

          # Progress logging by row position, so it works for any index type.
          if position % 1000 == 0:
              logging.warning('Results location: %s out of %s', position, length)

      # Assign the whole column at once; chained indexing such as
      # ``results.loc[i]['cast'] = ...`` writes to a temporary copy instead.
      results['cast'] = cast_column
      return results
  44.  
  def actors_loop(movie_df, actor_df):
      """Return a copy of ``movie_df`` with a ``cast`` column, iterating actors.

      Walks ``actor_df`` once; for each actor the comma-separated identifiers
      in ``knownForTitles`` are split and every identifier that occurs in
      ``movie_df['tconst']`` gets the actor's ``primaryName`` appended to its
      cast. The resulting ``cast`` cell is a list of names (in ``actor_df``
      row order) or the empty string ``""`` when no actor lists the title,
      matching the output shape of ``add_cast``.

      Args:
          movie_df: DataFrame with a ``tconst`` column. It is not modified.
          actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
              columns.

      Returns:
          A copy of ``movie_df`` with the extra ``cast`` column.
      """
      results = movie_df.copy()
      length = len(actor_df)

      # Set membership replaces a full DataFrame scan per (actor, title) pair.
      known_titles = set(results['tconst'])
      cast_by_title = {}

      # "N" is what the original compared against; the backslash form is the
      # null marker used by IMDb dumps (presumably what was intended - the
      # backslash looks lost in transcription). Neither can match a tconst.
      missing_markers = ("N", r"\N")

      rows = zip(actor_df.index, actor_df['primaryName'], actor_df['knownForTitles'])
      for position, (index, name, value) in enumerate(rows):

          # Skip empties: non-string values (NaN/None) and null markers.
          if not isinstance(value, str) or value.strip() in missing_markers:
              logging.warning('skipping: %s with a value of %s', index, value)
              continue

          # Every distinct movie this actor is known for, in listed order.
          filmography = dict.fromkeys(x.strip() for x in value.split(','))

          for movie in filmography:
              # Ignore titles that are not present in the movie table.
              if movie not in known_titles:
                  continue
              # Accumulate instead of overwriting so earlier actors are kept.
              cast_by_title.setdefault(movie, []).append(name)

          # Progress logging by row position, so it works for any index type.
          if position % 1000 == 0:
              logging.warning('Results location: %s out of %s', position, length)

      # Assign the whole column at once; ``results[mask]['cast'] = ...`` would
      # write to a temporary copy and leave ``results`` unchanged.
      results['cast'] = [
          list(cast_by_title[title]) if title in cast_by_title else ""
          for title in results['tconst']
      ]
      return results
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement