Untitled

print(actors[['primaryName', 'knownForTitles']].head())
         primaryName                           knownForTitles
0     Rowan Atkinson  tt0109831,tt0118689,tt0110357,tt0274166
1        Bill Paxton  tt0112384,tt0117998,tt0264616,tt0090605
2   Juliette Binoche  tt1219827,tt0108394,tt0116209,tt0241303
3   Linda Fiorentino  tt0110308,tt0119654,tt0088680,tt0120655
4  Richard Linklater  tt0243017,tt1065073,tt2209418,tt0405296

print(movies[['tconst', 'primaryTitle']].head())
      tconst                 primaryTitle
0  tt0001604            The Fatal Wedding
1  tt0002467          Romani, the Brigand
2  tt0003037   Fantomas: The Man in Black
3  tt0003593  Across America by Motor Car
4  tt0003830       Detective Craig's Coup

def add_cast(movie_df, actor_df):
    """Return a copy of ``movie_df`` with a ``cast`` column of actor names.

    Each row of ``actor_df`` contributes its ``primaryName`` to every movie
    whose ``tconst`` appears in that row's comma-separated ``knownForTitles``
    string. Identifiers are compared exactly after stripping whitespace, so
    ``tt1`` does not match ``tt10``.

    Args:
        movie_df: DataFrame with a ``tconst`` column. It is not modified.
        actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
            columns. Non-string ``knownForTitles`` values (e.g. NaN) are
            ignored.

    Returns:
        A copy of ``movie_df`` with an extra ``cast`` column. Each cell holds
        a list of names in ``actor_df`` row order, or ``""`` when no actor
        lists the movie.
    """
    results = movie_df.copy()
    length = len(results)

    #index the actors by title once instead of rescanning actor_df per movie
    cast_by_title = {}
    for name, titles in zip(actor_df['primaryName'], actor_df['knownForTitles']):
        #missing values (NaN/None) carry no titles to split
        if not isinstance(titles, str):
            continue

        #dict.fromkeys drops repeated ids within one row but keeps their order
        title_ids = dict.fromkeys(t.strip() for t in titles.split(',') if t.strip())
        for tconst in title_ids:
            cast_by_title.setdefault(tconst, []).append(name)

    #collect the whole column first; writing through results.loc[index]['cast']
    #is a chained assignment that only updates a temporary copy
    cast_column = []
    for position, tconst in enumerate(results['tconst']):
        names = cast_by_title.get(tconst)
        #copy the list so rows sharing a tconst do not share one list object
        cast_column.append(list(names) if names else "")

        #logging (by row position, so non-integer index labels also work)
        if position % 1000 == 0:
            logging.warning(f'Results location: {position} out of {length}')

    results['cast'] = cast_column

    return results

def actors_loop(movie_df, actor_df):
    """Return a copy of ``movie_df`` with a ``cast`` column, driven by actors.

    Walks ``actor_df`` row by row, splits each ``knownForTitles`` string on
    commas and appends the row's ``primaryName`` to every listed title that
    is present in ``movie_df['tconst']``. Titles not present in ``movie_df``
    are ignored.

    Args:
        movie_df: DataFrame with a ``tconst`` column. It is not modified.
        actor_df: DataFrame with ``primaryName`` and ``knownForTitles``
            columns. Rows whose ``knownForTitles`` is not a string, or is the
            null marker ``\\N`` (or a bare ``N``), are skipped with a
            warning.

    Returns:
        A copy of ``movie_df`` with an extra ``cast`` column. Each cell holds
        a list of names in ``actor_df`` row order, or ``""`` when no actor
        lists the movie.
    """
    results = movie_df.copy()
    length = len(actor_df)

    #one bucket per known movie id, so each title check is a dict lookup
    #instead of a boolean scan over the whole movie frame
    cast_by_title = {tconst: [] for tconst in results['tconst']}

    #NOTE(review): the original compared against r"N"; the raw string suggests
    #the backslash-N null marker was intended, so both spellings are skipped
    null_markers = (r"\N", "N")

    #iterate through all actors
    rows = zip(actor_df.index, actor_df['primaryName'], actor_df['knownForTitles'])
    for position, (index, name, value) in enumerate(rows):

        #skip empties (NaN/None cannot be split, null markers list no titles)
        if not isinstance(value, str) or value.strip() in null_markers:
            logging.warning(f'skipping: {index} with a value of {value}')
            continue

        #generate the distinct movies this actor is known for, in listed order
        cinemetography = dict.fromkeys(x.strip() for x in value.split(','))

        #iterate through every movie the actor has been in
        for movie in cinemetography:
            bucket = cast_by_title.get(movie)

            #continue if the movie is not part of movie_df
            if bucket is None:
                continue

            #accumulate rather than overwrite, so earlier actors are kept
            bucket.append(name)

        #logging (by row position, so non-integer index labels also work)
        if position % 1000 == 0:
            logging.warning(f'Results location: {position} out of {length}')

    #assign the column in one step; results[mask]['cast'] = ... is a chained
    #assignment that only updates a temporary copy
    cast_column = []
    for tconst in results['tconst']:
        names = cast_by_title.get(tconst)
        #copy the list so rows sharing a tconst do not share one list object
        cast_column.append(list(names) if names else "")
    results['cast'] = cast_column

    return results