Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import tweepy
- import pandas as pd
- import numpy as np
# Output path for the downloaded tweets (one JSON object per line).
twitterDataFile = '/media/Samsung/tweet_tv3.json'
# Twitter search query: all tweets mentioning @tv3cat.
query = "@tv3cat"
# Upper bound on the number of result pages fetched (100 tweets per page).
max_pages_download = 10000
def tweets_to_data_frame(tweets):
    """Build a pandas DataFrame from an iterable of tweepy Status objects.

    Expects extended-mode statuses (``full_text`` attribute present), as
    returned by ``api.search`` with ``tweet_mode='extended'``.

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing ``full_text``, ``id_str``, ``created_at``,
        ``source``, ``favorite_count``, ``retweet_count``, ``lang``, a
        ``user`` object, and nullable ``coordinates`` / ``place`` objects.

    Returns
    -------
    pandas.DataFrame
        One row per tweet. On an unexpected attribute failure, the
        partially filled frame built so far is returned (the original
        best-effort policy is preserved).
    """
    tweets = list(tweets)  # allow multiple passes if a generator is passed in
    data = pd.DataFrame(data=[t.full_text for t in tweets], columns=['tweetText'])
    try:
        data['tweetIdstr'] = [t.id_str for t in tweets]
        # NOTE: column name kept misspelled ('tweetLenght') for backward
        # compatibility with existing consumers of the JSON output file.
        data['tweetLenght'] = [len(t.full_text) for t in tweets]
        data['tweetCreateDate'] = [t.created_at for t in tweets]
        data['tweetSource'] = [t.source for t in tweets]
        data['tweetLikes'] = [t.favorite_count for t in tweets]
        data['tweetRetweets'] = [t.retweet_count for t in tweets]
        data['userLocation'] = [t.user.location for t in tweets]
        data['userName'] = [t.user.name for t in tweets]
        data['userScreenName'] = [t.user.screen_name for t in tweets]
        data['userVerified'] = [t.user.verified for t in tweets]
        data['userFollowers'] = [t.user.followers_count for t in tweets]
        data['userIdstr'] = [t.user.id_str for t in tweets]
        data['userCreateDate'] = [t.user.created_at for t in tweets]
        data['tweetLan'] = [t.lang for t in tweets]
        # 'coordinates' and 'place' are nullable in the Twitter API (most
        # tweets carry no geo data). Guard each access so a single geo-less
        # tweet no longer aborts all six geo columns for the whole page.
        data['tweetCoorLan'] = [t.coordinates.coordinates[0] if t.coordinates else None for t in tweets]
        data['tweetCoordLat'] = [t.coordinates.coordinates[1] if t.coordinates else None for t in tweets]
        data['tweetPlaceId'] = [t.place.id if t.place else None for t in tweets]
        data['tweetPlaceName'] = [t.place.name if t.place else None for t in tweets]
        data['tweetPlaceCountry'] = [t.place.country if t.place else None for t in tweets]
    except Exception as e:
        # Best-effort: keep whatever columns were built before the failure.
        # (The original printed the undefined global 'last_id' here, which
        # raised NameError and masked the real error when run standalone.)
        print("Error while building tweet frame: " + e.__str__())
    return data
if __name__ == '__main__':
    import os

    # SECURITY: credentials must not live in source control. They are read
    # from the environment first; the literals remain only as a
    # backward-compatible fallback and should be rotated and removed.
    consumer_key = os.getenv('TWITTER_CONSUMER_KEY', "r8y13uUfAduFvj8X4dsEdAMWG")
    consumer_secret = os.getenv('TWITTER_CONSUMER_SECRET', "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO")
    access_token = os.getenv('TWITTER_ACCESS_TOKEN', "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO")
    access_token_secret = os.getenv('TWITTER_ACCESS_TOKEN_SECRET', "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4")

    # Application-only authentication: search allows 450 requests per
    # 15-minute window; wait_on_rate_limit makes tweepy sleep through it.
    # https://developer.twitter.com/en/docs/basics/rate-limits
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append in a loop is O(n^2) and was removed in pandas 2.x.
    frames = []
    pag = 0
    total = 0
    # Kept because tweets_to_data_frame's error path reads this global.
    last_id = None
    # tweepy's Cursor handles max_id pagination internally; the original
    # passed max_id=last_id (always None at creation) and re-assigned
    # last_id inside the loop, which never reached the cursor — dead code.
    for results in tweepy.Cursor(api.search, q=query, count=100,
                                 tweet_mode='extended').pages(max_pages_download):
        print("\nActual page: " + str(pag))
        if len(results) == 0:
            print("There is nothing left to download ....")
            break
        frames.append(tweets_to_data_frame(results))
        total += len(results)
        print("\nNumber of Tweets downloaded so far ....")
        print(total)
        last_id = results[-1].id - 1  # oldest tweet on this page, minus one
        pag = pag + 1

    df = pd.concat(frames) if frames else pd.DataFrame()
    print("Tweets to write :")
    print(len(df.index))
    # One JSON object per line, matching the original output format.
    df.to_json(twitterDataFile, orient='records', lines=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement