Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import tweepy
- import pandas as pd
- import numpy as np
# Output path for the downloaded tweets (one JSON object per line).
twitterDataFile = '/media/Samsung/tweet_tv3.json'
# Twitter search query: all tweets mentioning @tv3cat.
query = "@tv3cat"
# Upper bound on the number of result pages fetched (100 tweets per page).
max_pages_download = 10000
def tweets_to_data_frame(tweets):
    """Build a pandas DataFrame from an iterable of tweepy Status objects.

    Expects extended-mode statuses (``full_text`` attribute present), as
    returned by ``api.search`` with ``tweet_mode='extended'``.

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing ``full_text``, ``id_str``, ``created_at``,
        ``source``, ``favorite_count``, ``retweet_count``, ``lang``, a
        ``user`` object, and nullable ``coordinates`` / ``place`` objects.

    Returns
    -------
    pandas.DataFrame
        One row per tweet. On an unexpected attribute failure, the
        partially filled frame built so far is returned (the original
        best-effort policy is preserved).
    """
    tweets = list(tweets)  # allow multiple passes if a generator is passed in
    data = pd.DataFrame(data=[t.full_text for t in tweets], columns=['tweetText'])
    try:
        data['tweetIdstr'] = [t.id_str for t in tweets]
        # NOTE: column name kept misspelled ('tweetLenght') for backward
        # compatibility with existing consumers of the JSON output file.
        data['tweetLenght'] = [len(t.full_text) for t in tweets]
        data['tweetCreateDate'] = [t.created_at for t in tweets]
        data['tweetSource'] = [t.source for t in tweets]
        data['tweetLikes'] = [t.favorite_count for t in tweets]
        data['tweetRetweets'] = [t.retweet_count for t in tweets]
        data['userLocation'] = [t.user.location for t in tweets]
        data['userName'] = [t.user.name for t in tweets]
        data['userScreenName'] = [t.user.screen_name for t in tweets]
        data['userVerified'] = [t.user.verified for t in tweets]
        data['userFollowers'] = [t.user.followers_count for t in tweets]
        data['userIdstr'] = [t.user.id_str for t in tweets]
        data['userCreateDate'] = [t.user.created_at for t in tweets]
        data['tweetLan'] = [t.lang for t in tweets]
        # 'coordinates' and 'place' are nullable in the Twitter API (most
        # tweets carry no geo data). Guard each access so a single geo-less
        # tweet no longer aborts all six geo columns for the whole page.
        data['tweetCoorLan'] = [t.coordinates.coordinates[0] if t.coordinates else None for t in tweets]
        data['tweetCoordLat'] = [t.coordinates.coordinates[1] if t.coordinates else None for t in tweets]
        data['tweetPlaceId'] = [t.place.id if t.place else None for t in tweets]
        data['tweetPlaceName'] = [t.place.name if t.place else None for t in tweets]
        data['tweetPlaceCountry'] = [t.place.country if t.place else None for t in tweets]
    except Exception as e:
        # Best-effort: keep whatever columns were built before the failure.
        # (The original printed the undefined global 'last_id' here, which
        # raised NameError and masked the real error when run standalone.)
        print("Error while building tweet frame: " + e.__str__())
    return data
if __name__ == '__main__':
    import os

    # SECURITY: credentials must not live in source control. They are read
    # from the environment first; the literals remain only as a
    # backward-compatible fallback and should be rotated and removed.
    consumer_key = os.getenv('TWITTER_CONSUMER_KEY', "r8y13uUfAduFvj8X4dsEdAMWG")
    consumer_secret = os.getenv('TWITTER_CONSUMER_SECRET', "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO")
    access_token = os.getenv('TWITTER_ACCESS_TOKEN', "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO")
    access_token_secret = os.getenv('TWITTER_ACCESS_TOKEN_SECRET', "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4")

    # Application-only authentication: search allows 450 requests per
    # 15-minute window; wait_on_rate_limit makes tweepy sleep through it.
    # https://developer.twitter.com/en/docs/basics/rate-limits
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append in a loop is O(n^2) and was removed in pandas 2.x.
    frames = []
    pag = 0
    total = 0
    # Kept because tweets_to_data_frame's error path reads this global.
    last_id = None
    # tweepy's Cursor handles max_id pagination internally; the original
    # passed max_id=last_id (always None at creation) and re-assigned
    # last_id inside the loop, which never reached the cursor — dead code.
    for results in tweepy.Cursor(api.search, q=query, count=100,
                                 tweet_mode='extended').pages(max_pages_download):
        print("\nActual page: " + str(pag))
        if len(results) == 0:
            print("There is nothing left to download ....")
            break
        frames.append(tweets_to_data_frame(results))
        total += len(results)
        print("\nNumber of Tweets downloaded so far ....")
        print(total)
        last_id = results[-1].id - 1  # oldest tweet on this page, minus one
        pag = pag + 1

    df = pd.concat(frames) if frames else pd.DataFrame()
    print("Tweets to write :")
    print(len(df.index))
    # One JSON object per line, matching the original output format.
    df.to_json(twitterDataFile, orient='records', lines=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement