SHARE
TWEET

Untitled

a guest Jul 22nd, 2019 61 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import time
  2. import tweepy
  3. import pandas as pd
  4. import numpy as np
  5.  
  6. twitterDataFile = '/media/Samsung/tweet_tv3.json'
  7. query = "@tv3"
  8. max_pages_download = 6000
  9.  
  10. def tweets_to_data_frame(tweets):
  11.  
  12.     data = None
  13.     data = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=['Tweets'])
  14.  
  15.     #display(data.head(10))
  16.     try:
  17.         data['tweetIdstr'] = np.array([tweet.id_str for tweet in tweets])
  18.         data['tweetLenght'] = np.array([len(tweet.full_text) for tweet in tweets])
  19.         data['tweetCreateDate'] = np.array([tweet.created_at for tweet in tweets])
  20.         data['tweetSource'] = np.array([tweet.source for tweet in tweets])
  21.         data['tweetLikes'] = np.array([tweet.favorite_count for tweet in tweets])
  22.         data['tweetRetweets'] = np.array([tweet.retweet_count for tweet in tweets])
  23.         data['userLocation'] = np.array([tweet.user.location for tweet in tweets])
  24.         data['userName'] = np.array([tweet.user.name for tweet in tweets])
  25.         data['userScreenName'] = np.array([tweet.user.screen_name for tweet in tweets])
  26.         data['userVerified'] = np.array([tweet.user.verified for tweet in tweets])
  27.         data['userFollowers'] = np.array([tweet.user.followers_count for tweet in tweets])
  28.         data['userIdstr'] = np.array([tweet.user.id_str for tweet in tweets])
  29.         data['userCreateDate'] = np.array([tweet.user.created_at for tweet in tweets])
  30.         data['tweetLan'] = np.array([tweet.lang for tweet in tweets])
  31.  
  32.         # Could be interesting getting tweeter place, but almost all tweets doesnt have location
  33.         data['tweetCoorLan'] = np.array([tweet.coordinates.coordinates[0] for tweet in tweets])
  34.         data['tweetCoordLat'] = np.array([tweet.coordinates.coordinates[1] for tweet in tweets])
  35.         data['tweetPlaceId'] = np.array([tweet.place.id for tweet in tweets])
  36.         data['tweetPlaceName'] = np.array([tweet.place.name for tweet in tweets])
  37.         data['tweetPlaceCountry'] = np.array([tweet.place.country for tweet in tweets])
  38.  
  39.         # We add this exception just in case one tweet cant be read, we continue with next one....
  40.         # Some tweets have some fields Nullable ( can have null or none value)
  41.     except Exception as e:
  42.         print("Error dentro de tweet " + str(last_id))  # df.append(data)
  43.  
  44.     return data
  45.  
  46.  
  47. if __name__ == '__main__':
  48.  
  49.     consumer_key = "r8y13uUfAduFvj8X4dsEdAMWG"
  50.     consumer_secret = "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO"
  51.     access_token = "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO"
  52.     access_token_secret = "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4"
  53.  
  54.     df = pd.DataFrame()
  55.  
  56.     # Creating the authentication  app object
  57.     auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
  58.  
  59.     #  #Now it’s time to create our API object.
  60.     #     #APLLICATION-ONLY AUTHENTICATTION.......
  61.     #     #Requests / window(app auth) segun tabla twitter: https://developer.twitter.com/en/docs/basics/rate-limits
  62.     #     #Requests / window (app auth) en search es de 450 requests por 15 minutos windows
  63.     api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify = True)
  64.     result= []
  65.  
  66.     pag=0
  67.     last_id = None
  68.     results = True
  69.  
  70.     for results in tweepy.Cursor(api.search, q=query, count=100, tweet_mode='extended', max_id=last_id).pages(max_pages_download):
  71.         if pag >= max_pages_download:
  72.             print("\nMaximum quantity of pages...leaving ")
  73.             break
  74.  
  75.         print("\nActual page: " + str(pag))
  76.         dfReaded = None
  77.         dfReaded = tweets_to_data_frame ( results)
  78.         #print (dfReaded) If memory
  79.         df = df.append(dfReaded)
  80.         print("\nNumber of Tweets downloaded so far ....")
  81.         print(df.__len__())
  82.  
  83.         if (len(results) == 0):
  84.             print("There is nothing left to download ....")
  85.             break
  86.         else:
  87.             last_id = results[-1].id - 1
  88.         pag = pag+ 1
  89.  
  90.     print("Tweets to write :")
  91.     print ( len(df.index))
  92.  
  93.     #df = df.set_index(['tweetIdstr'])
  94.  
  95.     df = df.to_json(twitterDataFile,orient ='records',lines= True)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top