Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2019
155
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.08 KB | None | 0 0
  1. import time
  2. import tweepy
  3. import pandas as pd
  4. import numpy as np
  5.  
  6. twitterDataFile = '/media/Samsung/tweet_tv3.json'
  7. query = "@tv3"
  8. max_pages_download = 6000
  9.  
  10. def tweets_to_data_frame(tweets):
  11.  
  12. data = None
  13. data = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=['Tweets'])
  14.  
  15. #display(data.head(10))
  16. try:
  17. data['tweetIdstr'] = np.array([tweet.id_str for tweet in tweets])
  18. data['tweetLenght'] = np.array([len(tweet.full_text) for tweet in tweets])
  19. data['tweetCreateDate'] = np.array([tweet.created_at for tweet in tweets])
  20. data['tweetSource'] = np.array([tweet.source for tweet in tweets])
  21. data['tweetLikes'] = np.array([tweet.favorite_count for tweet in tweets])
  22. data['tweetRetweets'] = np.array([tweet.retweet_count for tweet in tweets])
  23. data['userLocation'] = np.array([tweet.user.location for tweet in tweets])
  24. data['userName'] = np.array([tweet.user.name for tweet in tweets])
  25. data['userScreenName'] = np.array([tweet.user.screen_name for tweet in tweets])
  26. data['userVerified'] = np.array([tweet.user.verified for tweet in tweets])
  27. data['userFollowers'] = np.array([tweet.user.followers_count for tweet in tweets])
  28. data['userIdstr'] = np.array([tweet.user.id_str for tweet in tweets])
  29. data['userCreateDate'] = np.array([tweet.user.created_at for tweet in tweets])
  30. data['tweetLan'] = np.array([tweet.lang for tweet in tweets])
  31.  
  32. # Could be interesting getting tweeter place, but almost all tweets doesnt have location
  33. data['tweetCoorLan'] = np.array([tweet.coordinates.coordinates[0] for tweet in tweets])
  34. data['tweetCoordLat'] = np.array([tweet.coordinates.coordinates[1] for tweet in tweets])
  35. data['tweetPlaceId'] = np.array([tweet.place.id for tweet in tweets])
  36. data['tweetPlaceName'] = np.array([tweet.place.name for tweet in tweets])
  37. data['tweetPlaceCountry'] = np.array([tweet.place.country for tweet in tweets])
  38.  
  39. # We add this exception just in case one tweet cant be read, we continue with next one....
  40. # Some tweets have some fields Nullable ( can have null or none value)
  41. except Exception as e:
  42. print("Error dentro de tweet " + str(last_id)) # df.append(data)
  43.  
  44. return data
  45.  
  46.  
  47. if __name__ == '__main__':
  48.  
  49. consumer_key = "r8y13uUfAduFvj8X4dsEdAMWG"
  50. consumer_secret = "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO"
  51. access_token = "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO"
  52. access_token_secret = "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4"
  53.  
  54. df = pd.DataFrame()
  55.  
  56. # Creating the authentication app object
  57. auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
  58.  
  59. # #Now it’s time to create our API object.
  60. # #APLLICATION-ONLY AUTHENTICATTION.......
  61. # #Requests / window(app auth) segun tabla twitter: https://developer.twitter.com/en/docs/basics/rate-limits
  62. # #Requests / window (app auth) en search es de 450 requests por 15 minutos windows
  63. api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify = True)
  64. result= []
  65.  
  66. pag=0
  67. last_id = None
  68. results = True
  69.  
  70. for results in tweepy.Cursor(api.search, q=query, count=100, tweet_mode='extended', max_id=last_id).pages(max_pages_download):
  71. if pag >= max_pages_download:
  72. print("\nMaximum quantity of pages...leaving ")
  73. break
  74.  
  75. print("\nActual page: " + str(pag))
  76. dfReaded = None
  77. dfReaded = tweets_to_data_frame ( results)
  78. #print (dfReaded) If memory
  79. df = df.append(dfReaded)
  80. print("\nNumber of Tweets downloaded so far ....")
  81. print(df.__len__())
  82.  
  83. if (len(results) == 0):
  84. print("There is nothing left to download ....")
  85. break
  86. else:
  87. last_id = results[-1].id - 1
  88. pag = pag+ 1
  89.  
  90. print("Tweets to write :")
  91. print ( len(df.index))
  92.  
  93. #df = df.set_index(['tweetIdstr'])
  94.  
  95. df = df.to_json(twitterDataFile,orient ='records',lines= True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement