Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2019
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.00 KB | None | 0 0
import os
import time

import numpy as np
import pandas as pd
import tweepy
  5.  
  6. twitterDataFile = '/media/Samsung/tweet_tv3.json'
  7. query = "@tv3cat"
  8. max_pages_download = 10000
  9.  
  10.  
  11. def tweets_to_data_frame(tweets):
  12. # data = None
  13. data = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=['tweetText'])
  14.  
  15. # display(data.head(10))
  16. try:
  17. data['tweetIdstr'] = np.array([tweet.id_str for tweet in tweets])
  18. data['tweetLenght'] = np.array([len(tweet.full_text) for tweet in tweets])
  19. data['tweetCreateDate'] = np.array([tweet.created_at for tweet in tweets])
  20. data['tweetSource'] = np.array([tweet.source for tweet in tweets])
  21. data['tweetLikes'] = np.array([tweet.favorite_count for tweet in tweets])
  22. data['tweetRetweets'] = np.array([tweet.retweet_count for tweet in tweets])
  23. data['userLocation'] = np.array([tweet.user.location for tweet in tweets])
  24. data['userName'] = np.array([tweet.user.name for tweet in tweets])
  25. data['userScreenName'] = np.array([tweet.user.screen_name for tweet in tweets])
  26. data['userVerified'] = np.array([tweet.user.verified for tweet in tweets])
  27. data['userFollowers'] = np.array([tweet.user.followers_count for tweet in tweets])
  28. data['userIdstr'] = np.array([tweet.user.id_str for tweet in tweets])
  29. data['userCreateDate'] = np.array([tweet.user.created_at for tweet in tweets])
  30. data['tweetLan'] = np.array([tweet.lang for tweet in tweets])
  31.  
  32. # Could be interesting getting tweeter place, but almost all tweets doesnt have location
  33. data['tweetCoorLan'] = np.array([tweet.coordinates.coordinates[0] for tweet in tweets])
  34. data['tweetCoordLat'] = np.array([tweet.coordinates.coordinates[1] for tweet in tweets])
  35. data['tweetPlaceId'] = np.array([tweet.place.id for tweet in tweets])
  36. data['tweetPlaceName'] = np.array([tweet.place.name for tweet in tweets])
  37. data['tweetPlaceCountry'] = np.array([tweet.place.country for tweet in tweets])
  38.  
  39. # We add this exception just in case one tweet cant be read, we continue with next one....
  40. # Some tweets have some fields Nullable ( can have null or none value)
  41. except Exception as e:
  42. print("Error dentro de tweet " + str(last_id)) # df.append(data)
  43. print("Error" + e.__str__())
  44.  
  45. return data
  46.  
  47.  
  48. if __name__ == '__main__':
  49.  
  50. consumer_key = "r8y13uUfAduFvj8X4dsEdAMWG"
  51. consumer_secret = "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO"
  52. access_token = "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO"
  53. access_token_secret = "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4"
  54.  
  55. df = pd.DataFrame()
  56.  
  57. # Creating the authentication app object
  58. auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
  59.  
  60. # #Now it’s time to create our API object.
  61. # #APLLICATION-ONLY AUTHENTICATTION.......
  62. # #Requests / window(app auth) segun tabla twitter: https://developer.twitter.com/en/docs/basics/rate-limits
  63. # #Requests / window (app auth) en search es de 450 requests por 15 minutos windows
  64. api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
  65.  
  66. pag = 0
  67. last_id = None
  68. results = True
  69.  
  70. for results in tweepy.Cursor(api.search, q=query, count=100, tweet_mode='extended', max_id=last_id).pages(
  71. max_pages_download):
  72.  
  73. print("\nActual page: " + str(pag))
  74. # dfReaded = None
  75. dfReaded = tweets_to_data_frame(results)
  76. # print (dfReaded) If memory
  77. df = df.append(dfReaded)
  78. print("\nNumber of Tweets downloaded so far ....")
  79. print(df.__len__())
  80.  
  81. if len(results) == 0:
  82. print("There is nothing left to download ....")
  83. break
  84. else:
  85. last_id = results[-1].id - 1
  86. pag = pag + 1
  87.  
  88. print("Tweets to write :")
  89. print(len(df.index))
  90.  
  91. # df = df.set_index(['tweetIdstr'])
  92.  
  93. df.to_json(twitterDataFile, orient='records', lines=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement