Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.22 KB | None | 0 0
  1. import time
  2.  
  3. import tweepy
  4. import json
  5. import pandas as pd
  6. import numpy as np
  7. from IPython.display import display
  8.  
  9. # In this example, the handler is time.sleep(15 * 60),
  10. # but you can of course handle it in any way you want.
  11.  
  12. def limit_handled(cursor):
  13. while True:
  14. try:
  15. yield cursor.next()
  16. except tweepy.RateLimitError:
  17. print("Waiting for rate limit.....")
  18. time.sleep(15 * 60)
  19. except tweepy.TweepError :
  20. print("Error no rate limit.....")
  21. df.to_json('/media/Samsung/tweet_CNN.json', orient='records', lines=True)
  22.  
  23.  
  24. def tweets_to_data_frame(tweets):
  25.  
  26. data = pd.DataFrame(data=[tweet.full_text for tweet in tweets], columns=['Tweets'])
  27.  
  28. #display(data.head(10))
  29.  
  30. data['id'] = np.array([tweet.id for tweet in tweets])
  31. data['len'] = np.array([len(tweet.full_text) for tweet in tweets])
  32. data['date'] = np.array([tweet.created_at for tweet in tweets])
  33. data['source'] = np.array([tweet.source for tweet in tweets])
  34. data['likes'] = np.array([tweet.favorite_count for tweet in tweets])
  35. data['retweets'] = np.array([tweet.retweet_count for tweet in tweets])
  36. #df.append(data)
  37.  
  38. return data
  39. #last_visited = tweets[-1]._json['id'] - 1
  40. #last_visited = (tweets[-1].id )- 1
  41. # df.append(data, ignore_index=True)
  42. # return last_visited
  43.  
  44. if __name__ == '__main__':
  45.  
  46. consumer_key = "r8y13uUfAduFvj8X4dsEdAMWG"
  47. consumer_secret = "gBAaLFgxTXXnLlnqAmPJKpD22Cg3L0I6tW9yy0kdY34pv8cEZO"
  48. access_token = "1150328765597310976-Sww8Yua53EmGOnkr09cPkUylS1hSuO"
  49. access_token_secret = "OIIurB22pKTax4FYbs9QNRuXTLFhasSLfqI0Zd4lkjkm4"
  50.  
  51. df = pd.DataFrame()
  52.  
  53.  
  54. #Now it’s time to create our API object.
  55. #APLLICATION-ONLY AUTHENTICATTION.......
  56. #Requests / window(app auth) segun tabla twitter: https://developer.twitter.com/en/docs/basics/rate-limits
  57. #Requests / window (app auth) en search es de 450 requests por 15 minutos windows
  58.  
  59. # Creating the authentication app object
  60. auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
  61.  
  62.  
  63. # Creating the API object while passing in auth information
  64. api = tweepy.API(auth, wait_on_rate_limit=True)
  65. result= []
  66.  
  67. #//OPTION 1 : WITH JSON ELEMENTS
  68.  
  69. #with open('tweet.json', 'a', encoding='utf8') as file:
  70. # foreach through all tweets pulled
  71. # for tweet in results:
  72. # printing the text stored inside the tweet object
  73. # print (tweet.text)
  74. # result.append({
  75. # 'text': tweet.text,
  76. # 'author_name': tweet.user.screen_name
  77. # })
  78. # json.dump(tweet._json,file,sort_keys = True,indent = 4)
  79.  
  80. #The search term you want to find
  81. query = "@CNN"
  82. # Language code (follows ISO 639-1 standards)
  83. #language = "en"
  84. pag=0
  85. total_downloaded_twitters = 0
  86. last_id = None
  87. results = True
  88. for results in limit_handled(tweepy.Cursor(api.search, q=query, count=100, tweet_mode='extended', max_id=last_id).pages()):
  89. if pag >= 5000:
  90. break
  91. print("Pagina" + str(pag))
  92. #results = api.search(q=query, count=100, tweet_mode='extended', max_id=last_id)
  93. # total_downloaded_twitters = total_downloaded_twitters + results. __length_hint__
  94. # print("...%s tweets downloaded so far" + str(total_downloaded_twitters))
  95. dfReaded = tweets_to_data_frame ( results)
  96. print (dfReaded)
  97. df = df.append(dfReaded)
  98. #last_id = results[-1]._json['id'-1]
  99. print("\nahora va el df ....")
  100. print(df.__len__())
  101. if (len(results) == 0):
  102. print("There is nothing left to download ....")
  103. break
  104. else:
  105. last_id = results[-1].id - 1
  106.  
  107. print("here is not there")
  108. print (last_id)
  109. pag = pag+ 1
  110.  
  111. print("Tweets to write :")
  112. print ( len(df.index))
  113.  
  114. df = df.set_index(['id'])
  115.  
  116. df = df.to_json('/media/Samsung/tweet_CNN.json',orient ='records',lines= True)
  117.  
  118. #del df
  119.  
  120. #df = pd.read_json('tweetCNN.json', orient='records',lines= True)
  121.  
  122. print(df)
  123. # with open('tweetData.json', 'w', encoding='utf8') as file:
  124.  
  125. # json.dumps(df,file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement