Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import tweepy
- import json
- import pandas as pd
- import numpy as np
- from IPython.display import display
- # In this example, the handler is time.sleep(15 * 60),
- # but you can of course handle it in any way you want.
def limit_handled(cursor):
    """Yield items from *cursor*, pausing whenever a rate-limit error occurs.

    Args:
        cursor: Any iterator of results, e.g. the object returned by
            ``tweepy.Cursor(...).pages()``.

    Yields:
        Each item produced by *cursor*, in order.

    The generator finishes normally once *cursor* is exhausted. On
    ``tweepy.RateLimitError`` it sleeps for 15 minutes and then retries.
    On any other ``tweepy.TweepError`` it dumps the module-level ``df``
    frame to disk and stops, instead of retrying in a tight loop.

    Note:
        The error path relies on a module-level ``df`` DataFrame being
        defined by the calling script.
    """
    while True:
        try:
            item = next(cursor)
        except StopIteration:
            # Under PEP 479 a StopIteration that escapes a generator body is
            # converted to RuntimeError, so end the generator explicitly.
            return
        except tweepy.RateLimitError:
            print("Waiting for rate limit.....")
            time.sleep(15 * 60)
        except tweepy.TweepError:
            print("Error no rate limit.....")
            # Save whatever has been collected so far before giving up.
            df.to_json('/media/Samsung/tweet_CNN.json', orient='records', lines=True)
            return
        else:
            yield item
def tweets_to_data_frame(tweets):
    """Convert an iterable of tweet objects into a pandas DataFrame.

    Args:
        tweets: Iterable of objects exposing ``full_text``, ``id``,
            ``created_at``, ``source``, ``favorite_count`` and
            ``retweet_count`` attributes. One-shot iterables (generators,
            iterators) are accepted.

    Returns:
        A DataFrame with one row per tweet and the columns ``Tweets``,
        ``id``, ``len``, ``date``, ``source``, ``likes`` and ``retweets``,
        in that order.
    """
    # Materialise once: the columns below each need a full pass over the
    # tweets, which would exhaust a generator after the first column.
    tweets = list(tweets)
    texts = [tweet.full_text for tweet in tweets]

    data = pd.DataFrame(data=texts, columns=['Tweets'])
    data['id'] = np.array([tweet.id for tweet in tweets])
    data['len'] = np.array([len(text) for text in texts])

    # Remaining columns are plain attribute copies: (column, attribute).
    attribute_columns = (
        ('date', 'created_at'),
        ('source', 'source'),
        ('likes', 'favorite_count'),
        ('retweets', 'retweet_count'),
    )
    for column, attribute in attribute_columns:
        data[column] = np.array([getattr(tweet, attribute) for tweet in tweets])
    return data
- #last_visited = tweets[-1]._json['id'] - 1
- #last_visited = (tweets[-1].id )- 1
- # df.append(data, ignore_index=True)
- # return last_visited
if __name__ == '__main__':
    # Script entry point: download tweets matching a search query page by
    # page, accumulate them in a DataFrame and write them out as JSON lines.
    import os  # local import: only this entry point needs it

    MAX_PAGES = 5000  # upper bound on result pages processed in one run
    OUTPUT_PATH = '/media/Samsung/tweet_CNN.json'

    # Credentials are read from the environment so secrets are never stored
    # in the source file. Application-only authentication needs only the
    # consumer key/secret pair.
    try:
        consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
        consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
    except KeyError as missing:
        raise SystemExit("Missing environment variable: {}".format(missing))

    # Module-level accumulator. limit_handled() dumps this frame to disk when
    # the API fails, so it is kept up to date after every page.
    df = pd.DataFrame()

    # Application-only authentication. According to Twitter's rate-limit
    # table (https://developer.twitter.com/en/docs/basics/rate-limits), the
    # search endpoint allows 450 requests per 15-minute window with app auth.
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    # Create the API object, passing in the auth information.
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # The search term you want to find.
    query = "@CNN"

    # The cursor handles pagination itself, so no explicit max_id is passed.
    pages = tweepy.Cursor(api.search, q=query, count=100, tweet_mode='extended').pages()

    for pag, results in enumerate(limit_handled(pages)):
        if pag >= MAX_PAGES:
            break
        if not results:
            print("There is nothing left to download ....")
            break

        print("Pagina" + str(pag))
        dfReaded = tweets_to_data_frame(results)
        print(dfReaded)

        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        # Like the original append call, row labels are not renumbered.
        df = dfReaded if df.empty else pd.concat([df, dfReaded])

        print("\nahora va el df ....")
        print(len(df))

        # Informational only: the id just below the oldest tweet on this page.
        last_id = results[-1].id - 1
        print("here is not there")
        print(last_id)

    print("Tweets to write :")
    print(len(df.index))

    if df.empty:
        print("No tweets were downloaded; nothing to write.")
    else:
        # Write while ``id`` is still a regular column: orient='records'
        # leaves the index out, so indexing by id first would drop the ids.
        # to_json returns None when given a path, so df is not rebound here.
        df.to_json(OUTPUT_PATH, orient='records', lines=True)
        df = df.set_index(['id'])
        print(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement