napiii

twitterscraper-query

Jul 16th, 2018
from __future__ import division
import random
import requests
import datetime as dt
import json
from functools import partial
from multiprocessing.pool import Pool

from twitterscraper.tweet import Tweet
from twitterscraper.logging import logger

HEADERS_LIST = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
                "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13",
                "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
                "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
                "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
                "Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre"]

HEADER = {'User-Agent': random.choice(HEADERS_LIST)}

INIT_URL = "https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}"
RELOAD_URL = "https://twitter.com/i/search/timeline?f=tweets&vertical=" \
             "default&include_available_features=1&include_entities=1&" \
             "reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}"
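# Illustration (not part of the original paste): INIT_URL serves the first
# page of results; RELOAD_URL fetches subsequent pages, with {pos} filled in
# by the pagination cursor returned from the previous page. For example:
#
#   INIT_URL.format(q='python', lang='en')
#   # -> 'https://twitter.com/search?f=tweets&vertical=default&q=python&l=en'
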
def linspace(start, stop, n):
    if n == 1:
        yield stop
        return
    h = (stop - start) / (n - 1)
    for i in range(n):
        yield start + h * i

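# Example (illustrative): linspace() is a generator equivalent of numpy's
# linspace and is used by query_tweets() below to cut a date window into
# equal steps:
#
#   >>> list(linspace(0, 10, 5))
#   [0.0, 2.5, 5.0, 7.5, 10.0]
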
def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from.
    :param html_response: True if the response is plain HTML, False if the
                          response is JSON with the HTML embedded in it.
    :param retry: Number of retries if something goes wrong.
    :return: A tuple of (list of tweets, pos value for requesting the next page).
    """
    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception('Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, json_resp['min_position']

        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('Timeout {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logger.error("Giving up.")
    return [], None

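# Usage sketch (illustrative, not from the original paste). The first page is
# plain HTML; subsequent pages come back as JSON with the HTML embedded, so
# html_response must be False for them. Assumes Twitter's legacy search
# endpoints still respond, which is no longer guaranteed:
#
#   tweets, pos = query_single_page(INIT_URL.format(q='python', lang='en'))
#   more, pos = query_single_page(
#       RELOAD_URL.format(q='python', pos=pos, lang='en'), html_response=False)
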
def query_tweets_once(query, limit=None, lang=''):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might suddenly stop serving new pages; in
    that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param lang:  Language filter passed to the search endpoint (empty string
                  for no filter).
    :return:      A list of twitterscraper.Tweet objects. If ``limit`` is set,
                  at least that many tweets are returned, provided the query
                  yields enough results.
    """
    logger.info("Querying {}".format(query))
    query = query.replace(' ', '%20').replace("#", "%23").replace(":", "%3A")
    pos = None
    tweets = []
    try:
        while True:
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None
                else RELOAD_URL.format(q=query, pos=pos, lang=lang),
                pos is None
            )
            if len(new_tweets) == 0:
                logger.info("Got {} tweets for {}.".format(
                    len(tweets), query))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets for {}.".format(
                    len(tweets), query))
                return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    logger.info("Got {} tweets for {}.".format(
        len(tweets), query))
    return tweets

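# Example call (illustrative): fetch at least 100 German-language tweets for
# a single query, without the date-range parallelisation of query_tweets().
# Tweet attribute names below are assumed from the twitterscraper package:
#
#   tweets = query_tweets_once('#python', limit=100, lang='de')
#   for t in tweets:
#       print(t.timestamp, t.text)
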
def eliminate_duplicates(iterable):
    """
    Yields the unique elements of an iterable in sorted order. Elements are
    considered non-unique if the equality comparison to another element is
    true. (In those cases, a plain set() conversion isn't sufficient, as it
    relies on hashing rather than the equality comparison alone.)
    """
    class NoElement: pass

    prev_elem = NoElement
    for elem in sorted(iterable):
        if prev_elem is NoElement:
            prev_elem = elem
            yield elem
            continue

        if prev_elem != elem:
            prev_elem = elem
            yield elem

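# Example (illustrative): duplicates are detected via __eq__ on the sorted
# sequence, so elements only need to be sortable, not hashable:
#
#   >>> list(eliminate_duplicates([3, 1, 2, 3, 1]))
#   [1, 2, 3]
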
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if no_days < 1:
        # Guard against an empty or inverted date window, which would
        # otherwise create a Pool of size zero below.
        raise ValueError("begindate must lie before enddate")
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days

    # Split the overall window into `poolsize` equal date ranges. Adjacent
    # ranges share a boundary date, so duplicate tweets are possible.
    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize + 1)]
    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    # Create the pool before entering the try block, so that `pool` is
    # guaranteed to exist when the finally clause runs.
    pool = Pool(poolsize)
    try:
        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info("Got {} tweets ({} new).".format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info("Program interrupted by user. Returning all tweets "
                        "gathered so far.")
    finally:
        pool.close()
        pool.join()
    return all_tweets
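
# Minimal demo (not part of the original paste): scrape a hashtag over a
# two-week window. The __main__ guard matters because query_tweets() spawns a
# multiprocessing Pool. This assumes the scraped twitter.com endpoints still
# answer, which may no longer be the case.
if __name__ == '__main__':
    tweets = query_tweets('#python', limit=50,
                          begindate=dt.date(2018, 7, 1),
                          enddate=dt.date(2018, 7, 14),
                          poolsize=5, lang='en')
    print('Fetched {} tweets in total.'.format(len(tweets)))
    for tweet in tweets[:10]:
        print(tweet.id, tweet.text)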