Xenforo Archiver
ReeEra, Aug 25th, 2018
#!/usr/bin/env python3

import argparse
import requests
import logging
from urllib.parse import urljoin

def archive(target_url, archive_url="https://archive.is", proxy=None, agent='archiveis bot'):
    """
    Archives the provided URL using archive.is

    Returns the URL where the capture is stored.
    """
    logging.debug("Archiving {}".format(target_url))

    # Configure the request headers
    headers = {
        'User-Agent': agent,
        'Host': 'archive.is',
    }
    # Set up the SOCKS5 proxy, if one was given
    if proxy:
        proxies = {
            'http': 'socks5h://' + proxy,
            'https': 'socks5h://' + proxy
        }
    else:
        proxies = None

    # Put together the URL that will save our request
    save_url = urljoin(archive_url, "/submit/")

    # Request a unique identifier for our activity
    logging.debug("Requesting {}".format(archive_url + "/"))
    response = requests.get(
        archive_url + "/",
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies
    )
    response.raise_for_status()

    # The identifier needs to be parsed out of the homepage HTML
    html = response.text
    try:
        unique_id = html.split('name="submitid', 1)[1].split('value="', 1)[1].split('"', 1)[0]
        logging.debug("Unique identifier: {}".format(unique_id))
    except IndexError:
        logging.warning("Unable to extract unique identifier from archive.is. Submitting without it.")
        unique_id = None

    # Send the capture request to archive.is with the unique id included
    data = {
        "url": target_url,
        "anyway": 1,
    }
    if unique_id:
        data.update({"submitid": unique_id})

    # logging.debug("Requesting {}".format(save_url))
    response = requests.post(
        save_url,
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies,
        data=data
    )
    response.raise_for_status()

    # Parse the memento URL out of the response headers
    if 'Refresh' in response.headers:
        memento = str(response.headers['Refresh']).split(';url=')[1]
        logging.debug("Memento from Refresh header: {}".format(memento))
        return memento
    if 'Location' in response.headers:
        memento = response.headers['Location']
        logging.debug("Memento from Location header: {}".format(memento))
        return memento
    logging.debug("Memento not found in response headers. Inspecting history.")
    for i, r in enumerate(response.history):
        logging.debug("Inspecting history request #{}".format(i))
        logging.debug(r.headers)
        if 'Location' in r.headers:
            memento = r.headers['Location']
            logging.debug("Memento from the Location header of history response #{}: {}".format(i + 1, memento))
            return memento
    # If there's nothing at this point, throw an error
    logging.error("No memento returned by archive.is")
    logging.error("Status code: {}".format(response.status_code))
    logging.error(response.headers)
    logging.error(response.text)
    raise Exception("No memento returned by archive.is")
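# Example (not executed here): archive() can also be called directly, e.g. to
# route a single capture through a local SOCKS5 proxy. This is only a sketch;
# the thread URL and proxy address below are hypothetical placeholders, and
# SOCKS support requires the requests[socks] extra to be installed.
#
#   memento = archive(
#       "https://forum.example.com/threads/example-thread.123/",
#       proxy="127.0.0.1:9050",
#   )
#   print(memento)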
# Set up argparse
parser = argparse.ArgumentParser(description='Archive a XenForo thread (and all of its pages) to archive.is.')
parser.add_argument('URL', type=str,
                    help='URL of the thread you want to archive')
args = parser.parse_args()

# We need a trailing slash on the URL
base_url = args.URL
if not base_url.endswith('/'):
    base_url += '/'

# Begin archiving
response = requests.get(base_url, allow_redirects=False)
print(str(response.status_code) + ": " + base_url)

# Check the basic response from the URL, throw if we get an unexpected result
if response.status_code == 200:
    archive_dict = {}  # store our final results here

    # Archive the first page
    archive_result = archive(base_url)
    print(archive_result)
    archive_dict[base_url] = archive_result

    # Loop and archive any additional pages
    page_num = 2
    while True:
        request_url = base_url + 'page-' + str(page_num)
        response = requests.get(request_url, allow_redirects=False)
        print(str(response.status_code) + ": " + request_url)
        page_num += 1
        if response.status_code == 200:
            archive_result = archive(request_url)
            archive_dict[request_url] = archive_result
            print(archive_result)
        else:
            break

    # Print out the final results, pretty-like
    for k, v in archive_dict.items():
        print("{} ({})".format(k, v))
else:
    raise ValueError('Bad URL', base_url, response.status_code)
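# Usage sketch (assuming this paste is saved as xenforo_archiver.py; the
# thread URL below is a hypothetical placeholder):
#
#   python3 xenforo_archiver.py "https://forum.example.com/threads/example-thread.123/"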