Xenforo Archiver
ReeEra, Aug 25th, 2018
#!/usr/bin/env python3

import argparse
import requests
import logging
from urllib.parse import urljoin

def archive(target_url, archive_url="https://archive.is", proxy=None, agent='archiveis bot'):
    """
    Archives the provided URL using archive.is

    Returns the URL where the capture is stored.
    """
    logging.debug("Archiving {}".format(target_url))

    # Configure the request headers
    headers = {
        'User-Agent': agent,
        'Host': 'archive.is',
    }
    # Set up the SOCKS5 proxy, if one was given
    if proxy:
        proxies = {
            'http': 'socks5h://' + proxy,
            'https': 'socks5h://' + proxy
        }
    else:
        proxies = None

    # Put together the URL that will save our request
    save_url = urljoin(archive_url, "/submit/")

    # Request a unique identifier for our activity
    logging.debug("Requesting {}".format(archive_url + "/"))
    response = requests.get(
        archive_url + "/",
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies
    )
    response.raise_for_status()

    # The identifier needs to be parsed out of the homepage HTML
    html = response.text
    try:
        unique_id = html.split('name="submitid', 1)[1].split('value="', 1)[1].split('"', 1)[0]
        logging.debug("Unique identifier: {}".format(unique_id))
    except IndexError:
        logging.warning("Unable to extract unique identifier from archive.is. Submitting without it.")
        unique_id = None

    # Send the capture request to archive.is with the unique id included
    data = {
        "url": target_url,
        "anyway": 1,
    }
    if unique_id:
        data.update({"submitid": unique_id})

    # logging.debug("Requesting {}".format(save_url))
    response = requests.post(
        save_url,
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies,
        data=data
    )
    response.raise_for_status()

    # Parse the memento URL out of the response headers
    if 'Refresh' in response.headers:
        memento = str(response.headers['Refresh']).split(';url=')[1]
        logging.debug("Memento from Refresh header: {}".format(memento))
        return memento
    if 'Location' in response.headers:
        memento = response.headers['Location']
        logging.debug("Memento from Location header: {}".format(memento))
        return memento
    logging.debug("Memento not found in response headers. Inspecting history.")
    for i, r in enumerate(response.history):
        logging.debug("Inspecting history request #{}".format(i))
        logging.debug(r.headers)
        if 'Location' in r.headers:
            memento = r.headers['Location']
            logging.debug("Memento from the Location header of history response #{}: {}".format(i + 1, memento))
            return memento
    # If there's nothing at this point, throw an error
    logging.error("No memento returned by archive.is")
    logging.error("Status code: {}".format(response.status_code))
    logging.error(response.headers)
    logging.error(response.text)
    raise Exception("No memento returned by archive.is")
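# Example (not executed here): archive() can also be called directly, e.g. to
# route a single capture through a local SOCKS5 proxy. This is only a sketch;
# the thread URL and proxy address below are hypothetical placeholders, and
# SOCKS support requires the requests[socks] extra to be installed.
#
#   memento = archive(
#       "https://forum.example.com/threads/example-thread.123/",
#       proxy="127.0.0.1:9050",
#   )
#   print(memento)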
# Set up argparse
parser = argparse.ArgumentParser(description='Archive a XenForo thread (and all of its pages) to archive.is.')
parser.add_argument('URL', type=str,
                    help='URL of the thread you want to archive')
args = parser.parse_args()

# We need a trailing slash on the URL
base_url = args.URL
if not base_url.endswith('/'):
    base_url += '/'

# Begin archiving
response = requests.get(base_url, allow_redirects=False)
print(str(response.status_code) + ": " + base_url)

# Check the basic response from the URL, throw if we get an unexpected result
if response.status_code == 200:
    archive_dict = {}  # store our final results here

    # Archive the first page
    archive_result = archive(base_url)
    print(archive_result)
    archive_dict[base_url] = archive_result

    # Loop and archive any additional pages
    page_num = 2
    while True:
        request_url = base_url + 'page-' + str(page_num)
        response = requests.get(request_url, allow_redirects=False)
        print(str(response.status_code) + ": " + request_url)
        page_num += 1
        if response.status_code == 200:
            archive_result = archive(request_url)
            archive_dict[request_url] = archive_result
            print(archive_result)
        else:
            break

    # Print out the final results, pretty-like
    for k, v in archive_dict.items():
        print("{} ({})".format(k, v))
else:
    raise ValueError('Bad URL', base_url, response.status_code)
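# Usage sketch (assuming this paste is saved as xenforo_archiver.py; the
# thread URL below is a hypothetical placeholder):
#
#   python3 xenforo_archiver.py "https://forum.example.com/threads/example-thread.123/"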