Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
import argparse
import logging
from urllib.parse import urljoin, urlparse

import requests
def _extract_submit_id(html):
    """Return the value of the ``submitid`` form field in *html*, or None if absent."""
    try:
        return html.split('name="submitid', 1)[1].split('value="', 1)[1].split('"', 1)[0]
    except IndexError:
        return None


def _url_from_refresh(refresh):
    """Return the URL part of a ``Refresh`` header value, or None if it has none."""
    _, separator, url = str(refresh).partition(';url=')
    return url if separator and url else None


def archive(target_url, archive_url="https://archive.is", proxy=None, agent='archiveis bot'):
    """
    Archive the provided URL using archive.is.

    Args:
        target_url: URL of the page to capture.
        archive_url: Base URL of the archive service to submit to.
        proxy: Optional proxy address; when given, HTTP and HTTPS traffic is
            sent through ``socks5h://<proxy>``.
        agent: Value sent as the ``User-Agent`` header.

    Returns:
        The URL where the capture is stored. It is taken from the ``Refresh``
        header of the submit response, else from its ``Location`` header,
        else from the first ``Location`` header found in the redirect history.

    Raises:
        requests.HTTPError: If either request returns an error status.
        Exception: If no capture URL can be found in the response.
    """
    logging.debug("Archiving %s", target_url)

    # Derive the Host header from archive_url so that a non-default mirror is
    # not addressed with the wrong virtual host.
    headers = {
        'User-Agent': agent,
        'Host': urlparse(archive_url).netloc or 'archive.is',
    }

    # socks5h makes the proxy resolve hostnames as well.
    if proxy:
        proxies = {
            'http': 'socks5h://' + proxy,
            'https': 'socks5h://' + proxy,
        }
    else:
        proxies = None

    # urljoin keeps both URLs well-formed even if archive_url ends with "/".
    home_url = urljoin(archive_url, "/")
    save_url = urljoin(archive_url, "/submit/")

    # Fetch the homepage; its HTML form carries the submit identifier.
    logging.debug("Requesting %s", home_url)
    response = requests.get(
        home_url,
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies,
    )
    response.raise_for_status()

    unique_id = _extract_submit_id(response.text)
    if unique_id is None:
        logging.warning("Unable to extract unique identifier from archive.is. Submitting without it.")
    else:
        logging.debug("Unique identifier: %s", unique_id)

    # Send the capture request, including the identifier when we have one.
    data = {
        "url": target_url,
        "anyway": 1,
    }
    if unique_id:
        data["submitid"] = unique_id

    response = requests.post(
        save_url,
        timeout=120,
        allow_redirects=True,
        headers=headers,
        proxies=proxies,
        data=data,
    )
    response.raise_for_status()

    # Look for the capture URL, first in the final response's headers.
    if 'Refresh' in response.headers:
        memento = _url_from_refresh(response.headers['Refresh'])
        if memento:
            logging.debug("Memento from Refresh header: %s", memento)
            return memento

    if 'Location' in response.headers:
        memento = response.headers['Location']
        logging.debug("Memento from Location header: %s", memento)
        return memento

    # Then in any redirects that were followed on the way.
    logging.debug("Memento not found in response headers. Inspecting history.")
    for i, r in enumerate(response.history):
        logging.debug("Inspecting history request #%s", i)
        logging.debug(r.headers)
        if 'Location' in r.headers:
            memento = r.headers['Location']
            logging.debug("Memento from the Location header of %s history response: %s", i, memento)
            return memento

    # If there's nothing at this point, report everything we got and fail.
    logging.error("No memento returned by archive.is")
    logging.error("Status code: %s", response.status_code)
    logging.error(response.headers)
    logging.error(response.text)
    raise Exception("No memento returned by archive.is")
def main():
    """
    Archive every page of a thread given on the command line.

    The first page is the URL itself (with a trailing slash appended if
    missing); further pages are probed as ``<URL>page-2``, ``<URL>page-3``, ...
    until one does not answer with HTTP 200. Each page that answers 200 is
    submitted through ``archive()`` and the resulting capture URLs are printed.

    Raises:
        ValueError: If the first page does not answer with HTTP 200.
    """
    # Setup argparse
    parser = argparse.ArgumentParser(
        description='Archive every page of a thread using archive.is.')
    parser.add_argument('URL', type=str,
                        help='URL of the thread you want to archive')
    args = parser.parse_args()

    # We need a trailing slash in the URL
    base_url = args.URL
    if not base_url.endswith('/'):
        base_url += '/'

    # Redirects are not followed: a redirect means the page does not exist
    # under this exact URL, so only a plain 200 counts as success.
    response = requests.get(base_url, allow_redirects=False, timeout=120)
    print(str(response.status_code) + ": " + base_url)

    # Check basic response from URL, throw if unexpected result
    if response.status_code != 200:
        raise ValueError('Bad URL', base_url, response.status_code)

    archive_dict = {}  # store our final results here

    # Archive the first page
    archive_result = archive(base_url)
    print(archive_result)
    archive_dict[base_url] = archive_result

    # Loop and archive any additional pages until one is missing
    page_num = 2
    while True:
        request_url = base_url + 'page-' + str(page_num)
        response = requests.get(request_url, allow_redirects=False, timeout=120)
        print(str(response.status_code) + ": " + request_url)
        page_num += 1
        if response.status_code != 200:
            break
        archive_result = archive(request_url)
        archive_dict[request_url] = archive_result
        print(archive_result)

    # Print out final results pretty-like
    for k, v in archive_dict.items():
        print("{} ({})".format(k, v))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement