import os

from requests import get
from lxml.html.soupparser import fromstring

# URLs already queued for scraping, plus the work queue of pages still to fetch
# (a dict with True values is used as an insertion-ordered set).
pages_scraped = set()
pages_to_scrape = {
    "https://api.parliament.uk/historic-hansard/sittings/1877/jul/index.html": True
}
def get_text_and_links(url):
    """Fetch a page, returning its visible text and the resolved links it contains."""
    html = get(url).text
    tree = fromstring(html)
    links = [
        resolved
        for link in tree.xpath("//a/@href")
        if (resolved := resolve_link(link, url)) is not None
    ]
    # Drop <script> elements so their contents don't end up in the extracted text.
    for script in tree.xpath("//script"):
        script.getparent().remove(script)
    return tree.text_content(), links
def resolve_link(link_url, page_url):
    # Thin logging wrapper around _resolve_link.
    result = _resolve_link(link_url, page_url)
    print("Resolved", link_url, "to", result, "(context:", page_url, ")")
    return result
def _resolve_link(link_url, page_url):
    # Skip mail links entirely.
    if link_url.startswith("mailto:"):
        return None
    # Ignore fragments.
    link_url = link_url.split("#")[0]
    # Treat absolute api.parliament.uk links as site-relative paths.
    if link_url.startswith("https://api.parliament.uk/"):
        link_url = link_url.removeprefix("https://api.parliament.uk")
    # Anything still containing "//" points off-site; skip it.
    if "//" in link_url:
        return None
    # Directory links map to their index page.
    if link_url.endswith("/"):
        link_url = link_url + "index.html"
    # Relative links are resolved against the current page's directory.
    if not link_url.startswith("/"):
        link_url = (
            page_url.removeprefix("https://api.parliament.uk").rsplit("/", 1)[0]
            + "/"
            + link_url
        )
    # Collapse empty segments and ".." path components.
    link_bits = link_url.split("/")
    resolved_link_bits = []
    for link_bit in link_bits:
        if not link_bit:
            continue
        if link_bit == "..":
            resolved_link_bits.pop()
        else:
            resolved_link_bits.append(link_bit)
    return "https://api.parliament.uk/" + "/".join(resolved_link_bits)
def scrape(page_url):
    print("Scraping", page_url)
    text, links = get_text_and_links(page_url)
    # Mirror the URL structure under a local directory.
    file_path = page_url.replace("https://api.parliament.uk/", "/home/mark/hansard/")
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        f.write(text)
    # Queue links from July 1877 (excluding the Lords) that we haven't seen before.
    for link in links:
        if (
            "/1877/jul/" in link
            and "/lords/1877/jul/" not in link
            and link not in pages_scraped
            and link not in pages_to_scrape
        ):
            print("Adding", link, "to list of pages to scrape")
            pages_to_scrape[link] = True
            pages_scraped.add(link)
    del pages_to_scrape[page_url]
# Keep scraping until the work queue is empty.
while pages_to_scrape:
    print(len(pages_to_scrape), "pages left to scrape")
    scrape(next(iter(pages_to_scrape)))
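# Note on running this: lxml.html.soupparser parses via BeautifulSoup, so
# beautifulsoup4 must be installed alongside requests and lxml (e.g.
# "pip install requests lxml beautifulsoup4"), and the walrus operator plus
# str.removeprefix used above require Python 3.9 or newer.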