Advertisement
Guest User

Untitled

a guest
Feb 6th, 2024
45
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.48 KB | None | 0 0
  1. import os
  2. from requests import get
  3. from lxml.html.soupparser import fromstring
  4.  
  5. pages_scraped = set()
  6. pages_to_scrape = {
  7.     "https://api.parliament.uk/historic-hansard/sittings/1877/jul/index.html": True
  8. }
  9.  
  10.  
  11. def get_text_and_links(url):
  12.     html = get(url).text
  13.     tree = fromstring(html)
  14.     links = [
  15.         resolved
  16.         for link in tree.xpath("//a/@href")
  17.         if (resolved := resolve_link(link, url)) is not None
  18.     ]
  19.     for script in tree.xpath("//script"):
  20.         script.getparent().remove(script)
  21.     return tree.text_content(), links
  22.  
  23.  
  24. def resolve_link(link_url, page_url):
  25.     result = _resolve_link(link_url, page_url)
  26.     print("Resolved", link_url, "to", result, "(context:", page_url, ")")
  27.     return result
  28.  
  29.  
  30. def _resolve_link(link_url, page_url):
  31.     if link_url.startswith("mailto:"):
  32.         return None
  33.  
  34.     link_url = link_url.split("#")[0]
  35.  
  36.     if link_url.startswith("https://api.parliament.uk/"):
  37.         link_url = link_url.removeprefix("https://api.parliament.uk")
  38.  
  39.     if "//" in link_url:
  40.         return None
  41.  
  42.     if link_url.endswith("/"):
  43.         link_url = link_url + "index.html"
  44.  
  45.     if not link_url.startswith("/"):
  46.         link_url = (
  47.             page_url.removeprefix("https://api.parliament.uk").rsplit("/", 1)[0]
  48.             + "/"
  49.             + link_url
  50.         )
  51.  
  52.     link_bits = link_url.split("/")
  53.     resolved_link_bits = []
  54.     for link_bit in link_bits:
  55.         if not link_bit:
  56.             continue
  57.         if link_bit == "..":
  58.             resolved_link_bits.pop()
  59.         else:
  60.             resolved_link_bits.append(link_bit)
  61.  
  62.     return "https://api.parliament.uk/" + "/".join(resolved_link_bits)
  63.  
  64.  
  65. def scrape(page_url):
  66.     print("Scraping", page_url)
  67.  
  68.     text, links = get_text_and_links(page_url)
  69.     file_path = page_url.replace("https://api.parliament.uk/", "/home/mark/hansard/")
  70.     os.makedirs(os.path.dirname(file_path), exist_ok=True)
  71.     open(file_path, "w").write(text)
  72.     for link in links:
  73.         if (
  74.             "/1877/jul/" in link
  75.             and "/lords/1877/jul/" not in link
  76.             and link not in pages_scraped
  77.             and link not in pages_to_scrape
  78.         ):
  79.             print("Adding", link, "to list of pages to scrape")
  80.             pages_to_scrape[link] = True
  81.             pages_scraped.add(link)
  82.     del pages_to_scrape[page_url]
  83.  
  84.  
  85. while pages_to_scrape:
  86.     print(len(pages_to_scrape), "pages left to scrape")
  87.     scrape(next(iter(pages_to_scrape)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement