Advertisement
Guest User

cam_cs

a guest
Aug 24th, 2022
589
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.63 KB | None | 0 0
  1. """A script to scrape all online files from Cambridge's Computer Science course"""
  2.  
  3. import re
  4. import os
  5. import json
  6. import urllib3
  7.  
  8. from bs4 import BeautifulSoup
  9. from typing import Dict, List
  10. from dataclasses import dataclass
  11.  
  12.  
# Root of the 2021-22 teaching pages; all part/topic pages hang off this.
BASE_URL = "https://www.cl.cam.ac.uk/teaching/2122/"
# Per-part index page; the placeholder is filled with a Part.url_name.
PART_URL = BASE_URL + "part{}.html"

# Local directory that mirrors the site's part/topic hierarchy.
BASE_PATH = "./files"
# Link extensions that are web pages rather than files — never downloaded.
EXTS_IGNORE = ("html", "htm", "php")
  18.  
# Shared connection pool.  Follows up to 5 redirects; with
# raise_on_redirect=False, exhausting them returns the last (3xx) response
# instead of raising, so handle_GET treats it as a non-2xx failure.
HTTP_POOL = urllib3.PoolManager(
    retries=urllib3.Retry(redirect=5, raise_on_redirect=False)
)
  22.  
  23. TOPIC_REGEX = r"[A-Z][\w\+\-]+(?=/)"
  24. EXT_URL_REGEX = r"https?://(www\.)?([\w\-\~]+/)+\.[a-zA-Z]{1,4}"
  25. INT_URL_REGEX = r"[\.\w\-/\\]+\.[a-zA-Z]{1,4}"
  26.  
  27.  
  28. @dataclass
  29. class Part:
  30.     """A dataclass defining individual parts"""
  31.     proper_name: str
  32.     url_name: str
  33.  
  34.     def __hash__(self):
  35.         return hash(self.url_name)
  36.  
  37.  
  38. @dataclass
  39. class Topic:
  40.     """A dataclass defining individual topics"""
  41.     name: str
  42.     part: Part
  43.     url: str
  44.  
  45.     def __hash__(self):
  46.         return hash(self.name + self.part.proper_name)
  47.  
  48.  
  49. class File_Item:
  50.     """A class defining individual files"""
  51.  
  52.     def __init__(self, url: str, part: Part, topic: Topic):
  53.         """
  54.        params:
  55.            url (str) The fully-qualified url that points to this file
  56.            part (Part) The 'part' of the syllabus this file is under
  57.            topic (Topic) The 'topic' of the syllabus this file is under
  58.        """
  59.  
  60.         self.url = url
  61.         self.part = part
  62.         self.topic = topic
  63.  
  64.         self.local_dir = f"{BASE_PATH}/{part.proper_name}/{topic.name}"
  65.         self.fname = url[url.rfind('/') + 1 :]
  66.  
  67.         self.full_local_path = self.local_dir + '/' + self.fname
  68.  
  69.  
  70. def handle_GET(url: str, obj: Part | Topic | File_Item) -> urllib3.response.HTTPResponse | None:
  71.     """For handling get requests
  72.        params:
  73.            url (str) The fully-qualified URL to GET
  74.            obj (Part | Topic | File_Item) The object to append to `unsuccessful`,
  75.                    if the request does not work
  76.        returns:
  77.            (urllib3.response.HTTPResponse | None) Either the response (on success) or None
  78.    """
  79.  
  80.     try:
  81.         r = HTTP_POOL.request_encode_url("GET", url)
  82.     except urllib3.exceptions.HTTPError as e:
  83.         print(f"[!] Failed to get {url} ({e})")
  84.         unsuccessful.append(obj)
  85.         return None
  86.     else:
  87.         if r.status // 100 == 2:
  88.             return r
  89.         else:
  90.             print(f"[!] Non-200 status on {url} (code {r.status})")
  91.             unsuccessful.append(obj)
  92.             return None
  93.  
  94.  
  95. def get_links(r: urllib3.response.HTTPResponse) -> List[str]:
  96.     """For retrieving URLs from a HTTP(s) response
  97.        params:
  98.            r (urllib3.response.HTTPResponse) The HTTP(s) response
  99.        returns:
  100.            (List[str]) An list of URLs
  101.    """
  102.  
  103.     soup = BeautifulSoup(r.data, "html.parser")
  104.  
  105.     # Ignore indexes
  106.     if soup.title.string.startswith("Index of"):
  107.         return []
  108.  
  109.     a_tags = soup.find_all("a")
  110.     urls = []
  111.     for a in a_tags:
  112.         if "href" in a.attrs:
  113.             urls.append(a.attrs["href"])
  114.            
  115.     return urls
  116.  
  117.  
# The four published part pages: IA, IB, and the two Part II variants.
PARTS = (
    Part("IA", "1a"),
    Part("IB", "1b"),
    Part("II-50", "2-50"),
    Part("II-75", "2-75")
)
  124.  
# Objects whose HTTP fetch failed; reported at the end of the run.
unsuccessful: List[Part | Topic | File_Item] = []
# Everything scraped: part -> topic -> files found under that topic.
site: Dict[
    Part, Dict[
        Topic, List[File_Item]
    ]
] = {p: {} for p in PARTS}
  131.  
  132. for part in PARTS:
  133.     print(f"\n[*] Onto part {part.proper_name}\n")
  134.  
  135.     # Get 'part' page
  136.     p_url = PART_URL.format(part.url_name)
  137.     part_resp = handle_GET(p_url, part)
  138.     if part_resp is None:
  139.         continue
  140.  
  141.     all_urls = get_links(part_resp)
  142.  
  143.     # Get topic urls in the part
  144.     topics: List[Topic] = []
  145.     for url in all_urls:
  146.         m = re.match(TOPIC_REGEX, url)
  147.         if m is None:
  148.             continue
  149.        
  150.         topics.append(
  151.             Topic(m.group(), part, BASE_URL + m.group())
  152.         )
  153.    
  154.     # Get file urls in topics
  155.     for topic in topics:
  156.         site[part][topic] = []
  157.  
  158.         for loc in ("materials.html", "slides"):
  159.             topic_materials = handle_GET(topic.url + '/' + loc, topic)
  160.             if topic_materials is None:
  161.                 continue
  162.  
  163.             all_urls = get_links(topic_materials)
  164.             for url in all_urls:
  165.                 # Check if external file (and not disallowed extension)
  166.                 if (
  167.                     re.match(EXT_URL_REGEX, url) is not None and
  168.                     not any(map(lambda x: url.endswith(x), EXTS_IGNORE))
  169.                 ):
  170.                     f = File_Item(url, part, topic)
  171.                     site[part][topic].append(f)
  172.  
  173.                     # Get file
  174.                     file_resp = handle_GET(f.url, f)
  175.                     if file_resp is None:
  176.                         continue
  177.  
  178.                     # Save locally
  179.                     os.makedirs(os.path.dirname(f.full_local_path), exist_ok=True)
  180.                     with open(f.full_local_path, "wb") as f:
  181.                         f.write(file_resp.data)
  182.                 # Check if internal file (and not disallowed extension)
  183.                 elif (
  184.                     re.match(INT_URL_REGEX, url) is not None and
  185.                     not any(map(lambda x: url.endswith(x), EXTS_IGNORE))
  186.                 ):
  187.                     f = File_Item(topic.url + '/' + url, part, topic)
  188.                     site[part][topic].append(f)
  189.  
  190.                     # Get file
  191.                     file_resp = handle_GET(f.url, f)
  192.                     if file_resp is None:
  193.                        
  194.                         continue
  195.  
  196.                     # Save locally
  197.                     os.makedirs(os.path.dirname(f.full_local_path), exist_ok=True)
  198.                     with open(f.full_local_path, "wb") as f:
  199.                         f.write(file_resp.data)
  200.  
  201.  
  202. # Output other stuff
  203. with open(BASE_PATH + "/site.json", "w") as f:
  204.     __site = {}
  205.     # Broken
  206.     quit()
  207.     for part in site.keys():
  208.         _p = part.proper_name
  209.         for topic in site[part].keys():
  210.             _t = topic.name
  211.             for file in site[part]:
  212.                 __site[_p][_t][file.name] = vars(file)
  213.  
  214.     json.dump(__site, f)
  215.  
  216. with open(BASE_PATH + "/unsuccessful.txt", "w") as f:
  217.     for i in unsuccessful:
  218.         if type(i) == Part:
  219.             f.write(
  220.                 f"Part {i.proper_name}\n"
  221.             )
  222.         elif type(i) == Topic:
  223.             f.write(
  224.                 f"Topic {i.name} (in part {i.part.proper_name})\n"
  225.             )
  226.         else:
  227.             f.write(
  228.                 f"File {i.fname} (@{i.url}; in part {i.part.proper_name}, topic {i.topic.name})\n"
  229.             )
  230.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement