Mili-NT

Untitled

Oct 17th, 2021
import os
import random
import logging
import requests
from bs4 import BeautifulSoup
#
# CLASSES
#
class item:
    """
    So I threw this class in here to make my life a bit easier. Each page on the website is really just a table, with
    rows and columns. Each 'row' represents a file we need to download or a folder we need to go deeper into. This class
    works as a template: I feed it a block of raw html, it sorts and acts on the data, and it returns an instance of the
    item class.
    """
    def __init__(self, cell):
        self.cell = cell # cell is the html of the item
        self.itemtype = None # itemtype is either "file" or "dir"
        self.link = None # link is part of the URL that points to where the item is
        self.filename = None # Same thing as link but without the slashes
        """
        You'll notice that I set all these to None initially, instead of doing something like:

        def __init__(self, cell, itemtype, link, filename):
            self.cell = cell
            self.itemtype = itemtype
            self.link = link
            self.filename = filename

        This is because to do it that way, I'd have to filter all of those things out before initializing the class
        every single time, and that's ugly. By feeding it a raw block of data, then having a method sort it out, the class
        sort of becomes a magic box that I don't have to worry about. One block of data in, some magic happens in the class, one item out.
        """
        self.parse_item_html()

    def parse_item_html(self):
        """
        This method actually sorts out the block of html (self.cell) into the class attributes (self.itemtype, self.link, self.filename)
        Note that it doesn't return anything, but rather just changes what the attributes are assigned to (instead of None)

        About this first one... it's a kinda quirky way to turn a list with 1 element into just that element. In our case,
        self.cell.attrs["class"] is always gonna be either ["dir"] or ["file"]. This just turns ["dir"] -> "dir" and
        ["file"] -> "file". You could also just do self.itemtype = self.cell.attrs["class"][0]
        """
        (self.itemtype,) = self.cell.attrs["class"]
        link = str(self.cell.find(href=True)["href"]) # We find the link based on its href tag. Note this is a local variable, it goes away as soon as this method finishes.
        self.link = link[1:] if link.startswith('.') else link # If it starts with a dot, we remove that. self.link is not a local variable, it's here to stay
        self.filename = self.link.replace("/", "")
    def display(self):
        """
        This method is just used for debugging. It puts all the class attributes into a dict with their values and
        prints them. Nothing special here.
        """
        itemdict = {"filename":self.filename, "item type":self.itemtype, "link":self.link}
        for k,v in itemdict.items():
            print(f"{k} -> {v}")
        print("\n")
#
# LOGGING
#
"""
Logging is really useful for keeping track of what's going on in your program: https://docs.python.org/3/howto/logging.html

Here I just made a really basic logging function that logs to crawler.log, here's an example from the log:

17-Oct-21 21:58:50 - (DEBUG) => https://cdn.preterhuman.net:443 "GET /texts/ HTTP/1.1" 200 None

This just tells us a request was performed to that address and it came back ok (200). This can also log error tracebacks,
which is nice
"""
logging.basicConfig(filename='crawler.log',
                    filemode='w',
                    format='%(asctime)s - (%(levelname)s) => %(message)s\n',
                    datefmt='%d-%b-%y %H:%M:%S',
                    level=logging.NOTSET)
def log(message, level='error'):
    """
    :param message: The message to log
    :param level: the severity level to log as. This can be a string ('error', 'debug') or an int (40, 10). Note that this
    is a named parameter! It defaults to error unless specifically provided something else. This is useful because instead
    of having to specify each time, you can just say log('wuff') instead of log('wuff', 'error'). If you did need to override
    it, it would simply be log('wuff', level='critical')
    :return: Nothing, writes to log file

    The levels and their numeric values:
    CRITICAL -> 50
    ERROR -> 40
    WARNING -> 30
    INFO -> 20
    DEBUG -> 10
    NOTSET -> 0
    """
    level_map = {"critical":50,"error":40,"warning":30,"info":20,"debug":10,"notset":0}
    # This all() call returns true if the level isn't in the keys or values of the dict, i.e. it's not a level we recognize
    if all([level not in level_map.keys(), level not in level_map.values()]):
        logging.log(30, f"Incorrect level passed to logging function: {level}")
        logging.log(20, "Defaulting to level 40...")
        logging.log(40, message)
        return # Bail out here so an unknown string level doesn't cause a KeyError below
    if isinstance(level, str):
        numeric_level = level_map[level]
        logging.log(numeric_level, message)
    elif isinstance(level, int):
        logging.log(level, message)
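# A few hypothetical example calls, just to show how the level parameter behaves
# (these aren't run anywhere; they're only here for illustration):
# log("something broke")                 # defaults to 'error', so this is logged at 40
# log("just checking in", level='debug') # string levels get looked up in level_map -> 10
# log("halfway done", 20)                # ints are passed straight through to logging.log()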
#
# FUNCTIONS
#
def url_handler(*parameters):
    """
    :param parameters: Any number of URL components to piece together. The asterisk means you can pass as many parameters
    as you want, and they all get collected into one sequence
    :return: A valid URL with a protocol and no repeated slashes

    Description: This function takes any number of URL components and combines them into one full URL with no repeated slashes.
    Examples:
    url_handler("https://www.google.com/", "/example.txt") would return "https://www.google.com/example.txt"
    url_handler("http://wuff.org", "/exampleone//", "exampletwo", "wuff.exe") would return "http://wuff.org/exampleone/exampletwo/wuff.exe"
    """
    components = []
    protocol = "" # Start with an empty protocol so the return line still works if no parameter contains one
    # First we loop through all of our passed in parameters
    for parameter in parameters:
        # if :// is in the parameter, we know it contains the protocol (HTTP/HTTPS), so we need to isolate that
        if "://" in parameter:
            # We split the parameter into two parts using the :// characters
            protocol_split = parameter.split("://") # This would look like ["http", "www.google.com"]
            protocol = f"{protocol_split[0]}://" # We add back the :// to the protocol
            component = protocol_split[1] # and assign the rest to another variable
            """
            This next line contains a ternary. It looks weird, but it's nothing more than an inline if/else block.
            It is the same thing as this:

            if component.endswith('/'):
                components.append(component[:-1])
            else:
                components.append(component)

            Here we are adding component to the components list, but we have to remove any slashes at the end or it will
            cause duplicates in the output. To do this, we can use slicing notation. Slicing notation is a little hard to
            explain, but just know that [1:] removes the first character of a string/list and [:-1] removes the last.

            https://stackoverflow.com/questions/509211/understanding-slice-notation
            """
            components.append(component[:-1] if component.endswith("/") else component)
        else:
            # For all the pieces that don't have the protocol, we basically just have to remove any leading or trailing slashes
            if all([parameter.startswith('/'), parameter.endswith('/')]): # all() takes a list of conditions and if all are true, it returns true. So in this case if it starts AND ends with a slash.
                components.append(parameter.strip('/')) # strip('/') removes every slash from both ends, so "/exampleone//" becomes "exampleone"
            elif parameter.startswith("/"):
                components.append(parameter[1:]) # [1:] slices the first character off
            elif parameter.endswith("/"):
                components.append(parameter[:-1]) # [:-1] slices the last character off
            else:
                components.append(parameter) # No slashes on either end, so the piece can go in as-is
    # We finally combine the protocol and use join to combine everything else, separated by a single slash
    # join example: ','.join(['a', 'b', 'c']) --> 'a,b,c'
    return f"{protocol}{'/'.join(components)}"
def random_headers():
    """
    :return: A dictionary containing header information with a random user-agent to send along with our HTTP request

    Description: This function picks a random User-Agent from a list and returns it as part of the headers for the HTTP request.
    A User-Agent is a string that identifies what device and browser the request was made from. This is important because
    sites often use the User-Agent to identify bots, so by rotating it we make that much less likely to happen.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
    ]
    # After initializing the list of User-Agents, we can use random.choice() to pick a random one from the list
    return {'User-Agent': random.choice(user_agents), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
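# For example, a single call might return something like this (the User-Agent is picked at
# random each time, the Accept header is always the same):
# {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}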
def recursive_crawl():
    """
    So our main problem with this kind of work is that we aren't just given a list of files. If you just have a list of
    files, you can simply go down the list and get them all. We are given directories, which means we have to go down the
    list and go into each of those and get THEIR files and directories, and so on.

    It's literally the ouroboros!

    We feed crawl_current_level() the starting URL (the main folder) and the path to save to
    1. It connects to the page and grabs all the links from the table
    2. It turns them into our item class and then puts the files into one list and the dirs into another
    3. It downloads any files to the filepath for that level
    4. Now for each directory, it takes the URL and makes a new filepath for it
    5. That new URL and new filepath is then fed BACK into crawl_current_level() until there are no more dirs left.
    """
    def crawl_current_level(current_level_url, current_level_filepath):
        # Fetching the page
        page = requests.get(current_level_url, headers=random_headers())
        soup = BeautifulSoup(page.text, "html.parser")
        # Making all of the cells into items (skipping the first two rows of the table)
        items = [item(cell) for cell in soup.find_all("tr")[2:]]
        # Sorting those into dirs and files
        dirs = [x for x in items if x.itemtype == "dir"]
        files = [x for x in items if x.itemtype == "file"]

        for file_item in files:
            try:
                file_url = url_handler(current_level_url, file_item.link)
                file = requests.get(file_url, headers=random_headers())
                with open(f"{current_level_filepath}\\{file_item.filename}", "wb") as f:
                    f.write(file.content)
                log(f"{file_url} downloaded.", 20)
            except Exception as e:
                log(e, 40)
        for dir_item in dirs:
            new_filepath = f"{current_level_filepath}\\{dir_item.filename}"
            new_url = url_handler(current_level_url, dir_item.link)
            try:
                os.mkdir(new_filepath)
            except FileExistsError:
                log(f"{new_filepath} already exists!", 30)
            log(f"Starting on new directory: {new_filepath}", 20)
            crawl_current_level(new_url, new_filepath)
    # Raw string here so the backslash isn't treated as an escape character. Note the top-level folder needs to exist already; only the subfolders get created above.
    crawl_current_level("https://cdn.preterhuman.net/texts/", r"D:\Preterhuman")
#
# MAIN
#
def main():
    recursive_crawl()
if __name__ == '__main__':
    main()