SHARE
TWEET

Untitled

DoromaAnim Nov 19th, 2019 115 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import functools
import html.parser
import sys
import urllib.error
import urllib.parse
import urllib.request
  4.  
  5.  
  6. def memoize(f):
  7.     memory = dict()
  8.  
  9.     def decorated(*args):
  10.         if args[0] in memory:
  11.             # aby nie uruchamiac fora po None
  12.             return []
  13.         res = f(*args)
  14.         memory[args[0]] = res
  15.         return res
  16.     return decorated
  17.  
  18.  
  19. def getHTMLcode(url):
  20.     """Pobranie kodu źródłowego strony o adresie url"""
  21.     try:
  22.         with urllib.request.urlopen(url) as website:
  23.             websitebytes = website.read()
  24.             try:
  25.                 code = websitebytes.decode("utf-8")
  26.             except UnicodeError:
  27.                 print(u"\u001b[38;5;196mBłąd podczas dekodowania strony: ", url)
  28.                 print(u"\u001b[0m", end="")
  29.                 return
  30.             return code
  31.     except (urllib.request.HTTPError, urllib.request.URLError):
  32.         print(u"\u001b[38;5;196mBłąd podczas otwierania strony: ", url)
  33.         print(u"\u001b[0m", end="")
  34.         return
  35.  
  36.  
  37. class MyHTMLParser(html.parser.HTMLParser):
  38.  
  39.     def __init__(self, action, url):
  40.         super().__init__()
  41.         self.links = []
  42.         self.result = []
  43.         self.url = url
  44.         self.action = action
  45.  
  46.     def handle_starttag(self, tag, attrs):
  47.         # Tag oznacza link
  48.         if tag == 'a' or tag == 'base':
  49.             for i in attrs:
  50.                 if(i[0] == "href"):
  51.                     self.links.append(urllib.parse.urljoin(self.url, i[1]))
  52.  
  53.         res = self.action([], starttag=(tag, attrs))
  54.         if res is not None:
  55.             self.result.extend(res)
  56.  
  57.     def handle_endtag(self, tag):
  58.         res = self.action([], endtag=(tag))
  59.         if res is not None:
  60.             self.result.extend(res)
  61.  
  62.     def handle_data(self, data):
  63.         res = self.action(data)
  64.         if res is not None:
  65.             self.result.extend(res)
  66.  
  67.  
  68. @memoize
  69. def crawl(start_page, distance, action):
  70.     result = []
  71.     websitecode = getHTMLcode(start_page)
  72.  
  73.     if websitecode is None:
  74.         yield None
  75.     else:
  76.         parser = MyHTMLParser(action, start_page)
  77.         parser.feed(websitecode)
  78.         result = parser.result
  79.         links = parser.links
  80.  
  81.         yield (start_page, result)
  82.  
  83.         if distance > 1:
  84.             for i in links:
  85.                 for j in crawl(i, distance - 1, action):
  86.                     yield j
  87.  
  88.  
  89. def patternsearch(pattern):
  90.     def action(arg, **kwargs):
  91.         result = []
  92.         # użycie tagu rozpoczynającego i jego argumentów
  93.         if 'starttag' in kwargs:
  94.             pass
  95.  
  96.         # użycie tagu kończącego i jego argumentów
  97.         if 'endtag' in kwargs:
  98.             pass
  99.  
  100.         # użycie danych tekstowych ze strony
  101.         if arg is not None and len(arg):
  102.             zdania = [i.strip() for i in arg.split('.') if len(i)]
  103.             for i in zdania:
  104.                 if i.lower().find(pattern) != -1:
  105.                     result.append(i)
  106.  
  107.         return result
  108.     return action
  109.  
  110.  
  111. def main():
  112.     if len(sys.argv) > 3:
  113.         crawler = crawl(sys.argv[1], int(sys.argv[2]), patternsearch(sys.argv[3]))
  114.         for i in crawler:
  115.             if i is not None:
  116.                 print(i)
  117.     else:
  118.         url = input("Podaj link do strony: ")
  119.         distance = int(input("Podaj głębokość przeszukiwania: "))
  120.         pattern = input("Podaj słowo do wyszukania: ")
  121.         crawler = crawl(url, distance, patternsearch(pattern))
  122.         for i in crawler:
  123.             if i is not None:
  124.                 print(i)
  125.  
  126.  
  127. main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top