from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import urllib.request

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why HTMLParser is passed into the
# class definition
class LinkParser(HTMLParser):

    # This is a method that HTMLParser already provides,
    # but we are overriding it to add some functionality
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]

    # This is a new method that we are creating to get links;
    # our spider() function will call it
    def getLinks(self, url):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or PDFs, for example).
        # Note: servers often append a charset (e.g. "text/html; charset=utf-8"),
        # so check the prefix rather than testing for strict equality
        contentType = response.getheader('Content-Type', '')
        if contentType.startswith('text/html'):
            htmlBytes = response.read()
            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []
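
# A minimal sketch of using LinkParser on its own, outside of spider().
# The helper name demo_link_parser and its print output are additions for
# illustration, not part of the original paste:
def demo_link_parser(url):
    parser = LinkParser()
    html, links = parser.getLinks(url)
    print("Fetched", len(html), "characters of HTML containing", len(links), "links")
    return links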

# And finally here is our spider. It takes in a URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    foundURL = []
    mainPNG = "/main.png"
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string.
    # Our getLinks function returns both the web page itself
    # (useful for searching for the word)
    # and the set of links from that web page
    # (useful for deciding where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            # print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
                # Add the pages that we visited to the end of our collection
                # of pages to visit:
                pagesToVisit = pagesToVisit + links
                # print(" **Success!**")
                for item in links:
                    if word in item:
                        if len(item) > 44:
                            # The slicing below is specific to the target
                            # site's URL layout: links that look like
                            # ".../...ers/<name>.html", from which we build
                            # the URL of that page's main.png image
                            baseURL = item[:item.index("ers/") + 4]
                            # print(item.split("ers/")[0])
                            characterPart = item.split("ers/")[1].split(".html")[0]
                            fullURL = baseURL + characterPart + mainPNG
                            # print(fullURL)
                            foundURL.append(fullURL)
                            savedPicture = characterPart + ".png"
                            # print(savedPicture)
                            urllib.request.urlretrieve(fullURL, savedPicture)
                            # foundURL = foundURL + item
                            # print("FOUND: ", foundURL)
        except Exception as err:
            # str(err) is needed here; concatenating the exception object
            # itself to a string raises a TypeError
            print(" **Failed!** \n\n" + str(err) + "\n\n")
    if foundWord:
        foundURL = sorted(foundURL)
        print("\n".join(foundURL))
    else:
        print("Word never found")
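
# A minimal usage sketch, assuming the script is run directly. The start
# URL, search word, and page limit are placeholders (the original paste
# never calls spider()); the word should appear both in the page text and
# in the "...ers/<name>.html" links handled above, e.g. "characters":
if __name__ == "__main__":
    spider("http://www.netinstructions.com", "characters", 20)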