from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import urllib.request

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has,
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]

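    # For reference, parse.urljoin resolves a relative URL against a base
    # (a quick illustration with made-up URLs, not part of the original
    # paste):
    #   parse.urljoin("http://www.netinstructions.com/pages/", "somepage.html")
    #   -> "http://www.netinstructions.com/pages/somepage.html"
    #   parse.urljoin("http://www.netinstructions.com/pages/", "/main.png")
    #   -> "http://www.netinstructions.com/main.png"
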
    # This is a new function that we are creating to get links
    # that our spider() function will call
    def getLinks(self, url):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as JavaScript files,
        # CSS, or PDFs, for example). The header often carries extra
        # parameters (e.g. "text/html; charset=utf-8"), so check for the
        # media type rather than comparing the whole header for equality.
        contentType = response.getheader('Content-Type', '')
        if 'text/html' in contentType:
            htmlBytes = response.read()
            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []
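
# A quick way to try LinkParser on its own (a sketch; the URL is just a
# placeholder, not a value from the original script):
#
#   parser = LinkParser()
#   html, links = parser.getLinks("http://www.netinstructions.com/")
#   print(len(links), "links found on the page")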

# And finally here is our spider. It takes in a URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    foundURL = []
    mainPNG = "/main.png"
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string.
    # Our getLinks function returns the web page itself
    # (this is useful for searching for the word)
    # and a set of links from that web page
    # (this is useful for deciding where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            # print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
                # Add the pages that we visited to the end of our collection
                # of pages to visit:
                pagesToVisit = pagesToVisit + links
                # print(" **Success!**")
                for item in links:
                    if word in item:
                        if len(item) > 44:
                            baseURL = item[:item.index("ers/") + 4]
                            # print(item.split("ers/")[0])
                            characterPart = item.split("ers/")[1].split(".html")[0]
                            fullURL = baseURL + characterPart + mainPNG
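                            # Worked example with a hypothetical link (the
                            # real site isn't named in this paste):
                            #   item = "http://www.example.com/game/characters/mario.html"
                            #   baseURL       -> "http://www.example.com/game/characters/"
                            #   characterPart -> "mario"
                            #   fullURL       -> ".../characters/mario/main.png"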
                            # print(fullURL)
                            foundURL.append(fullURL)
                            savedPicture = characterPart + ".png"
                            # print(savedPicture)
                            urllib.request.urlretrieve(fullURL, savedPicture)
                            # foundURL = foundURL + item
                            # print("FOUND: ", foundURL)
        except Exception as err:
            # str(err) is needed here; concatenating the exception object
            # itself to a string raises a TypeError
            print(" **Failed!** \n\n" + str(err) + "\n\n")
    if foundWord:
        foundURL = sorted(foundURL)
        print("\n".join(foundURL))
    else:
        print("Word never found")
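
# Example invocation (a hedged sketch: the starting URL, search word, and
# page limit below are placeholders, not values from the original script):
if __name__ == "__main__":
    spider("http://www.netinstructions.com/", "characters", 20)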