homework 3.5
marquessbr, Mar 8th, 2012

# The web crawler we built at the end of Unit 2 has some serious flaws if we
# were going to use it in a real crawler. One problem is that if we start
# with a good seed page, it might run for an extremely long time (even
# forever, since the number of URLs on the web is not actually finite). The
# final two questions of the homework ask you to explore two different ways
# to limit the pages that it can crawl.


#######

# Modify the crawl_web procedure to take a second parameter, max_pages, that
# limits the number of pages to crawl. Your procedure should terminate the
# crawl after max_pages different pages have been crawled, or when there are
# no more pages to crawl.

# The following definition of get_page provides an interface to the website
# found at http://www.udacity.com/cs101x/index.html
#
# The function output order does not affect grading.
#
# crawl_web("http://www.udacity.com/cs101x/index.html", 1) => ['http://www.udacity.com/cs101x/index.html']
# crawl_web("http://www.udacity.com/cs101x/index.html", 3) => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html', 'http://www.udacity.com/cs101x/walking.html']
# crawl_web("http://www.udacity.com/cs101x/index.html", 500) => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html', 'http://www.udacity.com/cs101x/walking.html', 'http://www.udacity.com/cs101x/crawling.html', 'http://www.udacity.com/cs101x/kicking.html']

def get_page(url):
    # Simulated fetch: returns hard-coded HTML for the four test pages and an
    # empty string for anything else (a real get_page would fetch the URL).
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return '<html> <body> This is a test page for learning to crawl! <p> It is a good idea to  <a href="http://www.udacity.com/cs101x/crawling.html">learn to crawl</a> before you try to  <a href="http://www.udacity.com/cs101x/walking.html">walk</a> or  <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. </p> </body> </html> '
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return '<html> <body> I have not learned to crawl yet, but I am quite good at  <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>. </body> </html>'
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return '<html> <body> I cant get enough  <a href="http://www.udacity.com/cs101x/index.html">crawling</a>! </body> </html>'
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return '<html> <body> The magic words are Squeamish Ossifrage! </body> </html>'
    except:
        return ""
    return ""

def get_next_target(page):
    # Find the first '<a href=' tag and return the URL between its quotes,
    # along with the index of the closing quote (so the caller can resume
    # scanning after it). Returns (None, 0) when no link remains.
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

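# A quick sanity check for get_next_target (not from the original homework,
# just illustrative): the returned position is the index of the closing
# quote, so slicing the page there keeps the scan moving forward.
#
#   get_next_target('x <a href="http://a.com">a</a>')
#   => ('http://a.com', 23)
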
def union(p, q):
    # Append to p (in place) each element of q not already in p.
    for e in q:
        if e not in p:
            p.append(e)

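# Note that union mutates its first argument rather than returning a new
# list; for example (illustrative only):
#
#   p = [1, 2]
#   union(p, [2, 3])   # p is now [1, 2, 3]
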
def get_all_links(page):
    # Collect every link on the page by repeatedly taking the next target
    # and discarding the text already scanned.
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

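# On the test site above, get_all_links pulls the three outgoing links from
# the seed page:
#
#   get_all_links(get_page("http://www.udacity.com/cs101x/index.html"))
#   => ['http://www.udacity.com/cs101x/crawling.html',
#       'http://www.udacity.com/cs101x/walking.html',
#       'http://www.udacity.com/cs101x/flying.html']
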
def crawl_web(seed, max_pages):
    tocrawl = [seed]
    crawled = []
    conta_paginas = 0  # page counter ("conta paginas" is Portuguese for "counts pages")
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page)))
            crawled.append(page)
            conta_paginas = conta_paginas + 1
        # Stop once max_pages different pages have been crawled.
        if conta_paginas >= max_pages:
            break
    return crawled

print(crawl_web("http://www.udacity.com/cs101x/index.html", 2))
# => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html']
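
# A minimal self-check against the expected outputs quoted above. Order does
# not affect grading, so compare as sets (illustrative only, not part of the
# original homework):
index = "http://www.udacity.com/cs101x/index.html"
expected = {
    1: {index},
    3: {index,
        "http://www.udacity.com/cs101x/flying.html",
        "http://www.udacity.com/cs101x/walking.html"},
}
for limit, urls in expected.items():
    assert set(crawl_web(index, limit)) == urls, limit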