# Homework 3.5

#The web crawler we built at the
#end of Unit 2 has some serious
#flaws if we were going to use
#it in a real crawler. One
#problem is that if we start with
#a good seed page, it might
#run for an extremely long
#time (even forever, since the
#number of URLs on the web is not
#actually finite). The final two
#questions of the homework ask
#you to explore two different ways
#to limit the pages that it can
#crawl.


#######


#Modify the crawl_web procedure
#to take a second parameter,
#max_pages, that limits the
#number of pages to crawl.
#Your procedure should
#terminate the crawl after
#max_pages different pages
#have been crawled, or when
#there are no more pages to crawl.


#The following definition of
#get_page provides an interface
#to the website found at
#http://www.udacity.com/cs101x/index.html

#The function output order does not affect grading.

#crawl_web("http://www.udacity.com/cs101x/index.html",1) => ['http://www.udacity.com/cs101x/index.html']
#crawl_web("http://www.udacity.com/cs101x/index.html",3) => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html', 'http://www.udacity.com/cs101x/walking.html']
#crawl_web("http://www.udacity.com/cs101x/index.html",500) => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html', 'http://www.udacity.com/cs101x/walking.html', 'http://www.udacity.com/cs101x/crawling.html', 'http://www.udacity.com/cs101x/kicking.html']

def get_page(url):
    """Return the canned HTML for one page of the cs101x test site.

    This stands in for a real network fetch: each known URL maps to a fixed
    HTML string. Any other URL yields an empty string.
    """
    # (address, html) pairs, compared with == so that any kind of `url`
    # value can be looked up without needing to be hashable.
    site = (
        ("http://www.udacity.com/cs101x/index.html",
         '<html> <body> This is a test page for learning to crawl! <p> It is a good idea to  <a href="http://www.udacity.com/cs101x/crawling.html">learn to crawl</a> before you try to  <a href="http://www.udacity.com/cs101x/walking.html">walk</a> or  <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. </p> </body> </html> '),
        ("http://www.udacity.com/cs101x/crawling.html",
         '<html> <body> I have not learned to crawl yet, but I am quite good at  <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>. </body> </html>'),
        ("http://www.udacity.com/cs101x/walking.html",
         '<html> <body> I cant get enough  <a href="http://www.udacity.com/cs101x/index.html">crawling</a>! </body> </html>'),
        ("http://www.udacity.com/cs101x/flying.html",
         '<html> <body> The magic words are Squeamish Ossifrage! </body> </html>'),
    )
    try:
        for address, html in site:
            if url == address:
                return html
    except:
        # Catch-all: any error raised during the lookup is reported as an
        # empty page rather than propagated.
        return ""
    return ""

def get_next_target(page):
    """Find the first link target in an HTML string.

    Looks for the first '<a href=' marker and returns the text between the
    next pair of double quotes after it.

    Returns:
        A (url, end_quote) pair, where end_quote is the index in `page` of
        the closing quote, so the caller can resume scanning from there.
        Returns (None, 0) when there is no link, or when the link is
        malformed (opening or closing quote missing).
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # No opening quote after the marker: without this check the -1
        # would be used as an index and produce a bogus URL.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated attribute value: there is no well-defined URL.
        return None, 0
    return page[start_quote + 1:end_quote], end_quote

def union(p,q):
    """Append to list p, in place, every element of q not already in p.

    Elements are added in the order they appear in q; duplicates within q
    are added only once. Nothing is returned.
    """
    for candidate in q:
        if candidate in p:
            continue
        p.append(candidate)


def get_all_links(page):
    """Return every link target found in an HTML string, in page order.

    Repeatedly asks get_next_target for the next link and continues
    scanning from the position it reports. Links with an empty target
    (href="") are skipped rather than ending the scan, so links that
    follow them are still collected.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            # No further link on the page.
            break
        if url:
            links.append(url)
        if endpos <= 0:
            # The reported position would not move the scan forward;
            # stop rather than risk looping on the same text forever.
            break
        page = page[endpos:]
    return links


def crawl_web(seed,max_pages):
    """Crawl pages reachable from `seed`, visiting at most `max_pages`.

    Args:
        seed: URL of the page to start from.
        max_pages: upper bound on the number of distinct pages to crawl.
            A value of zero or less crawls nothing.

    Returns:
        The list of crawled URLs, in the order they were visited. The crawl
        stops once max_pages pages have been crawled or no pages remain.
    """
    tocrawl = [seed]
    crawled = []
    # Checking the limit before each fetch (rather than comparing a counter
    # with == afterwards) also bounds the crawl when max_pages <= 0.
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.pop()  # takes the most recently queued URL
        if page in crawled:
            continue
        union(tocrawl, get_all_links(get_page(page)))
        crawled.append(page)
    return crawled

# Demo run: crawl at most two pages starting from the test site's index page.
print(crawl_web("http://www.udacity.com/cs101x/index.html",2)) # => ['http://www.udacity.com/cs101x/index.html', 'http://www.udacity.com/cs101x/flying.html']