ChrisProsser

web_crawler.py

Jun 28th, 2013
#!/usr/bin/python

# Call with two command line arguments: the seed url and the number of levels
# to crawl.
# Warning: more than 3 levels may take a long time to run, as the number of
#          links to follow grows exponentially with each level added.

# example:
# python web_crawler.py www.google.co.uk 3

import urllib    # needed for get_page
import urlparse  # needed to resolve relative links in get_links
import sys       # needed for cmd line args

# return the next link url found in page, along with the index of the
# closing quote so the caller knows where to resume the search
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        url, end_quote = None, 0
    else:
        start_quote = page.find('"', start_link)
        end_quote = page.find('"', start_quote + 1)
        url = page[start_quote + 1:end_quote]
    return url, end_quote
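
# Illustrative note (not part of the original script): given a snippet such as
#   page = 'before <a href="http://example.com/page">text</a> after'
# get_next_target(page) locates the first <a href= tag, takes the text between
# the pair of double quotes that follows it, and returns
#   ('http://example.com/page', 39)
# where 39 is the index of the closing quote, which the caller uses to slice
# off the html it has already searched before the next call.
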
# code to loop through urls in a page
# consider adding code to get link descriptions as well
def get_links(page, source_url, no_of_levels):

    # set up variables
    level = 0
    headings = ['Source_URL', 'Link_URL', 'Found_on_loop']
    all_data = [headings]
    src_err = "** Error getting html source for page **"
    stats = []  # record level and no of links found at that level

    while level < no_of_levels:
        level = level + 1
        this_level = []
        print ""
        print "Starting level", level, "search for links..."

        # build temp list for this level containing all link urls
        # and page sources to loop through...
        if level > 1:  # use to get all links from the last level
            for row in all_data:
                if row[2] == (level - 1):
                    try:
                        # get url and src html for row in last level of table
                        ll_url, ll_src = get_page(row[1], False)
                    except Exception:
                        ll_url = row[1]
                        ll_src = src_err
                    this_level.append([ll_url, ll_src])
        else:  # first time round, use the seed page already retrieved
            this_level = [[source_url, page]]

        item = 0     # track item within level (each of these will find links)
        lvl_cnt = 0  # track no of links found at each level
        for row in this_level:
            url = "initialised"  # placeholder so the while loop starts
            src = row[1]
            source_url = row[0]
            item = item + 1
            start_from = 0  # reset the search position for each new page

            #if source_url in all_data[0]:
            #    print "*** Found Dup (", source_url, ")"
            #    continue # skip processing for row and go back to for loop

            print ""
            print "..starting item", item, "within level", level
            print "  source url (", source_url, ")"

            i = 0  # reset count of new links found on this page
            while url:

                if src and src != src_err:

                    src = src[start_from:]

                    # get next link from source html and return end position
                    # to use as the start point for the next search
                    url, start_from = get_next_target(src)

                    if url:  # a link was found
                        # resolve relative links against the source url
                        if url[0:4] != 'http':
                            url = urlparse.urljoin(source_url, url)

                        # check if we already have this source / url combination
                        got_already = False
                        for seen in all_data:
                            if [seen[0], seen[1]] == [source_url, url]:
                                got_already = True
                        if got_already == False:
                            all_data.append([source_url, url, level])
                            i = i + 1

                        if len(url) > 60:
                            print "    found link: ", url[0:60] + "..."
                        else:
                            print "    found link: ", url
                else:
                    break  # no usable source html for this page, move on

            print "  identified", i, "new links from this url."
            lvl_cnt = lvl_cnt + i

        # record level stats
        stats.append([level, lvl_cnt])

        # break if no new data in level
        if lvl_cnt == 0:
            print ""
            print "** Breaking execution as no more new links have been found **"
            break

    # display results
    #print ""
    #print "All Data:"
    #for sublist in all_data:
    #    print sublist
    cnt = len(all_data) - 1
    print ""
    print "*********************************************************"
    print "Process finished - over", level, "levels found:", cnt, "links."
    print ""
    print "Breakdown [level, count of new links]:"
    print "--------------------------------------"
    for lvl in stats:
        print lvl
    print ""
    print "*********************************************************"
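
# Optional speed-up sketch (an addition, not part of the original script): the
# "got_already" scan above walks the whole of all_data once per candidate link,
# which grows quadratically as links accumulate. Keeping a set of
# (source_url, url) tuples alongside all_data would make the membership test
# roughly constant time. Assuming a 'seen' set were created next to all_data,
# the check could look like:
#
#     if (source_url, url) not in seen:
#         seen.add((source_url, url))
#         all_data.append([source_url, url, level])
#         i = i + 1
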
# code to get html source into a variable from the url
# also returns the url, as this can be amended by the checks below
def get_page(url, checks):
    if checks:
        if url:  # if the string passed in is not empty
            if url[0:4] != 'http':  # add http prefix if needed
                url = 'http://' + url
                print "Added http prefix (" + url + ")"
            print "Attempting to retrieve source html for page..."
    try:
        return url, urllib.urlopen(url).read()
    except Exception:
        # return None for the page so callers can still unpack two values
        return url, None
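
# Illustrative usage of get_page (a hypothetical call, not in the original
# script): with checks=True the url is normalised before fetching, e.g.
#
#     url, src = get_page('www.example.com', True)
#     # url -> 'http://www.example.com'
#     # src -> the page html as a string, or None if the fetch failed
#
# get_links calls it with checks=False for pages found at deeper levels, since
# those urls have already been resolved to full http addresses.
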
# control program
def main():

    # set up starting variables...
    seed_url = None   # this can be sent in as a cmd line arg
    no_of_levels = 2  # how many levels of links to follow

    # if cmd line args were sent in, use them as the seed url and level count:
    if len(sys.argv) > 1:
        seed_url = sys.argv[1]
        if len(sys.argv) > 2:
            no_of_levels = int(sys.argv[2])

    seed_url, page = get_page(seed_url, True)

    # if there is no valid seed_url by now, prompt the user and loop until valid
    while not page:
        print "Please enter the seed url to look for links:"
        seed_url = raw_input()
        seed_url, page = get_page(seed_url, True)
        if not page:  # if the above did not pick up a valid url
            print "The url entered (" + seed_url + ") is not valid."
            print "Please try again..."
            print ""

    #print "Here is the page source:"
    #print page
    print "Source html retrieved, searching for links..."

    # once we have a valid url, go to get_links
    get_links(page, seed_url, no_of_levels)

# call main to start program
if __name__ == '__main__':
    main()