#!/usr/bin/python
# Call with two command line arguments for seed url and no of levels to go to
# Warning: More than 3 levels may take a long time to run and the number of
# links to follow grows exponentially with each level added.
# example:
# python web_crawler.py www.google.co.uk 3
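# A rough sense of that warning: if each page holds around 20 links, level 1
# scans only the seed page, level 2 fetches about 20 pages, level 3 about 400
# and level 4 about 8000, so each extra level multiplies the work by the
# average number of links per page.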

import urllib  # needed for get_page
import sys  # needed for cmd line args


# procedure to get target
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        url, end_quote = None, 0
    else:
        start_quote = page.find('"', start_link)
        end_quote = page.find('"', start_quote + 1)
        url = page[start_quote + 1:end_quote]
    return url, end_quote
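
# Quick illustration (a hypothetical snippet, not part of the crawler): for
# page = '<a href="http://example.com">home</a>', get_next_target(page)
# returns ('http://example.com', 27), i.e. the link url plus the index of its
# closing quote, which callers use as the offset to resume the next search.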

# code to loop through urls in a page
# consider adding code to get link descriptions as well (a possible sketch
# follows after this function)
def get_links(page, source_url, no_of_levels):
    # set up variables
    level = 0
    start_from = 0
    headings = ['Source_URL', 'Link_URL', 'Found_on_loop']
    all_data = [headings]
    src_err = "** Error getting html source for page **"
    stats = []  # record level and no of links
    while level < no_of_levels:
        level = level + 1
        this_level = []
        print ""
        print "Starting level", level, "search for links..."
        # build temp list for this level containing all link urls
        # and page sources to loop through...
        if level > 1:  # use to get all links from last level
            for row in all_data:
                if row[2] == (level - 1):
                    try:
                        # gets url and src html for row in last level of table
                        ll_url, ll_src = get_page(row[1], False)
                    except:
                        ll_url = row[1]
                        ll_src = src_err
                    this_level.append([ll_url, ll_src])
        else:  # first time round
            this_level = [[source_url, page]]
        item = 0  # track item within level (each of these will find links)
        lvl_cnt = 0  # track no of links found at each level
        for row in this_level:
            url = "initialised"  # variable to get links within source url
            src = row[1]
            source_url = row[0]
            item = item + 1
            #if source_url in all_data[0]:
            #    print "*** Found Dup (", source_url, ")"
            #    continue  # skip processing for row and go back to for loop
            print ""
            print "..starting item", item, "within level", level
            print " source url (", source_url, ")"
            i = 0  # reset count for no of links
            while url:
                if src and src != src_err:
                    src = src[start_from:]
                    # get next link from source html and return end position
                    # to use as the start point for the next search
                    url, start_from = get_next_target(src)
                    # check if url needs to be prefixed with source
                    if url:  # if string passed in is not empty
                        if url[0:4] != 'http':
                            url = source_url + url
                        # check if we already have source / url combination
                        got_already = False
                        for existing_row in all_data:
                            if [existing_row[0], existing_row[1]] == [source_url, url]:
                                got_already = True
                        if got_already == False:
                            all_data.append([source_url, url, level])
                            i = i + 1
                            if len(url) > 60:
                                print " found link: ", url[0:60] + "..."
                            else:
                                print " found link: ", url
                else:
                    # no usable source html for this page, stop looking for links
                    break
            print " identified", i, "new links from this url."
            lvl_cnt = lvl_cnt + i
        # record level stats
        stats.append([level, lvl_cnt])
        # break if no new data in level
        if lvl_cnt == 0:
            print ""
            print "** Breaking execution as no more new links have been found **"
            break
    # display results
    #print ""
    #print "All Data:"
    #for sublist in all_data:
    #    print sublist
    cnt = len(all_data) - 1
    print ""
    print "*********************************************************"
    print "Process finished - over", level, "levels found:", cnt, "links."
    print ""
    print "Breakdown [level, count of new links]:"
    print "------------------------------------"
    for lvl in stats:
        print lvl
    print ""
    print "*********************************************************"

# code to get html source into a variable from the url
# also returns url as this can be amended by checks
def get_page(url, checks):
    if checks:
        if url:  # if string passed in is not empty
            if url[0:4] != 'http':  # add http prefix if needed
                url = 'http://' + url
                print "Added http prefix (" + url + ")"
    print "Attempting to retrieve source html for page..."
    try:
        return url, urllib.urlopen(url).read()
    except:
        # return the (possibly amended) url with None so callers can always
        # unpack two values, even when the page could not be fetched
        return url, None
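
# For reference, get_page can be exercised on its own, e.g. (www.example.com
# is only a placeholder address here):
#   url, src = get_page('www.example.com', True)
#   # with checks=True the url comes back prefixed as 'http://www.example.com'
#   # and src holds the page html, or None if the request failed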

# control program
def main():
    # set up starting variables...
    seed_url = None  # this can be sent in as cmd line arg
    no_of_levels = 2  # how many levels of checks to perform
    # if cmd line arg sent in use as seed url:
    if len(sys.argv) > 1:
        seed_url = sys.argv[1]
    if len(sys.argv) > 2:
        no_of_levels = int(sys.argv[2])
    seed_url, page = get_page(seed_url, True)
    # if no valid seed_url by now prompt user and loop until valid
    while not page:
        print "Please enter the seed url to look for links:"
        seed_url = raw_input()
        seed_url, page = get_page(seed_url, True)
        if not page:  # if above did not pick up valid url
            print "The url entered (" + seed_url + ") is not valid."
            print "Please try again..."
            print ""
    #print "Here is the page source:"
    #print page
    print "Source html retrieved, searching for links..."
    # when we have a valid url go to get_links
    get_links(page, seed_url, no_of_levels)


# call main to start program
if __name__ == '__main__':
    main()