ChrisProsser

web_crawler.py

Jun 28th, 2013
#!/usr/bin/python

# Call with two command line arguments: the seed url and the number of levels
# to crawl.
# Warning: more than 3 levels may take a long time to run, as the number of
#          links to follow grows exponentially with each level added.

# example:
# python web_crawler.py www.google.co.uk 3

import urllib    # needed for get_page
import urlparse  # needed to resolve relative links in get_links
import sys       # needed for cmd line args

# return the next link url found in page, along with the index of the
# closing quote so the caller knows where to resume the search
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        url, end_quote = None, 0
    else:
        start_quote = page.find('"', start_link)
        end_quote = page.find('"', start_quote + 1)
        url = page[start_quote + 1:end_quote]
    return url, end_quote
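
# Illustrative note (not part of the original script): given a snippet such as
#   page = 'before <a href="http://example.com/page">text</a> after'
# get_next_target(page) locates the first <a href= tag, takes the text between
# the pair of double quotes that follows it, and returns
#   ('http://example.com/page', 39)
# where 39 is the index of the closing quote, which the caller uses to slice
# off the html it has already searched before the next call.
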
# code to loop through urls in a page
# consider adding code to get link descriptions as well
def get_links(page, source_url, no_of_levels):

    # set up variables
    level = 0
    headings = ['Source_URL', 'Link_URL', 'Found_on_loop']
    all_data = [headings]
    src_err = "** Error getting html source for page **"
    stats = []  # record level and no of links found at that level

    while level < no_of_levels:
        level = level + 1
        this_level = []
        print ""
        print "Starting level", level, "search for links..."

        # build temp list for this level containing all link urls
        # and page sources to loop through...
        if level > 1:  # use to get all links from the last level
            for row in all_data:
                if row[2] == (level - 1):
                    try:
                        # get url and src html for row in last level of table
                        ll_url, ll_src = get_page(row[1], False)
                    except Exception:
                        ll_url = row[1]
                        ll_src = src_err
                    this_level.append([ll_url, ll_src])
        else:  # first time round, use the seed page already retrieved
            this_level = [[source_url, page]]

        item = 0     # track item within level (each of these will find links)
        lvl_cnt = 0  # track no of links found at each level
        for row in this_level:
            url = "initialised"  # placeholder so the while loop starts
            src = row[1]
            source_url = row[0]
            item = item + 1
            start_from = 0  # reset the search position for each new page

            #if source_url in all_data[0]:
            #    print "*** Found Dup (", source_url, ")"
            #    continue # skip processing for row and go back to for loop

            print ""
            print "..starting item", item, "within level", level
            print "  source url (", source_url, ")"

            i = 0  # reset count of new links found on this page
            while url:

                if src and src != src_err:

                    src = src[start_from:]

                    # get next link from source html and return end position
                    # to use as the start point for the next search
                    url, start_from = get_next_target(src)

                    if url:  # a link was found
                        # resolve relative links against the source url
                        if url[0:4] != 'http':
                            url = urlparse.urljoin(source_url, url)

                        # check if we already have this source / url combination
                        got_already = False
                        for seen in all_data:
                            if [seen[0], seen[1]] == [source_url, url]:
                                got_already = True
                        if got_already == False:
                            all_data.append([source_url, url, level])
                            i = i + 1

                        if len(url) > 60:
                            print "    found link: ", url[0:60] + "..."
                        else:
                            print "    found link: ", url
                else:
                    break  # no usable source html for this page, move on

            print "  identified", i, "new links from this url."
            lvl_cnt = lvl_cnt + i

        # record level stats
        stats.append([level, lvl_cnt])

        # break if no new data in level
        if lvl_cnt == 0:
            print ""
            print "** Breaking execution as no more new links have been found **"
            break

    # display results
    #print ""
    #print "All Data:"
    #for sublist in all_data:
    #    print sublist
    cnt = len(all_data) - 1
    print ""
    print "*********************************************************"
    print "Process finished - over", level, "levels found:", cnt, "links."
    print ""
    print "Breakdown [level, count of new links]:"
    print "--------------------------------------"
    for lvl in stats:
        print lvl
    print ""
    print "*********************************************************"
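
# Optional speed-up sketch (an addition, not part of the original script): the
# "got_already" scan above walks the whole of all_data once per candidate link,
# which grows quadratically as links accumulate. Keeping a set of
# (source_url, url) tuples alongside all_data would make the membership test
# roughly constant time. Assuming a 'seen' set were created next to all_data,
# the check could look like:
#
#     if (source_url, url) not in seen:
#         seen.add((source_url, url))
#         all_data.append([source_url, url, level])
#         i = i + 1
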
# code to get html source into a variable from the url
# also returns the url, as this can be amended by the checks below
def get_page(url, checks):
    if checks:
        if url:  # if the string passed in is not empty
            if url[0:4] != 'http':  # add http prefix if needed
                url = 'http://' + url
                print "Added http prefix (" + url + ")"
            print "Attempting to retrieve source html for page..."
    try:
        return url, urllib.urlopen(url).read()
    except Exception:
        # return None for the page so callers can still unpack two values
        return url, None
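
# Illustrative usage of get_page (a hypothetical call, not in the original
# script): with checks=True the url is normalised before fetching, e.g.
#
#     url, src = get_page('www.example.com', True)
#     # url -> 'http://www.example.com'
#     # src -> the page html as a string, or None if the fetch failed
#
# get_links calls it with checks=False for pages found at deeper levels, since
# those urls have already been resolved to full http addresses.
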
# control program
def main():

    # set up starting variables...
    seed_url = None   # this can be sent in as a cmd line arg
    no_of_levels = 2  # how many levels of links to follow

    # if cmd line args were sent in, use them as the seed url and level count:
    if len(sys.argv) > 1:
        seed_url = sys.argv[1]
        if len(sys.argv) > 2:
            no_of_levels = int(sys.argv[2])

    seed_url, page = get_page(seed_url, True)

    # if there is no valid seed_url by now, prompt the user and loop until valid
    while not page:
        print "Please enter the seed url to look for links:"
        seed_url = raw_input()
        seed_url, page = get_page(seed_url, True)
        if not page:  # if the above did not pick up a valid url
            print "The url entered (" + seed_url + ") is not valid."
            print "Please try again..."
            print ""

    #print "Here is the page source:"
    #print page
    print "Source html retrieved, searching for links..."

    # once we have a valid url, go to get_links
    get_links(page, seed_url, no_of_levels)

# call main to start program
if __name__ == '__main__':
    main()