yanirx

webcrawler

Jul 21st, 2014
import urllib2
import re
from threading import Thread

_allowedtypes = ['html', 'htm', 'aspx', 'php']
_allowedsuffixes = ['.co.il', '.com', '.net']
downloadedfiles = [] # Every file that has already been downloaded, used to prevent duplicate downloads.

def getsource(url): # Returns the webpage source; the parameter is a URL.
    page = urllib2.urlopen(url) # Open the webpage
    return page.read() # Read the content and return it as a string

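# Illustrative only ('http://example.com' is a placeholder, not from the paste):
# getsource('http://example.com') returns that page's raw HTML markup as one string.
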
def retrievelinks(sourcecode): # Receives the source code as a string, returns a list of links.
    links = re.findall('href=\"(.*?)\"', sourcecode) # Grab everything between href=" and the closing "
    links = map(removeslash, links) # Make sure /Default.aspx and Default.aspx are not treated as different links when filtering.
    links = list(set(links)) # Use a set to remove duplicates from the list.
    links = filter(lambda x: any(ftype in x for ftype in _allowedtypes) or any(x.endswith(suffix) for suffix in _allowedsuffixes), links) # Drop links that match neither the allowed types nor the allowed suffixes; improves performance.

    return links

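# Illustrative only (the HTML snippet is made up, not from the paste):
# retrievelinks('<a href="/Default.aspx">Home</a> <a href="img/logo.png">Logo</a>')
# returns ['Default.aspx'] -- the .png link matches no allowed type or suffix and is dropped.
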
def downloadweb(url, path, depth=1): # Better not to use Google, as it won't match google.com links to google.co.il.
    """
    URL: Simply the website URL.

    Path: A folder path on the PC.

    Depth:
    depth=1 downloads only the pages linked from the given URL.
    depth=2 also searches the pages that have been downloaded.
    depth=3 also searches the pages downloaded from the previously searched pages,
    and so on.
    """
    global downloadedfiles
    threadedurls = [] # Pages we still need to download and search for more links in.
    src = getsource(url)
    urls = retrievelinks(src)
    if not path.endswith('/'): # Make sure the path ends with a slash
        path = path + '/'

    print 'URLs found in ' + url + ' :'
    print urls
    print ' '
    for link in urls:
        if link in downloadedfiles: # Skip files that have already been downloaded
            continue

        if link.startswith('mailto:'): # Mail links can appear in href attributes and should be ignored.
            continue

        if not link.startswith('http://') and not link.startswith('www.'): # Relative link such as page2.html; it needs the base URL, e.g. http://web.com/page2.html
            if url.endswith('/'): # Add the / for redirection purposes
                fileurl = url + link
            else:
                fileurl = url + '/' + link
        else: # Absolute link, use it as is.
            fileurl = link
        if not fileurl.startswith('http://'): # Add the http protocol prefix
            fileurl = 'http://' + fileurl

        if url in fileurl or url in fileurl.partition('.')[2]: # Keep the link only if the domain or subdomain matches.
            downloadedfiles.append(link)
            threadedurls.append(fileurl)
    for i in threadedurls: # Start a download thread for every collected link
        print i
        print "Starting thread for " + i
        t = Thread(target=savepage, args=(i, path)) # Create the thread
        t.start() # Start the thread
        print " "

    if depth != 1: # Depth works as a recursive countdown
        for i in threadedurls:
            downloadweb(i, path, depth - 1) # Recurse with depth - 1
        print threadedurls

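# Illustrative only ('http://example.com' and 'pages/' are placeholders, not from the paste):
# downloadweb('http://example.com', 'pages/', depth=2) saves every matching page linked from
# the front page, then repeats the crawl once more for each of those saved pages.
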
def savepage(url, path): # Saves the webpage source to the given path
    src = getsource(url) # Get the source code
    if not path.endswith('/'): # Fix the path in case it is missing the trailing slash
        path = path + '/'

    fname = "".join(x for x in url if x.isalnum()) # Strip non-alphanumeric characters so the filename is valid
    fname = path + fname # Prepend the path to the filename
    f = open(fname, 'w') # Create the file
    f.write(src) # Write the source
    f.close() # Close the file

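# Illustrative only (the URL is a placeholder): savepage('http://example.com/about.html', 'webpages/')
# writes the page source to 'webpages/httpexamplecomabouthtml'. Note that the target
# folder must already exist, since open() will not create it.
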
def removeslash(x): # Strips a single leading slash so /page.html and page.html compare equal
    if x.startswith('/'):
        return x[1:]
    else:
        return x
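# Illustrative only: removeslash('/Default.aspx') returns 'Default.aspx';
# removeslash('page2.html') is returned unchanged.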

# Example:
# downloadweb('http://magshimim.net', 'webpages/', 2)
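
# A minimal, illustrative entry point (not part of the original paste); it assumes the
# output folder already exists, e.g. run as: python webcrawler.py http://magshimim.net webpages/ 2
if __name__ == '__main__':
    import sys
    site, folder = sys.argv[1], sys.argv[2]
    crawl_depth = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    downloadweb(site, folder, crawl_depth)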