yanirx

webcrawler

Jul 21st, 2014
import urllib2
import re
from threading import Thread

_allowedtypes = ['html', 'htm', 'aspx', 'php']
_allowedsuffixes = ['.co.il', '.com', '.net']
downloadedfiles = [] # Every file that has already been downloaded, used to prevent duplicate downloads.

def getsource(url): # Returns the webpage source; the parameter is a URL.
    page = urllib2.urlopen(url) # Open the webpage
    return page.read() # Read the content and return it as a string

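# Illustrative only ('http://example.com' is a placeholder, not from the paste):
# getsource('http://example.com') returns that page's raw HTML markup as one string.
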
def retrievelinks(sourcecode): # Receives the source code as a string, returns a list of links.
    links = re.findall('href=\"(.*?)\"', sourcecode) # Grab everything between href=" and the closing "
    links = map(removeslash, links) # Make sure /Default.aspx and Default.aspx are not treated as different links when filtering.
    links = list(set(links)) # Use a set to remove duplicates from the list.
    links = filter(lambda x: any(ftype in x for ftype in _allowedtypes) or any(x.endswith(suffix) for suffix in _allowedsuffixes), links) # Drop links that match neither the allowed types nor the allowed suffixes; improves performance.

    return links

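# Illustrative only (the HTML snippet is made up, not from the paste):
# retrievelinks('<a href="/Default.aspx">Home</a> <a href="img/logo.png">Logo</a>')
# returns ['Default.aspx'] -- the .png link matches no allowed type or suffix and is dropped.
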
def downloadweb(url, path, depth=1): # Better not to use Google, as it won't match google.com links to google.co.il.
    """
    URL: Simply the website URL.

    Path: A folder path on the PC.

    Depth:
    depth=1 downloads only the pages linked from the given URL.
    depth=2 also searches the pages that have been downloaded.
    depth=3 also searches the pages downloaded from the previously searched pages,
    and so on.
    """
    global downloadedfiles
    threadedurls = [] # Pages we still need to download and search for more links in.
    src = getsource(url)
    urls = retrievelinks(src)
    if not path.endswith('/'): # Make sure the path ends with a slash
        path = path + '/'

    print 'URLs found in ' + url + ' :'
    print urls
    print ' '
    for link in urls:
        if link in downloadedfiles: # Skip files that have already been downloaded
            continue

        if link.startswith('mailto:'): # Mail links can appear in href attributes and should be ignored.
            continue

        if not link.startswith('http://') and not link.startswith('www.'): # Relative link such as page2.html; it needs the base URL, e.g. http://web.com/page2.html
            if url.endswith('/'): # Add the / for redirection purposes
                fileurl = url + link
            else:
                fileurl = url + '/' + link
        else: # Absolute link, use it as is.
            fileurl = link
        if not fileurl.startswith('http://'): # Add the http protocol prefix
            fileurl = 'http://' + fileurl

        if url in fileurl or url in fileurl.partition('.')[2]: # Keep the link only if the domain or subdomain matches.
            downloadedfiles.append(link)
            threadedurls.append(fileurl)
    for i in threadedurls: # Start a download thread for every collected link
        print i
        print "Starting thread for " + i
        t = Thread(target=savepage, args=(i, path)) # Create the thread
        t.start() # Start the thread
        print " "

    if depth != 1: # Depth works as a recursive countdown
        for i in threadedurls:
            downloadweb(i, path, depth - 1) # Recurse with depth - 1
        print threadedurls

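# Illustrative only ('http://example.com' and 'pages/' are placeholders, not from the paste):
# downloadweb('http://example.com', 'pages/', depth=2) saves every matching page linked from
# the front page, then repeats the crawl once more for each of those saved pages.
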
def savepage(url, path): # Saves the webpage source to the given path
    src = getsource(url) # Get the source code
    if not path.endswith('/'): # Fix the path in case it is missing the trailing slash
        path = path + '/'

    fname = "".join(x for x in url if x.isalnum()) # Strip non-alphanumeric characters so the filename is valid
    fname = path + fname # Prepend the path to the filename
    f = open(fname, 'w') # Create the file
    f.write(src) # Write the source
    f.close() # Close the file

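# Illustrative only (the URL is a placeholder): savepage('http://example.com/about.html', 'webpages/')
# writes the page source to 'webpages/httpexamplecomabouthtml'. Note that the target
# folder must already exist, since open() will not create it.
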
def removeslash(x): # Strips a single leading slash so /page.html and page.html compare equal
    if x.startswith('/'):
        return x[1:]
    else:
        return x
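# Illustrative only: removeslash('/Default.aspx') returns 'Default.aspx';
# removeslash('page2.html') is returned unchanged.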

# Example:
# downloadweb('http://magshimim.net', 'webpages/', 2)
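
# A minimal, illustrative entry point (not part of the original paste); it assumes the
# output folder already exists, e.g. run as: python webcrawler.py http://magshimim.net webpages/ 2
if __name__ == '__main__':
    import sys
    site, folder = sys.argv[1], sys.argv[2]
    crawl_depth = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    downloadweb(site, folder, crawl_depth)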