import urllib2
import re
from threading import Thread, Lock
import urlparse

_allowedtypes = ['html', 'htm', 'aspx', 'php']
_allowedsuffixes = ['.co.il', '.com', '.net']
downloadedfiles = []  # Saves every file that has already been downloaded, to prevent duplicates.

def getsource(url):  # Returns the webpage source; the parameter is a URL.
    page = urllib2.urlopen(url)  # Open the webpage
    return page.read()  # Read the content and return it as a string

def retrievelinks(sourcecode):  # Receives a source-code string as a parameter, returns a list of links.
    links = re.findall('href=\"(.*?)\"', sourcecode)  # Capture everything between href=" and the closing "
    links = map(removeslash, links)  # Make sure /Default.aspx is not treated as different from Default.aspx when filtering.
    links = list(set(links))  # Use the 'set' container to remove duplicates from the list.
    links = filter(lambda x: any(ftype in x for ftype in _allowedtypes) or any(x.endswith(suffix) for suffix in _allowedsuffixes), links)  # Drop links that match neither an allowed type nor an allowed suffix; improves performance.
    return links
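
# A rough illustration (not part of the original script) of what retrievelinks()
# returns; the HTML snippet and URLs below are made-up examples:
#   src = '<a href="/page2.html">a</a> <a href="style.css">b</a> <a href="http://example.co.il">c</a>'
#   retrievelinks(src)  # -> ['page2.html', 'http://example.co.il'] (order may vary because of set());
#                       #    'style.css' is dropped since it matches no allowed type or suffix,
#                       #    and removeslash() strips the leading / from /page2.html.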

def downloadweb(url, path, depth=1):  # Better not to use Google, as it won't match google.com links to google.co.il.
    """
    URL: simply the website URL.
    Path: a folder path on the PC.
    Depth:
        depth=1 downloads only the pages linked from the given URL.
        depth=2 also seeks links in the pages that have been downloaded.
        depth=3 also seeks links in the pages downloaded at the previous level,
        and so on.
    """
    global downloadedfiles
    threadedurls = []  # Saves every page we need to download and search for more links.
    src = getsource(url)
    urls = retrievelinks(src)
    if not path.endswith('/'):  # Make sure the path ends with a slash
        path = path + '/'
    print 'URLS found in ' + url + ' :'
    print urls
    print ' '
    for i in range(len(urls)):
        if urls[i] in downloadedfiles:  # Skip files that have already been downloaded
            continue
        if urls[i].startswith('mailto:'):  # mailto: links can show up in the results and should be ignored.
            continue
        if not urls[i].startswith('http://') and not urls[i].startswith('www.'):  # Internal links such as page2.html must become http://web.com/page2.html
            if url.endswith('/'):  # Add the / for redirection purposes
                fileurl = url + urls[i]
            else:
                fileurl = url + '/' + urls[i]
        else:  # Not an internal link, use it as-is.
            fileurl = urls[i]
        if not fileurl.startswith('http://'):  # Add the http protocol prefix
            fileurl = 'http://' + fileurl
        if url in fileurl or url in fileurl.partition('.')[2]:  # Keep only links under the same domain or subdomain.
            downloadedfiles.append(urls[i])
            threadedurls.append(fileurl)
    for i in threadedurls:  # Download every collected page in its own thread
        print i
        print "Starting thread for " + i
        t = Thread(target=savepage, args=(i, path,))  # Create the thread
        t.start()  # Start the thread
    print " "
    if depth != 1:  # Depth works as a recursive countdown
        for i in threadedurls:
            downloadweb(i, path, depth - 1)  # Recurse with depth - 1
    print threadedurls

def savepage(url, path):  # Saves the webpage source to the given path
    src = getsource(url)  # Get the source code
    if not path.endswith('/'):  # Fix the path in case it's missing the trailing slash
        path = path + '/'
    fname = "".join(x for x in url if x.isalnum())  # Keep only alphanumeric characters so the filename is valid
    fname = path + fname  # Prepend the path to the filename
    f = open(fname, 'w')  # Create the file
    f.write(src)  # Write the source
    f.close()  # Close the file

def removeslash(x):  # Strips a single leading slash so relative links compare consistently
    if x.startswith('/'):
        return x[1:]
    else:
        return x

# Example:
#downloadweb('http://magshimim.net', 'webpages/', 2)
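
# A usage sketch; the target site and local folder below are placeholders, not part
# of the original paste. Note that the output folder must already exist, since
# savepage() opens files with open(fname, 'w') and does not create directories.
if __name__ == '__main__':
    # Download every allowed page linked from the front page, then repeat once more
    # for each downloaded page (depth=2), saving the sources into webpages/ .
    downloadweb('http://example.co.il', 'webpages/', 2)
    # A single page can also be saved directly:
    savepage('http://example.co.il/index.html', 'webpages/')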