#!/usr/bin/env python
# Copyright (C) 2009 Veronica Valeros
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
# Author:
# Veronica Valeros vero.valeros@gmail.com
#
# Changelog
# - Implemented a depth limit in the crawling logic.
# - Added a summary at the end of crawling with statistics about the results.
# - Increased crawl speed.
# - Implemented the HEAD method to analyse file types before crawling.
# - Almost everything changed since the last published version!
#
# ToDo
# - [!] Exception inside the crawl() function. The while statement raises the exception:
#       <class 'httplib.IncompleteRead'>
#       ...
#       IncompleteRead(2020 bytes read, 4429 more expected)
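#
# A possible fix for the IncompleteRead issue above (an untested sketch, not
# part of the original code): in Python 2.7 the exception object carries the
# bytes read so far in its .partial attribute, so the read in crawl() could be
# wrapped to keep the partial body instead of aborting:
#   try:
#       content = response.read()
#   except httplib.IncompleteRead as error:
#       content = error.partial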
# standard imports
import sys
import re
import getopt
import urllib2
import urlparse
import httplib
import copy
import os
import time
import socket
import datetime
import getpass
####################
# Global Variables
debug=False
vernum='1.0.1'
verbose=False
log=False
auth=False
time_responses = []
# This regular expression identifies links in an HTTP response body
#linkregex = re.compile('[^>](?:href=|src=|content=\"http)[\'*|\"*](.*?)[\'|\"]',re.IGNORECASE)
linkregex = re.compile('[^>](?:href\=|src\=|content\=\"http)[\'*|\"*](.*?)[\'|\"].*?>',re.IGNORECASE)
linkredirect = re.compile('(?:open\\(\"|url=|URL=|location=\'|src=\"|href=\")(.*?)[\'|\"]')
linksrobots = re.compile('(?:Allow\:|Disallow\:|sitemap\:).*',re.IGNORECASE)
information_disclosure = re.compile('(?:<address>)(.*)[<]',re.IGNORECASE)
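# For example, on a page containing '<a href="/about.html">About</a>',
# linkregex.findall() captures '/about.html'.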
# HTTP Response Codes
# -------------------
error_codes={}
error_codes['0']='Keyboard Interrupt exception'
error_codes['1']='Skipping url'
error_codes['-2']='Name or service not known'
error_codes['22']='22 Unknown error'
error_codes['104']='104 Connection reset by peer'
error_codes['110']='110 Connection timed out'
error_codes['111']='111 Connection refused'
error_codes['200']='200 OK'
error_codes['300']='300 Multiple Choices'
error_codes['301']='301 Moved Permanently'
error_codes['302']='302 Found'
error_codes['305']='305 Use Proxy'
error_codes['307']='307 Temporary Redirect'
error_codes['400']='400 Bad Request'
error_codes['401']='401 Unauthorized'
error_codes['403']='403 Forbidden'
error_codes['404']='404 Not Found'
error_codes['405']='405 Method Not Allowed'
error_codes['407']='407 Proxy Authentication Required'
error_codes['408']='408 Request Timeout'
error_codes['500']='500 Internal Server Error'
error_codes['503']='503 Service Unavailable'
error_codes['504']='504 Gateway Timeout'
error_codes['505']='505 HTTP Version Not Supported'
error_codes['9999']='Server responded with an HTTP status code that we do not understand'
# End of global variables
###########################
# Print version information and exit
def version():
    """
    This function prints the version of this program. It takes no arguments.
    """
    print "+----------------------------------------------------------------------+"
    print "| "+ sys.argv[0] + " Version "+ vernum +" |"
    print "| This program is free software; you can redistribute it and/or modify |"
    print "| it under the terms of the GNU General Public License as published by |"
    print "| the Free Software Foundation; either version 2 of the License, or |"
    print "| (at your option) any later version. |"
    print "| |"
    print "| Author: Veronica Valeros, vero.valeros@gmail.com |"
    print "+----------------------------------------------------------------------+"
    print
# Print help information and exit:
def usage():
    """
    This function prints the possible options of this program.
    No parameters are needed.
    """
    print "+----------------------------------------------------------------------+"
    print "| "+ sys.argv[0] + " Version "+ vernum +" |"
    print "| This program is free software; you can redistribute it and/or modify |"
    print "| it under the terms of the GNU General Public License as published by |"
    print "| the Free Software Foundation; either version 2 of the License, or |"
    print "| (at your option) any later version. |"
    print "| |"
    print "| Author: Veronica Valeros, vero.valeros@gmail.com |"
    print "+----------------------------------------------------------------------+"
    print
    print "\nUsage: %s <options>" % sys.argv[0]
    print "Options:"
    print "  -h, --help                   Show this help message and exit"
    print "  -V, --version                Output version information and exit"
    print "  -v, --verbose                Be verbose"
    print "  -D, --debug                  Debug"
    print "  -u, --url                    URL to start crawling"
    print "  -w, --write                  Save crawl output to a local file"
    print "  -L, --common-log-format      Generate a log of the requests in CLF"
    print "  -e, --export-file-list       Create a file with the URLs of all files found during crawling. You can use wget to download the entire list"
    print "  -l, --crawl-limit            Maximum number of links to crawl"
    print "  -C, --crawl-depth            Limit the crawling depth to the value specified. Ex.: -C 2"
    print "  -d, --download-file          Specify the file types to download: png,pdf,jpeg,gif,css,x-javascript,x-shockwave-flash"
    print "  -i, --interactive-download   Before downloading files, lets you manually specify the types of files to download"
    print "  -U, --usuario                User name for authentication"
    print "  -P, --password               Prompt for a password for authentication"
    print
    print "Example: python crawler.py -u http://www.example.com -w -C 10 -i "
    print
    sys.exit(1)
def printout(input_text,output_file):
    """
    Two main functionalities are covered by this function:
    1. Prints a text to stdout.
    2. Writes the text to the given file.
    Returns no value.
    """
    global debug
    global verbose
    try:
        print input_text
        if output_file:
            try:
                output_file.write(input_text+'\n')
            except:
                print '[!] Not saving data in the output file'
    except Exception as inst:
        print '[!] Exception in printout() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
def check_url(url):
    """
    This function verifies that the given 'url' is well formed, that is, that
    it defines both a protocol (scheme) and a domain.
    The urlparse.urlparse() function is used.
    Returns 'True' or 'False'.
    """
    global debug
    global verbose
    try:
        url_parsed = urlparse.urlparse(url)
        if url_parsed.scheme and url_parsed.netloc:
            return True
        else:
            return False
    except Exception as inst:
        print '[!] Exception in check_url() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
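# For example, check_url('http://www.example.com') returns True, while
# check_url('www.example.com') returns False because the scheme is missing.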
def encode_url(url):
    """
    This function encodes the URL using percent (URL) encoding.
    Currently it only replaces spaces with '%20'.
    Returns the encoded URL.
    """
    global debug
    global verbose
    url_encoded = ""
    try:
        url_encoded = url.replace(" ","%20")
        #url_encoded = url_encoded.replace("&amp;","&")
        return url_encoded
    except Exception as inst:
        print '[!] Exception in encode_url() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
def log_line(request, response_code, response_size,log_file):
    """
    This function writes a log line for a given HTTP request in CLF
    (Common Log Format).
    Returns no value.
    """
    global debug
    global verbose
    try:
        try:
            if response_size == -1:
                content_size = '-'
            else:
                content_size = str(response_size)
            local_hostname = socket.gethostname()
            local_user = os.getenv('USER')
            timestamp = time.strftime('%e/%b/%Y:%X %z').strip()
            method = request.get_method()
            protocol = 'HTTP/1.1'        # This is the version of the protocol that urllib2 uses
            user_agent = request.get_header('User-agent')
            url = request.get_full_url()
            # COMMON LOG FORMAT
            log_file.write(local_hostname+' '+'-'+' '+local_user+' '+'['+timestamp+']'+' '+'"'+method+' '+url+' '+protocol+'"'+' '+str(response_code)+' '+content_size+' "-" "'+user_agent+'"\n')
            # URLSNARF FORMAT
            #log_file.write(local_hostname+' '+'- - '+'['+timestamp+']'+' '+'"'+method+' '+url+' '+protocol+'"'+' - - "-" "'+user_agent+'"\n')
        except:
            print 'Not logging the following request: {0}'.format(request.get_full_url())
    except Exception as inst:
        print '[!] Exception in log_line() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
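# A log line produced above looks roughly like this (hypothetical values):
# myhost - vero [10/Oct/2009:13:55:36 -0300] "GET http://www.example.com/ HTTP/1.1" 200 2326 "-" "Mozilla/4.0 (compatible;MSIE 5.5; Windows NT 5.0)"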
def get_url(url, host, username, password, download_files_flag):
    """
    This function requests the given URL using the urllib2 python library.
    Returns two values: [request,response]
    """
    global debug
    global verbose
    global auth
    # Vector to save the response time of each request. For now it is a global variable.
    global time_responses
    starttime=0
    endtime=0
    handler=""
    try:
        try:
            starttime= time.time()
            url = encode_url(url)
            if debug:
                print 'Encoded URL: '+url
            request = urllib2.Request(url)
            request.add_header('User-Agent','Mozilla/4.0 (compatible;MSIE 5.5; Windows NT 5.0)')
            if auth:
                password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
                password_manager.add_password(None, host, username, password)
                handler = urllib2.HTTPBasicAuthHandler(password_manager)
            if not download_files_flag:
                # First we do a HEAD request to see the type of the url we are going to crawl
                request.get_method = lambda : 'HEAD'
                if handler:
                    opener_web = urllib2.build_opener(handler)
                else:
                    opener_web = urllib2.build_opener()
                response = opener_web.open(request)
                # If it is not an HTML page (i.e. it is a file), we don't fetch the content
                if 'text/html' not in str(response.headers.typeheader):
                    opener_web.close()
                    endtime= time.time()
                    time_responses.append(endtime-starttime)
                    return [request,response]
            request.get_method = lambda : 'GET'
            if handler:
                opener_web = urllib2.build_opener(handler)
            else:
                opener_web = urllib2.build_opener()
            response = opener_web.open(request)
            opener_web.close()
            endtime= time.time()
            time_responses.append(endtime-starttime)
            return [request,response]
        except urllib2.HTTPError as error_code:
            return [request,error_code.getcode()]
        except urllib2.URLError as error_code:
            error = error_code.args[0]
            return [request,error[0]]
        except socket.error as error_code:
            error = error_code.args[0]
            try:
                error = error[0]
            except:
                pass
            return [request,error]
    except KeyboardInterrupt:
        try:
            print '\t[!] Press a key to continue'
            raw_input()
            return ["",1]
        except KeyboardInterrupt:
            return ["",0]
    except Exception as inst:
        print '[!] Exception in get_url() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
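# Typical use of get_url(): on success 'response' is a file-like object, on an
# HTTP or socket error it is an integer code, e.g.:
#   [request, response] = get_url('http://www.example.com/', 'http://www.example.com', '', '', False)
#   if response and not isinstance(response, int):
#       content = response.read()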
def get_links(link_host, link_path, content):
    """
    This function uses a regular expression to find links in an HTML source page.
    The regular expression used is defined in the 'linkregex' variable.
    Returns a vector of extracted links.
    """
    global debug
    global verbose
    global linkregex
    try:
        # We obtain the links in the given response
        links = linkregex.findall(content)
        # We analyze each link
        for link in links:
            try:
                link_clean = link.strip(' ')
            except:
                print 'error'
                continue
            parsed_link = urlparse.urlparse(link_clean)
            if not parsed_link.scheme and not parsed_link.netloc:
                if link_clean.startswith('/'):
                    if link_host.endswith('/'):
                        links[links.index(link)] = link_host.rstrip('/')+link_clean
                    else:
                        links[links.index(link)] = link_host+link_clean
                elif link_clean.startswith('./'):
                    links[links.index(link)] = link_host+link_clean
                else:
                    links[links.index(link)] = link_path+link_clean
            else:
                links[links.index(link)] = link_clean
        for link in links:
            links[links.index(link)] = link.split('#')[0]
        return links
    except Exception as inst:
        print '[!] Exception in get_links() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
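# For example, with link_host='http://www.example.com' and
# link_path='http://www.example.com/dir/', a relative link 'page.html' is
# rewritten as 'http://www.example.com/dir/page.html' and '/top.html' as
# 'http://www.example.com/top.html'.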
def crawl(url,usuario,password,output_filename,crawl_limit=0,log=False,log_filename='none',crawl_depth=0):
    """
    Crawl a given url using a breadth-first exploration.
    Returns the following values: [links_crawled, urls_not_crawled, links_to_files]
    """
    global debug
    global verbose
    global error_codes
    # Vector that stores the remaining URLs to crawl
    urls_to_crawl = []
    urls_not_crawled = []
    links_crawled = []
    links_extracted = []
    files=[]
    crawl_limit_flag=False
    urls_to_crawl.append(url)
    if (crawl_limit>0):
        crawl_limit_flag=True
    if crawl_depth > 0:
        # We add 3 to the user-given depth to account for the two slashes in
        # 'http://' and the one that follows the hostname
        crawl_depth = crawl_depth + 3
    try:
        printout('[+] Site to crawl: '+url,output_filename)
        printout('[+] Start time: '+str(datetime.datetime.today()),output_filename)
        if output_filename:
            printout('[+] Output file: '+output_filename.name,output_filename)
        if log:
            printout('[+] Common log format output: '+log_filename.name,output_filename)
        printout('',output_filename)
        printout('[+] Crawling',output_filename)
        while urls_to_crawl:
            if crawl_limit_flag:
                if (len(links_crawled) >= crawl_limit):
                    break
            try:
                # We extract the next url to crawl
                url = urls_to_crawl[0]
                urls_to_crawl.remove(url)
                # Here we limit the crawl depth
                if crawl_depth > 0:
                    if url.endswith('/'):
                        if url.rpartition('/')[0].count('/') >= crawl_depth:
                            continue
                    elif url.count('/') >= crawl_depth:
                        continue
                # We add the url to the links crawled
                links_crawled.append(url)
                # We print the URL that is being crawled
                printout(' [-] '+str(url),output_filename)
                # We extract the host of the crawled URL
                parsed_url = urlparse.urlparse(url)
                host = parsed_url.scheme + '://' + parsed_url.netloc
                if parsed_url.path.endswith('/'):
                    link_path = host + parsed_url.path
                else:
                    link_path = host + parsed_url.path.rpartition('/')[0] + '/'
                # We obtain the response of the URL
                [request,response] = get_url(url,host,usuario, password,False)
                # If there is a response
                if response:
                    # If the server didn't return an HTTP error
                    if not isinstance(response, int):
                        content = response.read()
                        if log:
                            log_line(request,response.getcode(),len(content),log_filename)
                        # We check the file type of the crawled page
                        if response.headers.typeheader:
                            # If it isn't an HTML file
                            if 'text/html' not in response.headers.typeheader:
                                if url not in [f[0] for f in files]:
                                    files.append([url,str(response.headers.typeheader.split('/')[1].split(';')[0])])
                                    if verbose:
                                        printout('\t[-] ('+str(response.getcode())+') '+str(response.headers.typeheader),output_filename)
                            else:
                                #if verbose:
                                #    printout('\t[-] ('+str(response.getcode())+') '+str(response.headers.typeheader),output_filename)
                                links_extracted = get_links(host, link_path, content)
                                links_extracted.sort()
                                # We add new links to the list of urls to crawl
                                for link in links_extracted:
                                    if debug:
                                        print '\t [i] {0}'.format(link)
                                    parsed_link= urlparse.urlparse(link)
                                    link_host = parsed_link.scheme + '://' + parsed_link.netloc
                                    # We only crawl URLs of the same host
                                    if link_host == host:
                                        if link not in links_crawled and link not in urls_to_crawl:
                                            urls_to_crawl.append(link)
                                    elif link not in urls_not_crawled:
                                        urls_not_crawled.append(link)
                    else:
                        # We print the error code if necessary
                        printout('\t[i] '+error_codes[str(response)],output_filename)
                        if log:
                            log_line(request,response,-1,log_filename)
                else:
                    if response==1:
                        continue
                    if response==0:
                        print '[!] Skipping the rest of the urls'
                        break
            except KeyboardInterrupt:
                try:
                    print '[!] Press a key to continue'
                    raw_input()
                    continue
                except KeyboardInterrupt:
                    print '[!] Exiting'
                    break
            except Exception as inst:
                print '[!] Exception inside crawl() function. The while statement raised the exception.'
                print type(inst)        # the exception instance
                print inst.args         # arguments stored in .args
                print inst              # __str__ allows args to be printed directly
                print 'Response: {0}'.format(response)
                break
        printout('[+] Total urls crawled: '+str(len(links_crawled)),output_filename)
        printout('',output_filename)
        return [links_crawled,urls_not_crawled,files]
    except KeyboardInterrupt:
        try:
            print '[!] Press a key to continue'
            raw_input()
            return 1
        except KeyboardInterrupt:
            print '[!] Keyboard interruption. Exiting'
            return 1
    except Exception as inst:
        print '[!] Exception in crawl() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
def external_links(root_url,external_vector,output_filename):
    """
    This function detects external links in a list of given URLs. Links not
    matching the root URL are considered external.
    Returns no values.
    """
    global debug
    global verbose
    external_websites = []
    try:
        parsed_url = urlparse.urlparse(root_url)
        link_host = parsed_url.scheme + '://' + parsed_url.netloc
        domain = parsed_url.netloc.split('www.')[-1]
        printout('',output_filename)
        printout('[+] Related subdomains found: ',output_filename)
        tmp=[]
        for link in external_vector:
            parsed = urlparse.urlparse(link)
            if domain in parsed.netloc:
                subdomain = parsed.scheme+'://'+parsed.netloc
                if subdomain not in tmp:
                    tmp.append(subdomain)
                    printout(' [-] '+subdomain,output_filename)
        printout('[+] Total: '+str(len(tmp)),output_filename)
        printout('',output_filename)
        printout('[+] Email addresses found: ',output_filename)
        for link in external_vector:
            if 'mailto' in urlparse.urlparse(link).scheme:
                printout(' [-] '+link.split(':')[1].split('?')[0],output_filename)
        printout('',output_filename)
        printout('[+] This website has references to the following websites: ',output_filename)
        for link in external_vector:
            parsed = urlparse.urlparse(link)
            if parsed.netloc:
                if domain not in parsed.netloc:
                    external_domain = parsed.scheme+'://'+parsed.netloc
                    if external_domain not in external_websites:
                        external_websites.append(external_domain)
        external_websites.sort()
        for link in external_websites:
            printout(' [-] '+link,output_filename)
        printout('[+] Total: '+str(len(external_websites)),output_filename)
    except Exception as inst:
        print '[!] Exception in external_links() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
def indexing_search(usuario, password,links_vector,output_filename):
    """
    This function identifies directories in a given list of URLs and checks
    whether directory indexing is enabled on them.
    Returns the following values: [directories, indexing]
    """
    global debug
    global verbose
    global error_codes
    directories=[]
    indexing=[]
    request=""
    response=""
    title_start_position = -1
    title_end_position = -1
    title=""
    try:
        # Identifying directories
        for i in links_vector:
            while ( len(i.split('/')) > 4 ):
                i=i.rpartition('/')[0]
            if ( ( i+'/' ) not in directories ):
                directories.append(i+'/')
        # We sort the directories vector for proper visualization of the data
        directories.sort()
        printout('[+] Directories found:',output_filename)
        for directory in directories:
            printout(' [-] '+directory,output_filename)
        printout('[+] Total directories: '+str(len(directories)),output_filename)
        printout('',output_filename)
        printout('[+] Directories with indexing',output_filename)
        dots='.'
        for directory in directories:
            sys.stdout.flush()
            sys.stdout.write('\r'+dots)
            if len(dots)>30:
                dots='.'
            dots=dots+'.'
            title=""
            try:
                # We extract the host of the URL
                parsed_url = urlparse.urlparse(directory)
                host = parsed_url.scheme + '://' + parsed_url.netloc
                # We obtain the response of the URL
                [request,response] = get_url(directory, host, usuario, password,False)
                # If there is a response
                if response:
                    # If the server didn't return an HTTP error
                    if not isinstance(response, int):
                        content = response.read()
                        title_start_position = content.find('<title>')
                        if title_start_position != -1:
                            title_end_position = content.find('</title>', title_start_position+7)
                        if title_end_position != -1:
                            title = content[title_start_position+7:title_end_position]
                        if title:
                            if title.find('Index of') != -1:
                                printout('\n [!] '+directory,output_filename)
                                indexing.append(directory)
                            elif verbose:
                                printout(' [-] '+directory,output_filename)
                    else:
                        if debug:
                            # We print the error code if necessary
                            printout(' [-] '+directory+' ('+error_codes[str(response)]+')',output_filename)
                else:
                    if response==1:
                        continue
                    if response==0:
                        print '[!] Skipping the rest of the directories'
                        break
            except KeyboardInterrupt:
                try:
                    print '[!] Press a key to continue'
                    raw_input()
                    pass
                except KeyboardInterrupt:
                    print '[!] Exiting'
                    break
        printout('\n[+] Total directories with indexing: '+str(len(indexing)),output_filename)
        printout('',output_filename)
        return [directories,indexing]
    except Exception as inst:
        print '[!] Exception in indexing_search() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return 1
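# Note on the directory loop above: each URL is repeatedly stripped of its last
# path segment, so for example 'http://www.example.com/a/b/page.html' is
# reduced to 'http://www.example.com/a/' before being added to the directories
# list.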
def report_files(export_file_list,files_vector,output_filename):
    """
    This function reports the URLs of the files found during crawling and can
    export the list to an output file.
    """
    global debug
    global verbose
    try:
        if len(files_vector)>0:
            printout('[+] Files found:',output_filename)
            local_file = ""
            if export_file_list:
                try:
                    local_file = open(output_name.rpartition('.')[0]+'.files','w')
                    printout('[+] Exporting list of files found to: '+output_name.rpartition('.')[0]+'.files',output_filename)
                except (OSError, IOError) as error:
                    print '[+] Error creating output file to export list of files.'
                    export_file_list=False
            # We print the files found during the crawling
            for [i,j] in files_vector:
                printout(' [-] '+str(i)+' ('+str(j)+')',output_filename)
                if export_file_list and local_file:
                    local_file.write(i+'\n')
            printout('[+] Total files: '+str(len(files_vector)),output_filename)
            if export_file_list and local_file:
                local_file.close()
    except Exception as inst:
        print '[!] Exception in report_files() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return 1
def download_files(extensions_to_download,files_vector,usuario,password,interactive_flag,output_filename):
    """
    This function downloads the files whose extensions match those given in
    extensions_to_download.
    If interactive_flag is True, the user can manually select which of the
    file extensions found during crawling should be downloaded.
    Returns the list of file extensions found during crawling.
    """
    global debug
    global verbose
    list_of_files_to_download=[]
    extensions_found=[]
    try:
        if len(files_vector)>0:
            # Looking for the types of files found during crawling
            for [i,j] in files_vector:
                if j not in extensions_found:
                    extensions_found.append( j )
            # If the interactive mode is enabled, we ask the user which files to download
            if interactive_flag:
                print
                print '[+] Starting to download files'
                print '[+] The following file types were found during crawling:'
                print '    ',
                print extensions_found
                print '    Select which types of files you want to download. Ex.: png,pdf,css.'
                extensions_to_download= raw_input('    ')
            # Looking for files matching the download criteria
            for [i,j] in files_vector:
                if (j in extensions_to_download):
                    list_of_files_to_download.append(i)
            # If at least one file matches the download criteria, we create an output directory and download them
            if ( len(list_of_files_to_download) > 0 ):
                # Fetching found files
                printout('',output_filename)
                printout('[+] Downloading specified files: '+extensions_to_download,output_filename)
                printout('[+] Total files to download: '+str(len(list_of_files_to_download)),output_filename)
                # Creating an output directory for the downloaded files
                output_directory = output_name.rpartition('.')[0]+'_files'
                try:
                    os.mkdir(output_directory)
                    printout('[+] Output directory: '+output_directory,output_filename)
                except OSError as error:
                    if 'File exists' in str(error):
                        print '\n[!] Directory already exists. Press a key to overwrite or CTRL+C to cancel the download'
                        try:
                            raw_input()
                            printout('[+] Output directory: '+output_directory,output_filename)
                        except KeyboardInterrupt:
                            printout('\n[+] Download of files aborted',output_filename)
                            return 1
                    else:
                        printout('\n[!] Download of files aborted. Error while creating the output directory.',output_filename)
                        return 1
                # Downloading files
                for i in list_of_files_to_download:
                    printout(' [-] '+i,output_filename)
                    # We extract the host of the crawled URL
                    parsed_url = urlparse.urlparse(i)
                    host = parsed_url.scheme + '://' + parsed_url.netloc
                    [request,response] = get_url(i.replace(' ','%20'), host, usuario, password, True)
                    if response:
                        if not isinstance(response, int):
                            content = response.read()
                            local_file = ""
                            try:
                                local_file=open(output_directory+'/'+i.rpartition('/')[2],'w')
                            except (OSError, IOError) as error:
                                printout(' [-] Impossible to create output file for: '+output_directory+'/'+i.rpartition('/')[2],output_filename)
                            if local_file:
                                local_file.write(content)
                                local_file.close()
                printout('[+] Download complete',output_filename)
                printout('',output_filename)
        return extensions_found
    except Exception as inst:
        print '[!] Exception in download_files() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
####################
# STATISTICS FUNCTION
####################
def statistics(global_time, directories, indexing, links_crawled, files, extensions_found, output_filename):
    """
    This function prints a summary of the crawling: elapsed time, average time
    per query, and the amount of links, directories and files found.
    """
    global debug
    global verbose
    global time_responses
    queries_time = 0
    avg_time_per_query = 0
    amt_files_per_extension = {}
    try:
        print
        if len(links_crawled) > 1:
            # Calculating the average time per query
            for i in time_responses:
                queries_time = queries_time + i
            try:
                avg_time_per_query = (queries_time / len(time_responses))
            except:
                avg_time_per_query = 0
            # Counting the files found per extension
            for [link,extension] in files:
                amt_files_per_extension[extension] = 0
            for [link,extension] in files:
                amt_files_per_extension[extension] += 1
            print '___________'
            print
            print 'Summary'
            print '___________'
            print
            if output_filename:
                print '[+] Output file stored at: {0}'.format(os.path.realpath(output_name))
                print
            print '[+] Total elapsed time: {0} seconds ({1} min)'.format(round(global_time,2),round((global_time/60),2))
            print '[+] AVG time per query: {0} seconds'.format(round(avg_time_per_query,2))
            print
            print '[+] Total links crawled\t{0}'.format(str(len(links_crawled)-len(files)))
            print '[+] Total directories\t{0}'.format(str(len(directories)))
            print '    [-] Indexing\t{0}'.format(str(len(indexing)))
            print '[+] Total found files\t{0}'.format(str(len(files)))
            for key in amt_files_per_extension.keys():
                print '     | '+key+'\t'+str(amt_files_per_extension[key])
    except Exception as inst:
        print '[!] Exception in statistics() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        return -1
##########
# MAIN
##########
def main():
    global debug
    global verbose
    global log
    global auth
    global output
    global output_name
    url_to_crawl = ""
    usuario = "crawler123"
    password = "crawler123"
    crawl_limit = 0
    extensions_to_download = ""
    download_files_flag=False
    export_file_list = False
    interactive_flag=False
    starttime=0
    endtime=0
    # Data lists
    directories = []
    indexing = []
    links_crawled = []
    externals_url_vector = []
    files_vector = []
    extensions_found = []
    crawl_depth = 0
    save_output=False
    output_name = ""
    output_file = ""
    log_name = ""
    log_file = ""
    try:
        # Parsing the command line arguments
        opts, args = getopt.getopt(sys.argv[1:], "hVDwu:vLU:Pl:d:eiC:", ["help","version","debug","write","url=","verbose","common-log-format","usuario=","password","crawl-limit=","download-file=","export-file-list","interactive-download","crawl-depth="])
    except getopt.GetoptError:
        usage()
    for opt, arg in opts:
        if opt in ("-h", "--help"): usage()
        if opt in ("-V", "--version"): version();sys.exit(0)
        if opt in ("-D", "--debug"): debug=True
        if opt in ("-w", "--write"): save_output=True
        if opt in ("-u", "--url"): url_to_crawl = arg
        if opt in ("-v", "--verbose"): verbose = True
        if opt in ("-L", "--common-log-format"): log = True
        if opt in ("-U", "--usuario"): usuario = arg
        if opt in ("-P", "--password"): password = getpass.getpass() ; auth = True
        if opt in ("-l", "--crawl-limit"): crawl_limit = int(arg)
        if opt in ("-d", "--download-file"): extensions_to_download = arg ; download_files_flag=True
        if opt in ("-i", "--interactive-download"): interactive_flag=True
        if opt in ("-e", "--export-file-list"): export_file_list = True
        if opt in ("-C", "--crawl-depth"): crawl_depth = arg
    try:
        if debug:
            print '[+] Debugging mode enabled'
        if check_url(url_to_crawl):
            date = str(datetime.datetime.today()).rpartition('.')[0].replace('-','').replace(' ','_').replace(':','')
            if save_output:
                output_name = urlparse.urlparse(url_to_crawl).netloc+'.crawler'
                try:
                    output_file = open(output_name,'w')
                except (OSError, IOError) as error:
                    print '[!] Error creating the output file. Not saving the output.'
                    output_name = ""
            else:
                output_name = ""
            if log:
                log_name = date +'_'+ urlparse.urlparse(url_to_crawl).netloc + '.log'
                try:
                    log_file = open(log_name,'w')
                except (OSError, IOError) as error:
                    print '[!] Error creating the log file. Not logging the requests.'
                    log=False
            starttime=time.time()
            # Crawl function
            [links_crawled,externals_url_vector, files_vector] = crawl(url_to_crawl, usuario, password, output_file, crawl_limit, log, log_file, int(crawl_depth))
            # Indexing search
            [directories, indexing] = indexing_search(usuario, password, links_crawled, output_file)
            # Printing found files and exporting the list to an output file
            report_files(export_file_list, files_vector, output_file)
            # Searching for external links
            external_links(url_to_crawl, externals_url_vector, output_file)
            # Downloading files
            if download_files_flag or interactive_flag:
                extensions_found = download_files(extensions_to_download, files_vector, usuario, password, interactive_flag, output_file)
            printout('',output_file)
            printout('[+] End time: '+str(datetime.datetime.today()),output_file)
            endtime=time.time()
            # Printing statistics
            statistics(endtime-starttime, directories, indexing, links_crawled, files_vector, extensions_found, output_name)
            try:
                output_file.close()
            except:
                pass
            try:
                log_file.close()
            except:
                pass
        else:
            print
            print '[!] Check the URL provided, it should be like: http://www.example.com or http://asdf.com'
            print
            usage()
    except KeyboardInterrupt:
        # Pretty handling of CTRL-C
        print 'Keyboard Interruption!. Exiting.'
        sys.exit(1)
    except Exception as inst:
        print '[!] Exception in main() function'
        print type(inst)        # the exception instance
        print inst.args         # arguments stored in .args
        print inst              # __str__ allows args to be printed directly
        sys.exit(1)

if __name__ == '__main__':
    main()