#! /usr/bin/env python
#  Copyright (C) 2009  Veronica Valeros
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
# Author:
# Veronica Valeros vero.valeros@gmail.com
#
# Changelog
# - Implemented a depth limit in the crawling logic.
# - Added a summary at the end of crawling with statistics about the crawling results.
# - Increased crawl speed.
# - Implemented the HEAD method for analysing file types before crawling.
# - Almost all new since the last published version!
#
# ToDo
# - [!] Exception inside the crawl() function. The while statement raises the exception:
#   <class 'httplib.IncompleteRead'>
#   ...
#   IncompleteRead(2020 bytes read, 4429 more expected)


# standard imports
import sys
import re
import getopt
import urllib2
import urlparse
import httplib
import copy
import os
import time
import socket
import datetime

import getpass

####################
# Global Variables
debug=False
vernum='1.0.1'
verbose=False
log=False
auth=False

time_responses = []

# This regex is for identifying links in an HTTP response
#linkregex = re.compile('[^>](?:href=|src=|content=\"http)[\'*|\"*](.*?)[\'|\"]',re.IGNORECASE)
linkregex = re.compile('[^>](?:href\=|src\=|content\=\"http)[\'*|\"*](.*?)[\'|\"].*?>',re.IGNORECASE)
linkredirect = re.compile('(?:open\\(\"|url=|URL=|location=\'|src=\"|href=\")(.*?)[\'|\"]')
linksrobots = re.compile('(?:Allow\:|Disallow\:|sitemap\:).*',re.IGNORECASE)
information_disclosure = re.compile('(?:<address>)(.*)[<]',re.IGNORECASE)

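# Illustrative examples of what linkregex captures (hypothetical inputs, not from
# the original source):
#   '<a href="/about.html">About</a>'  -> captures '/about.html'
#   '<img src="logo.png"/>'            -> captures 'logo.png'
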

# HTTP Response Codes
# -------------------
error_codes={}
error_codes['0']='Keyboard Interrupt exception'
error_codes['1']='Skipping url'
error_codes['-2']='Name or service not known'
error_codes['22']='22 Unknown error'
error_codes['104']='104 Connection reset by peer'
error_codes['110']='110 Connection timed out'
error_codes['111']='111 Connection refused'
error_codes['200']='200 OK'
error_codes['300']='300 Multiple Choices'
error_codes['301']='301 Moved Permanently'
error_codes['302']='302 Moved'
error_codes['305']='305 Use Proxy'
error_codes['307']='307 Temporary Redirect'
error_codes['400']='400 Bad Request'
error_codes['401']='401 Unauthorized'
error_codes['403']='403 Forbidden'
error_codes['404']='404 Not Found'
error_codes['405']='405 Method Not Allowed'
error_codes['407']='407 Proxy Authentication Required'
error_codes['408']='408 Request Timeout'
error_codes['500']='500 Internal Server Error'
error_codes['503']='503 Service Unavailable'
error_codes['504']='504 Gateway Timeout'
error_codes['505']='505 HTTP Version Not Supported'
error_codes['9999']='Server responded with an HTTP status code that we do not understand'


# End of global variables
###########################


# Print version information and exit
def version():
    """
    This function prints the version of this program. It takes no arguments.
    """
    print "+----------------------------------------------------------------------+"
    print "| "+ sys.argv[0] + " Version "+ vernum +"                                      |"
    print "| This program is free software; you can redistribute it and/or modify |"
    print "| it under the terms of the GNU General Public License as published by |"
    print "| the Free Software Foundation; either version 2 of the License, or    |"
    print "| (at your option) any later version.                                  |"
    print "|                                                                      |"
    print "| Author: Veronica Valeros, vero.valeros@gmail.com                     |"
    print "+----------------------------------------------------------------------+"
    print

# Print help information and exit:
def usage():
    """
    This function prints the possible options of this program.

    No parameters are needed.
    """
    print "+----------------------------------------------------------------------+"
    print "| "+ sys.argv[0] + " Version "+ vernum +"                                      |"
    print "| This program is free software; you can redistribute it and/or modify |"
    print "| it under the terms of the GNU General Public License as published by |"
    print "| the Free Software Foundation; either version 2 of the License, or    |"
    print "| (at your option) any later version.                                  |"
    print "|                                                                      |"
    print "| Author: Veronica Valeros, vero.valeros@gmail.com                     |"
    print "+----------------------------------------------------------------------+"
    print
    print "\nUsage: %s <options>" % sys.argv[0]
    print "Options:"
    print "  -h, --help                           Show this help message and exit"
    print "  -V, --version                        Output version information and exit"
    print "  -v, --verbose                        Be verbose"
    print "  -D, --debug                          Debug"
    print "  -u, --url                            URL to start crawling"
    print "  -w, --write                          Save crawl output to a local file"
    print "  -L, --common-log-format              Generate a log of the requests in CLF (Common Log Format)"
    print "  -e, --export-file-list               Create a file with the URLs of all files found during crawling. You can use wget to download the entire list"
    print "  -l, --crawl-limit                    Maximum number of links to crawl"
    print "  -C, --crawl-depth                    Limit the crawling depth to the value specified. Ex.: -C 2"
    print "  -d, --download-file                  Specify the file types to download: png,pdf,jpeg,gif,css,x-javascript,x-shockwave-flash"
    print "  -i, --interactive-download           Before downloading files, allow the user to specify manually the types of files to download"
    print "  -U, --usuario                        User name for authentication"
    print "  -P, --password                       Request a password for authentication"
    print
    print "Example: python crawler.py -u http://www.example.com -w -C 10 -i "
    print
    sys.exit(1)

def printout(input_text,output_file):

    """
    Two main functionalities are covered by this function:
    1. Prints a text to stdout.
    2. Writes the text to the given file.

    Does not return any value.
    """

    global debug
    global verbose

    try:
        print input_text
        if output_file:
            try:
                output_file.write(input_text+'\n')
            except:
                print '[!] Not saving data in output'

    except Exception as inst:
        print '[!] Exception in printout() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

def check_url(url):

    """
    This function verifies that the given 'url' is well formed, meaning that it defines both a protocol and a domain.
    The urlparse.urlparse() function is used.

    The return value is 'True' or 'False'.
    """

    global debug
    global verbose

    try:
        url_parsed = urlparse.urlparse(url)
        if url_parsed.scheme and url_parsed.netloc:
            return True
        else:
            return False

    except Exception as inst:
        print '[!] Exception in check_url() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

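# Illustrative examples (hypothetical inputs, not from the original source):
#   check_url('http://www.example.com')  -> True   (scheme and netloc are present)
#   check_url('www.example.com')         -> False  (urlparse finds no scheme or netloc)
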
def encode_url(url):

    """
    This function encodes the URL according to percent (URL) encoding.
    Currently it only replaces a 'space' with '%20'.

    Returns a URL.
    """

    global debug
    global verbose

    url_encoded = ""
    try:
        url_encoded = url.replace(" ","%20")
        #url_encoded = url_encoded.replace("&amp;","&")

        return url_encoded

    except Exception as inst:
        print '[!] Exception in encode_url() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

def log_line(request, response_code, response_size,log_file):

    """
    This function writes an output line for a given HTTP request in CLF (Common Log Format).

    Does not return any value.
    """

    global debug
    global verbose

    try:
        try:
            if response_size == -1:
                content_size = '-'
            else:
                content_size = str(response_size)
            local_hostname = socket.gethostname()
            local_user = os.getenv('USER')
            timestamp = time.strftime('%e/%b/%Y:%X %z').strip()
            method = request.get_method()
            protocol = 'HTTP/1.1'   # This is the version of the protocol that urllib2 uses
            user_agent = request.get_header('User-agent')
            url = request.get_full_url()

            # COMMON LOG FORMAT
            log_file.write(local_hostname+' '+'-'+' '+local_user+' '+'['+timestamp+']'+' '+'"'+method+' '+url+' '+protocol+'"'+' '+str(response_code)+' '+content_size+' "-" "'+user_agent+'"\n')

            # URLSNARF FORMAT
            #log_file.write(local_hostname+' '+'- - '+'['+timestamp+']'+' '+'"'+method+' '+url+' '+protocol+'"'+' - - "-" "'+user_agent+'"\n')
        except:
            print 'Not logging the following request: {0}'.format(request.get_full_url())

    except Exception as inst:
        print '[!] Exception in log_line() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly

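# Sketch of a CLF line as written by log_line(), with hypothetical values:
#   myhost - alice [19/Jul/2016:10:32:01 +0200] "GET http://www.example.com/ HTTP/1.1" 200 5120 "-" "Mozilla/4.0 (compatible;MSIE 5.5; Windows NT 5.0)"
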
def get_url(url, host, username, password, download_files_flag):

    """
    This function makes an HTTP request for the given URL using the urllib2 python library.

    Returns two values: [request,response]
    """

    global debug
    global verbose
    global auth

    # Vector to save the response time of each request. For now it is a global variable.
    global time_responses


    starttime=0
    endtime=0
    handler=""

    try:
        try:
            starttime= time.time()

            url = encode_url(url)
            if debug:
                print 'Encoded URL: '+url
            request = urllib2.Request(url)
            request.add_header('User-Agent','Mozilla/4.0 (compatible;MSIE 5.5; Windows NT 5.0)')

            if auth:
                password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
                password_manager.add_password(None, host, username, password)

                handler = urllib2.HTTPBasicAuthHandler(password_manager)

            if not download_files_flag:
                # First we do a HEAD request to see the type of url we are going to crawl
                request.get_method = lambda : 'HEAD'

                if handler:
                    opener_web = urllib2.build_opener(handler)
                else:
                    opener_web = urllib2.build_opener()

                response = opener_web.open(request)

                # If it is a file (not HTML), we don't fetch the content
                if 'text/html' not in response.headers.typeheader:
                    opener_web.close()

                    endtime= time.time()
                    time_responses.append(endtime-starttime)

                    return [request,response]

            request.get_method = lambda : 'GET'
            if handler:
                opener_web = urllib2.build_opener(handler)
            else:
                opener_web = urllib2.build_opener()

            response = opener_web.open(request)

            opener_web.close()

            endtime= time.time()
            time_responses.append(endtime-starttime)

            return [request,response]


        except urllib2.HTTPError,error_code:
            return [request,error_code.getcode()]
        except urllib2.URLError,error_code:
            error = error_code.args[0]
            return [request,error[0]]
        except socket.error,error_code:
            error = error_code.args[0]
            try:
                error = error[0]
            except:
                pass
            return [request,error]

    except KeyboardInterrupt:
        try:
            print '\t[!] Press a key to continue'
            raw_input()
            return ["",1]
        except KeyboardInterrupt:
            return ["",0]
    except Exception as inst:
        print '[!] Exception in get_url() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

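# Note on get_url() return values (added comment, based on the code above): on
# success the second element is the file-like response object; when an HTTPError,
# URLError or socket.error is caught it is an error indicator (usually a numeric
# code) that callers look up in the error_codes dictionary.
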
def get_links(link_host, link_path, content):

    """
    This function uses a regular expression to find links in an HTML source page.
    The regular expression used is defined in the 'linkregex' variable.

    Returns a vector of extracted links.
    """

    global debug
    global verbose
    global linkregex

    try:
        # We obtain the links in the given response
        links = linkregex.findall(content)

        # We analyze each link
        for link in links:
            try:
                link_clean = link.strip(' ')
            except:
                print 'error'
                continue
            parsed_link = urlparse.urlparse(link_clean)
            if not parsed_link.scheme and not parsed_link.netloc:
                if link_clean.startswith('/'):
                    if link_host.endswith('/'):
                        links[links.index(link)] = link_host.rstrip('/')+link_clean
                    else:
                        links[links.index(link)] = link_host+link_clean
                elif link_clean.startswith('./'):
                    links[links.index(link)] = link_host+link_clean
                else:
                    links[links.index(link)] = link_path+link_clean
            else:
                links[links.index(link)] = link_clean

        for link in links:
            links[links.index(link)] = link.split('#')[0]

        return links

    except Exception as inst:
        print '[!] Exception in get_links() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

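# Illustrative example of the relative-link resolution above (hypothetical values):
#   with link_host = 'http://example.com' and link_path = 'http://example.com/dir/'
#   '/about.html'  -> 'http://example.com/about.html'
#   'img/logo.png' -> 'http://example.com/dir/img/logo.png'
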
def crawl(url,usuario,password,output_filename,crawl_limit=0,log=False,log_filename='none',crawl_depth=0):

    """
    Crawls a given url using a breadth-first exploration.

    The function returns the following values: [links_crawled, urls_not_crawled, links_to_files]
    """

    global debug
    global verbose
    global error_codes

    # Vector that stores the remaining URLs to crawl
    urls_to_crawl = []
    urls_not_crawled = []
    links_crawled = []
    links_extracted = []
    files=[]
    crawl_limit_flag=False

    urls_to_crawl.append(url)

    if (crawl_limit>0):
        crawl_limit_flag=True
    if crawl_depth > 0:
        # The depth is compared against the number of '/' in the URL; the '+3'
        # appears to account for the two slashes of 'http://' plus the first '/' of the path.
        crawl_depth = crawl_depth + 3
    try:
        printout('[+] Site to crawl: '+url,output_filename)
        printout('[+] Start time: '+str(datetime.datetime.today()),output_filename)
        if output_filename:
            printout('[+] Output file: '+output_filename.name,output_filename)
        if log:
            printout('[+] Common log format output: '+log_filename.name,output_filename)

        printout('',output_filename)
        printout('[+] Crawling',output_filename)

        while urls_to_crawl:
            if crawl_limit_flag:
                if (len(links_crawled) >= crawl_limit):
                    break
            try:
                # We extract the next url to crawl
                url = urls_to_crawl[0]
                urls_to_crawl.remove(url)

                # Here we limit the crawl depth
                if crawl_depth > 0:
                    if url.endswith('/'):
                        if url.rpartition('/')[0].count('/') >= crawl_depth:
                            continue
                    elif url.count('/') >= crawl_depth:
                        continue

                # We add the url to the links crawled
                links_crawled.append(url)

                # We print the URL that is being crawled
                printout('   [-] '+str(url),output_filename)

                # We extract the host of the crawled URL
                parsed_url = urlparse.urlparse(url)
                host = parsed_url.scheme + '://' + parsed_url.netloc

                if parsed_url.path.endswith('/'):
                    link_path = host + parsed_url.path
                else:
                    link_path = host + parsed_url.path.rpartition('/')[0] + '/'

                # We obtain the response of the URL
                [request,response] = get_url(url,host,usuario, password,False)

                # If there is a response
                if response:
                    # If the server didn't return an HTTP Error
                    if not isinstance(response, int):
                        content = response.read()

                        if log:
                            log_line(request,response.getcode(),len(content),log_filename)

                        # We print the file type of the crawled page
                        if response.headers.typeheader:
                            # If it isn't an HTML file
                            if 'text/html' not in response.headers.typeheader:
                                if url not in files:
                                    files.append([url,str(response.headers.typeheader.split('/')[1].split(';')[0])])
                                if verbose:
                                    printout('\t[-] ('+str(response.getcode())+') '+str(response.headers.typeheader),output_filename)
                            else:
                                #if verbose:
                                #   printout('\t[-] ('+str(response.getcode())+') '+str(response.headers.typeheader),output_filename)

                                links_extracted = get_links(host, link_path, content)
                                links_extracted.sort()

                                # We add new links to the list of urls to crawl
                                for link in links_extracted:
                                    if debug:
                                        print '\t   [i] {0}'.format(link)
                                    parsed_link= urlparse.urlparse(link)
                                    link_host = parsed_link.scheme + '://' + parsed_link.netloc

                                    # We only crawl URLs of the same host
                                    if link_host == host:
                                        if link not in links_crawled and link not in urls_to_crawl:
                                            urls_to_crawl.append(link)
                                    elif link not in urls_not_crawled:
                                        urls_not_crawled.append(link)
                    else:
                        # We print the error code if necessary
                        printout('\t[i] '+error_codes[str(response)],output_filename)
                        if log:
                            log_line(request,response,-1,log_filename)
                else:
                    if response==1:
                        continue
                    if response==0:
                        print '[!] Skipping the rest of the urls'
                        break

            except KeyboardInterrupt:
                try:
                    print '[!] Press a key to continue'
                    raw_input()
                    continue
                except KeyboardInterrupt:
                    print '[!] Exiting'
                    break

            except Exception as inst:
                print '[!] Exception inside crawl() function. The while statement raised the exception.'
                print type(inst)     # the exception instance
                print inst.args      # arguments stored in .args
                print inst           # __str__ allows args to be printed directly
                print 'Response: {0}'.format(response)
                break

        printout('[+] Total urls crawled: '+str(len(links_crawled)),output_filename)
        printout('',output_filename)

        return [links_crawled,urls_not_crawled,files]

    except KeyboardInterrupt:
        try:
            print '[!] Press a key to continue'
            raw_input()
            return 1
        except KeyboardInterrupt:
            print '[!] Keyboard interruption. Exiting'
            return 1

    except Exception as inst:
        print '[!] Exception in crawl() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

def external_links(root_url,external_vector,output_filename):

    """
    This function detects external links in a list of given URLs. Links not matching the root URL are considered external.

    Does not return any values.
    """

    global debug
    global verbose

    external_websites = []

    try:
        parsed_url = urlparse.urlparse(root_url)
        link_host = parsed_url.scheme + '://' + parsed_url.netloc
        domain = parsed_url.netloc.split('www.')[-1]

        printout('',output_filename)
        printout('[+] Related subdomains found: ',output_filename)
        tmp=[]
        for link in external_vector:
            parsed = urlparse.urlparse(link)
            if domain in parsed.netloc:
                subdomain = parsed.scheme+'://'+parsed.netloc
                if subdomain not in tmp:
                    tmp.append(subdomain)
                    printout('   [-] '+subdomain,output_filename)
        printout('[+] Total:  '+str(len(tmp)),output_filename)

        printout('',output_filename)
        printout('[+] Email addresses found: ',output_filename)
        for link in external_vector:
            if 'mailto' in urlparse.urlparse(link).scheme:
                printout('   [-] '+link.split(':')[1].split('?')[0],output_filename)

        printout('',output_filename)
        printout('[+] This website has references to the following websites: ',output_filename)
        for link in external_vector:
            parsed = urlparse.urlparse(link)
            if parsed.netloc:
                if domain not in parsed.netloc:
                    external_domain = parsed.scheme+'://'+parsed.netloc
                    if external_domain not in external_websites:
                        external_websites.append(external_domain)
        external_websites.sort()
        for link in external_websites:
            printout('   [-] '+link,output_filename)
        printout('[+] Total:  '+str(len(external_websites)),output_filename)

    except Exception as inst:
        print '[!] Exception in external_links() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1


def indexing_search(usuario, password,links_vector,output_filename):

    """
    This function identifies directories and searches for directory indexing in them, from a given list of URLs.

    This function returns the following values: [directories_found, directories_with_indexing]
    """

    global debug
    global verbose
    global error_codes

    directories=[]
    indexing=[]
    request=""
    response=""

    title_start_position = -1
    title_end_position = -1
    title=""

    try:

        # Identifying directories
        for i in links_vector:
            while ( len(i.split('/')) > 4 ):
                i=i.rpartition('/')[0]
                if ( ( i+'/' )  not in directories ):
                    directories.append(i+'/')

        # We sort the directories vector for proper visualization of the data
        directories.sort()

        printout('[+] Directories found:',output_filename)
        for directory in directories:
            printout('   [-] '+directory,output_filename)
        printout('[+] Total directories: '+str(len(directories)),output_filename)
        printout('',output_filename)

        printout('[+] Directories with indexing',output_filename)
        dots='.'
        for directory in directories:
            sys.stdout.flush()
            sys.stdout.write('\r\x1b'+dots)
            if len(dots)>30:
                dots='.'
            dots=dots+'.'
            try:
                # We extract the host of the crawled URL
                parsed_url = urlparse.urlparse(directory)
                host = parsed_url.scheme + '://' + parsed_url.netloc

                # We obtain the response of the URL
                [request,response] = get_url(directory, host, usuario, password,False)

                # If there is a response
                if response:
                    # If the server didn't return an HTTP Error
                    if not isinstance(response, int):
                        content = response.read()

                        title_start_position = content.find('<title>')
                        if title_start_position != -1:
                            title_end_position = content.find('</title>', title_start_position+7)
                        if title_end_position != -1:
                            title = content[title_start_position+7:title_end_position]

                        if title:
                            if title.find('Index of') != -1:
                                printout('\n   [!] '+directory,output_filename)
                                indexing.append(directory)
                            elif verbose:
                                printout('   [-] '+directory,output_filename)

                    else:
                        if debug:
                            # We print the error code if necessary
                            printout('   [-] '+directory+' ('+error_codes[str(response)]+')',output_filename)
                else:
                    if response==1:
                        continue
                    if response==0:
                        print '[!] Skipping the rest of the directories'
                        break

            except KeyboardInterrupt:
                try:
                    print '[!] Press a key to continue'
                    raw_input()
                    pass
                except KeyboardInterrupt:
                    print '[!] Exiting'
                    break

        printout('\n[+] Total directories with indexing: '+str(len(indexing)),output_filename)
        printout('',output_filename)

        return [directories,indexing]

    except Exception as inst:
        print '[!] Exception in indexing_search() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return 1

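# Illustrative example of the directory extraction in indexing_search()
# (hypothetical URL): 'http://example.com/a/b/c.html' yields the candidate
# directories 'http://example.com/a/b/' and 'http://example.com/a/'.
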
def report_files(export_file_list,files_vector,output_filename):

    """
    This function exports to an output file the list of URLs of the files found during crawling.
    """

    global debug
    global verbose

    try:
        if len(files_vector)>0:
            printout('[+] Files found:',output_filename)
            if export_file_list:
                try:
                    local_file = open(output_name.rpartition('.')[0]+'.files','w')
                    printout('[+] Exporting list of files found to: '+output_name.rpartition('.')[0]+'.files',output_filename)
                except OSError,error:
                    if 'File exists' in error:
                        printout('[+] Exporting list of files found to: '+output_name.rpartition('.')[0]+'.files',output_filename)
                        pass
                    else:
                        print '[+] Error creating the output file to export the list of files.'
                        export_file_list=False

            # We print the files found during the crawling
            for [i,j] in files_vector:
                printout('   [-] '+str(i)+'  ('+str(j)+')',output_filename)
                if export_file_list:
                    local_file.write(i+'\n')
            printout('[+] Total files: '+str(len(files_vector)),output_filename)

        if export_file_list:
            local_file.close()

    except Exception as inst:
        print '[!] Exception in report_files() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return 1

def download_files(extensions_to_download,files_vector,usuario,password,interactive_flag,output_filename):

    """
    This function downloads the files whose extensions match the ones given in extensions_to_download.
    If interactive_flag is set to True, the user can manually select the files to download, choosing from the file
    extensions found during crawling.

    This function returns the list of extensions found in the files discovered during crawling.
    """

    global debug
    global verbose

    list_of_files_to_download=[]
    extensions_found=[]

    try:
        if len(files_vector)>0:
            # Looking for the types of files found during crawling
            for [i,j] in files_vector:
                if j not in extensions_found:
                    extensions_found.append( j )

            # If the interactive mode is enabled, we ask the user which files to download
            if interactive_flag:
                print
                print '[+] Starting to download files'
                print '[+] The following file types were found during crawling:'
                print '   ',
                print extensions_found
                print '    Select which types of files you want to download. Ex.: png,pdf,css.'
                extensions_to_download= raw_input('    ')

            # Looking for files matching the download criteria
            for [i,j] in files_vector:
                if (j in extensions_to_download):
                    list_of_files_to_download.append(i)

            # If there is at least one file matching the download criteria, we create an output directory and download them
            if ( len(list_of_files_to_download) > 0 ):
                # Fetching found files
                printout('',output_filename)
                printout('[+] Downloading specified files: '+extensions_to_download,output_filename)
                printout('[+] Total files to download: '+str(len(list_of_files_to_download)),output_filename)

                # Creating the output directory for downloaded files
                try:
                    output_directory = output_name.rpartition('.')[0]+'_files'
                    os.mkdir(output_directory)
                    printout('[+] Output directory: '+output_directory,output_filename)
                except OSError, error:
                    if 'File exists' in error:
                        print '\n[!] Directory already exists. Press a key to overwrite or CTRL+C to cancel the download'
                        try:
                            raw_input()
                            printout('[+] Output directory: '+output_directory,output_filename)
                        except KeyboardInterrupt:
                            printout('\n[+] Download of files aborted',output_filename)
                            return 1
                    else:
                        printout('\n[!] Download of files aborted. Error while creating the output directory.',output_filename)


                # Downloading files
                for i in list_of_files_to_download:
                    printout('   [-] '+i,output_filename)

                    # We extract the host of the crawled URL
                    parsed_url = urlparse.urlparse(i)
                    host = parsed_url.scheme + '://' + parsed_url.netloc

                    [request,response] = get_url(i.replace(' ','%20'), host, usuario, password, True)


                    if response:
                        if not isinstance(response, int):
                            response = response.read()
                            try:
                                local_file=open(output_directory+'/'+i.rpartition('/')[2],'w')
                            except OSError, error:
                                if 'File exists' in error:
                                    pass
                                else:
                                    printout('   [-] Impossible to create the output file for: '+output_directory+'/'+i.rpartition('/')[2],output_filename)

                            if local_file:
                                local_file.write(response)
                                local_file.close()

            printout('[+] Download complete',output_filename)
            printout('',output_filename)

            return extensions_found

    except Exception as inst:
        print '[!] Exception in download_files() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1

####################
# STATISTICS FUNCTION
####################
def statistics(global_time, directories, indexing, links_crawled, files, extensions_found, output_filename):
    global debug
    global verbose
    global time_responses

    queries_time = 0
    avg_time_per_query = 0
    amt_files_per_extension = {}

    try:
        print

        if len(links_crawled) > 1:
            # Calculating the average time per query
            for i in time_responses:
                queries_time = queries_time + i
            try:
                avg_time_per_query = (queries_time / len(time_responses))
            except:
                avg_time_per_query = 0

            # Calculating the incidence of each file type
            for [link,extension] in files:
                amt_files_per_extension[extension] = 0
            for [link,extension] in files:
                amt_files_per_extension[extension] += 1

            print '___________'
            print
            print 'Summary'
            print '___________'
            print
            if output_filename:
                print '[+] Output file stored at: {0}'.format(os.path.realpath(output_name))
                print
            print '[+] Total elapsed time: {0} seconds ({1} min)'.format(round(global_time,2),round((global_time/60),2))
            print '[+] AVG time per query: {0} seconds'.format(round(avg_time_per_query,2))
            print
            print '[+] Total links crawled\t{0}'.format(str(len(links_crawled)-len(files)))
            print '[+] Total directories\t{0}'.format(str(len(directories)))
            print '   [-] Indexing\t{0}'.format(str(len(indexing)))
            print '[+] Total found files\t{0}'.format(str(len(files)))
            for key in amt_files_per_extension.keys():
                print '       | '+key+'\t'+str(amt_files_per_extension[key])


    except Exception as inst:
        print '[!] Exception in statistics() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        return -1


##########
# MAIN
##########
def main():

    global debug
    global verbose
    global log
    global auth
    global output
    global output_name

    url_to_crawl = ""
    usuario = "crawler123"
    password = "crawler123"
    crawl_limit = 0
    extensions_to_download = ""
    download_files_flag=False
    export_file_list = False
    interactive_flag=False

    starttime=0
    endtime=0

    # Data lists
    directories = []
    indexing = []
    links_crawled = []
    externals_url_vector = []
    files_vector = []
    extensions_found = []
    crawl_depth = 0
    save_output=False
    output_name = ""
    output_file = ""
    log_name = ""
    log_file = ""

    try:
        # Parsing command line options
        opts, args = getopt.getopt(sys.argv[1:], "hVDwu:vLU:Pl:d:eiC:", ["help","version","debug","write","url=","verbose","common-log-format","usuario=","password","crawl-limit=","download-file=","export-file-list","interactive-download","crawl-depth="])

    except getopt.GetoptError: usage()

    for opt, arg in opts:
        if opt in ("-h", "--help"): usage()
        if opt in ("-V", "--version"): version();exit(1)
        if opt in ("-D", "--debug"): debug=True
        if opt in ("-w", "--write"): save_output=True
        if opt in ("-u", "--url"): url_to_crawl = arg
        if opt in ("-v", "--verbose"): verbose = True
        if opt in ("-L", "--common-log-format"): log = True
        if opt in ("-U", "--usuario"): usuario = arg
        if opt in ("-P", "--password"): password = getpass.getpass() ; auth = True
        if opt in ("-l", "--crawl-limit"): crawl_limit = int(arg)
        if opt in ("-d", "--download-file"): extensions_to_download = arg ; download_files_flag=True
        if opt in ("-i", "--interactive-download"): interactive_flag=True
        if opt in ("-e", "--export-file-list"): export_file_list = True
        if opt in ("-C", "--crawl-depth"): crawl_depth = arg
    try:

        if debug:
            print '[+] Debugging mode enabled'

        if check_url(url_to_crawl):

            date = str(datetime.datetime.today()).rpartition('.')[0].replace('-','').replace(' ','_').replace(':','')
            if save_output:
                output_name = urlparse.urlparse(url_to_crawl).netloc+'.crawler'
                try:
                    output_file = open(output_name,'w')
                except OSError, error:
                    if 'File exists' in error:
                        pass
                    else:
                        output_name = ""
            else:
                output_name = ""

            if log:
                log_name = date +'_'+ urlparse.urlparse(url_to_crawl).netloc + '.log'
                try:
                    log_file = open(log_name,'w')
                except OSError, error:
                    if 'File exists' in error:
                        pass
                    else:
                        log=False

            starttime=time.time()

            # Crawl function
            [links_crawled,externals_url_vector, files_vector] = crawl(url_to_crawl, usuario, password, output_file, crawl_limit, log,log_file,int(crawl_depth))

            # Indexing search
            [directories, indexing] = indexing_search(usuario, password,links_crawled,output_file)

            # Printing found files and exporting them to an output file
            report_files(export_file_list,files_vector,output_file)

            # Searching for external links
            external_links(url_to_crawl,externals_url_vector,output_file)

            # Download files
            if download_files_flag or interactive_flag:
                extensions_found = download_files(extensions_to_download,files_vector,usuario,password,interactive_flag,output_file)

            printout('',output_file)
            printout('[+] End time: '+str(datetime.datetime.today()),output_file)

            endtime=time.time()
            # Printing statistics
            statistics(endtime-starttime,directories,indexing,links_crawled,files_vector,extensions_found,output_name)

            try:
                output_file.close()
            except:
                pass
            try:
                log_file.close()
            except:
                pass

        else:
            print
            print '[!] Check the URL provided, it should be like: http://www.example.com or http://asdf.com'
            print
            usage()

    except KeyboardInterrupt:
        # CTRL-C pretty handling
        print 'Keyboard interruption. Exiting.'
        sys.exit(1)
    except Exception as inst:
        print '[!] Exception in main() function'
        print type(inst)     # the exception instance
        print inst.args      # arguments stored in .args
        print inst           # __str__ allows args to be printed directly
        sys.exit(1)


if __name__ == '__main__':
    main()
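
# Example invocation (taken from the built-in help text):
#   python crawler.py -u http://www.example.com -w -C 10 -i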