#!/usr/bin/env python
"""Extract list of URLs in a web page

This program is part of "Dive Into Python", a free Python book for
experienced programmers.  Visit http://diveintopython.org/ for the
latest version.
"""

__author__ = "Mark Pilgrim (mark@diveintopython.org)"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2004/05/05 21:57:19 $"
__copyright__ = "Copyright (c) 2001 Mark Pilgrim"
__license__ = "Python"

from sgmllib import SGMLParser
from urlparse import urlparse
import sys, urllib, getopt, socket

try:
    import psyco
    psyco.full()
except ImportError:
    print "Psyco JIT compiler (http://psyco.sf.net) not installed, performance may not be optimal"

# defaults
resolve_opt = 0
file_opt = 0
offline_opt = 0
live_opt = 0
seen_hosts = []

def showBanner():
    print "Extract URLs from a web page"
    print "Usage: list-url.py [-ahr] [-l url] [-o file] [-f filename]"
    print "\t-l Retrieve and parse a live page, e.g. www.fark.com"
    print "\t-o Read and parse a saved page, e.g. ./index.html"
    print "\t-r Resolve hostnames to IP addresses"
    print "\t-f Append output to a file, e.g. filename.txt"
    print "\t-a Print the about banner"
    print "\t-h Show this menu"

def about():
    print "\n##########################################################"
    print "# List URLS 3.0                                          #"
    print "# Extract URLS from a web page                           #"
    print "# Original code   muts [AT] remote-exploit.com           #"
    print "# Feature update  TheX1le [AT] gmail.com                 #"
    print "#                                                        #"
    print "##########################################################\n"

def parse(url):
    # resolve each hostname only once; there is probably a faster way to do this
    lookup = urlparse(url)
    if lookup.hostname and lookup.hostname not in seen_hosts:
        seen_hosts.append(lookup.hostname)
        try:
            resolve(lookup.hostname, url)
        except socket.error:
            pass

def savefile(output):
    FILE = open(filename, "a")
    FILE.writelines(output + '\n')
    FILE.close()

def resolve(hostname, url):
    # look up every address for the host and join them into one field
    result = socket.getaddrinfo(hostname, None, 0, socket.SOCK_STREAM)
    ip = [x[4][0] for x in result]
    jip = ' '.join(ip)
    result = hostname, jip, url
    if file_opt == 1:
        savefile(','.join(result))
    else:
        print ','.join(result)

def live_operation(uri):
    usock = urllib.urlopen(uri)
    parser = URLLister()
    parser.feed(usock.read())
    parser.close()
    usock.close()
    return parser

def url_prep(item):
    # a bare hostname such as "www.fark.com" parses into .path, so the
    # URL is rebuilt with an explicit http:// scheme either way
    parts = urlparse(item)
    item = 'http://' + parts.netloc + parts.path
    if parts.params:
        item += ';' + parts.params
    if parts.query:
        item += '?' + parts.query
    if parts.fragment:
        item += '#' + parts.fragment
    return item

def offline_operation():
    index = open(files, "r")
    parser = URLLister()
    parser.feed(index.read())
    parser.close()
    index.close()
    return parser

def press(parser):
    # de-duplicate first, preserving order; removing entries from the
    # list while iterating over it would skip URLs
    unique_urls = []
    for url in parser.urls:
        if url not in unique_urls:
            unique_urls.append(url)
    for url in unique_urls:
        if resolve_opt == 1:
            parse(url)
        elif file_opt == 1:
            savefile(url)
        else:
            print url

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

if __name__ == "__main__":
    if len(sys.argv) <= 1:
        about()
        showBanner()
        sys.exit(1)
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'l:o:rf:ah')
    except getopt.GetoptError, e:
        print e
        showBanner()
        sys.exit(1)
    for o, a in opts:
        if o == '-r':
            resolve_opt = 1
        elif o == '-f':
            filename = a
            file_opt = 1
        elif o == '-o':
            files = a
            offline_opt = 1
        elif o == '-l':
            link = a
            live_opt = 1
        elif o == '-a':
            about()
            sys.exit(0)
        elif o == '-h':
            about()
            showBanner()
            sys.exit(0)
    if live_opt == 1 and offline_opt == 1:
        print "You cannot use offline and online mode at the same time"
        sys.exit(1)
    elif live_opt == 0 and offline_opt == 0:
        print "You must select either offline or online mode to use this tool"
        showBanner()
        sys.exit(1)
    elif live_opt == 1:
        press(live_operation(url_prep(link)))
    elif offline_opt == 1:
        press(offline_operation())