#!/usr/bin/env python
"""Extract list of URLs in a web page

This program is part of "Dive Into Python", a free Python book for
experienced programmers.  Visit http://diveintopython.org/ for the
latest version.
"""

__author__ = "Mark Pilgrim (mark@diveintopython.org)"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2004/05/05 21:57:19 $"
__copyright__ = "Copyright (c) 2001 Mark Pilgrim"
__license__ = "Python"

from sgmllib import SGMLParser
from urlparse import urlparse
import sys, urllib, getopt, socket

try:
    import psyco
    psyco.full()
except ImportError:
    print "Psyco JIT compiler (http://psyco.sf.net) not installed, performance may not be optimal"

# defaults
resolve_opt = 0
file_opt = 0
offline_opt = 0
live_opt = 0
seen_hosts = []

def showBanner():
    print "Extract URLs from a web page"
    print "Usage: list-url.py [-ahr] [-l url] [-o file] [-f filename]"
    print "\t-l Retrieve and parse a live page, e.g. www.fark.com"
    print "\t-o Read and parse a saved page, e.g. ./index.html"
    print "\t-r Resolve hostnames to IP addresses"
    print "\t-f Append output to a file, e.g. filename.txt"
    print "\t-a Print the about banner"
    print "\t-h Show this menu"

def about():
    print "\n##########################################################"
    print "# List URLS 3.0                                          #"
    print "# Extract URLS from a web page                           #"
    print "# Original code   muts [AT] remote-exploit.com           #"
    print "# Feature update  TheX1le [AT] gmail.com                 #"
    print "#                                                        #"
    print "##########################################################\n"

def parse(url):
    # resolve each hostname only once; there is probably a faster way to do this
    lookup = urlparse(url)
    if lookup.hostname and lookup.hostname not in seen_hosts:
        seen_hosts.append(lookup.hostname)
        try:
            resolve(lookup.hostname, url)
        except socket.error:
            pass

def savefile(output):
    FILE = open(filename, "a")
    FILE.writelines(output + '\n')
    FILE.close()

def resolve(hostname, url):
    # look up every address for the host and join them into one field
    result = socket.getaddrinfo(hostname, None, 0, socket.SOCK_STREAM)
    ip = [x[4][0] for x in result]
    jip = ' '.join(ip)
    result = hostname, jip, url
    if file_opt == 1:
        savefile(','.join(result))
    else:
        print ','.join(result)

def live_operation(uri):
    usock = urllib.urlopen(uri)
    parser = URLLister()
    parser.feed(usock.read())
    parser.close()
    usock.close()
    return parser

def url_prep(item):
    # a bare hostname such as "www.fark.com" parses into .path, so the
    # URL is rebuilt with an explicit http:// scheme either way
    parts = urlparse(item)
    item = 'http://' + parts.netloc + parts.path
    if parts.params:
        item += ';' + parts.params
    if parts.query:
        item += '?' + parts.query
    if parts.fragment:
        item += '#' + parts.fragment
    return item

def offline_operation():
    index = open(files, "r")
    parser = URLLister()
    parser.feed(index.read())
    parser.close()
    index.close()
    return parser

def press(parser):
    # de-duplicate first, preserving order; removing entries from the
    # list while iterating over it would skip URLs
    unique_urls = []
    for url in parser.urls:
        if url not in unique_urls:
            unique_urls.append(url)
    for url in unique_urls:
        if resolve_opt == 1:
            parse(url)
        elif file_opt == 1:
            savefile(url)
        else:
            print url

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

if __name__ == "__main__":
    if len(sys.argv) <= 1:
        about()
        showBanner()
        sys.exit(1)
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'l:o:rf:ah')
    except getopt.GetoptError, e:
        print e
        showBanner()
        sys.exit(1)
    for o, a in opts:
        if o == '-r':
            resolve_opt = 1
        elif o == '-f':
            filename = a
            file_opt = 1
        elif o == '-o':
            files = a
            offline_opt = 1
        elif o == '-l':
            link = a
            live_opt = 1
        elif o == '-a':
            about()
            sys.exit(0)
        elif o == '-h':
            about()
            showBanner()
            sys.exit(0)
    if live_opt == 1 and offline_opt == 1:
        print "You cannot use offline and online mode at the same time"
        sys.exit(1)
    elif live_opt == 0 and offline_opt == 0:
        print "You must select either offline or online mode to use this tool"
        showBanner()
        sys.exit(1)
    elif live_opt == 1:
        press(live_operation(url_prep(link)))
    elif offline_opt == 1:
        press(offline_operation())