Share Pastebin
Guest
Public paste!

List-urls.py

By: a guest | Mar 3rd, 2010 | Syntax: None | Size: 1.69 KB | Hits: 978 | Expires: Never
Copy text to clipboard
  1. #!/usr/bin/python
  2. """Extract list of URLs in a web page
  3.  
  4. This program is part of "Dive Into Python", a free Python book for
  5. experienced programmers.  Visit http://diveintopython.org/ for the
  6. latest version.
  7. """
  8.  
  9. __author__ = "Mark Pilgrim (mark@diveintopython.org)"
  10. __version__ = "$Revision: 1.2 $"
  11. __date__ = "$Date: 2004/05/05 21:57:19 $"
  12. __copyright__ = "Copyright (c) 2001 Mark Pilgrim"
  13. __license__ = "Python"
  14.  
  15. from sgmllib import SGMLParser
  16. import sys
  17.  
  18. if len(sys.argv) != 2:
  19.         print "\n\n+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  20.         print "Extract links form webpage - v.0.1            "
  21.         print "+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  22.         print "\nUsage : ./list-urls.py <web-page>            "
  23.         print "Eg: ./list-urls.py http://www.whoppix.net          "
  24.         print "\n+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  25.         sys.exit(1)
  26.  
  27.  
  28.  
  29. class URLLister(SGMLParser):
  30.         def reset(self):
  31.                 SGMLParser.reset(self)
  32.                 self.urls = []
  33.  
  34.         def start_a(self, attrs):
  35.                 href = [v for k, v in attrs if k=='href']
  36.                 if href:
  37.                         self.urls.extend(href)
  38.  
  39. if __name__ == "__main__":
  40.  
  41.         import urllib
  42.         print "\n##########################################################"
  43.         print "#                                                        #"
  44.         print "#             Extract URLS from a web page               #"
  45.         print "#                muts@whitehat.co.il                     #"
  46.         print "#                                                        #"
  47.         print "##########################################################\n"
  48.         link = sys.argv[1]
  49.         try:
  50.                 usock = urllib.urlopen(link)
  51.                 parser = URLLister()
  52.                 parser.feed(usock.read())
  53.                 parser.close()
  54.                 usock.close()
  55.                 for url in parser.urls: print url
  56.         except:
  57.                 print "Could not reach "+ sys.argv[1]+ " !"
  58.                 print "Did you remember to put an http:// before the domain name?"