Advertisement
Guest User

List-urls.py

a guest
Mar 3rd, 2010
2,815
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.69 KB | None | 0 0
  1. #!/usr/bin/python
  2. """Extract list of URLs in a web page
  3.  
  4. This program is part of "Dive Into Python", a free Python book for
  5. experienced programmers. Visit http://diveintopython.org/ for the
  6. latest version.
  7. """
  8.  
  9. __author__ = "Mark Pilgrim (mark@diveintopython.org)"
  10. __version__ = "$Revision: 1.2 $"
  11. __date__ = "$Date: 2004/05/05 21:57:19 $"
  12. __copyright__ = "Copyright (c) 2001 Mark Pilgrim"
  13. __license__ = "Python"
  14.  
  15. from sgmllib import SGMLParser
  16. import sys
  17.  
  18. if len(sys.argv) != 2:
  19. print "\n\n+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  20. print "Extract links form webpage - v.0.1 "
  21. print "+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  22. print "\nUsage : ./list-urls.py <web-page> "
  23. print "Eg: ./list-urls.py http://www.whoppix.net "
  24. print "\n+++++++++++++++++++++++++++++++++++++++++++++++++++++"
  25. sys.exit(1)
  26.  
  27.  
  28.  
  29. class URLLister(SGMLParser):
  30. def reset(self):
  31. SGMLParser.reset(self)
  32. self.urls = []
  33.  
  34. def start_a(self, attrs):
  35. href = [v for k, v in attrs if k=='href']
  36. if href:
  37. self.urls.extend(href)
  38.  
  39. if __name__ == "__main__":
  40.  
  41. import urllib
  42. print "\n##########################################################"
  43. print "# #"
  44. print "# Extract URLS from a web page #"
  45. print "# muts@whitehat.co.il #"
  46. print "# #"
  47. print "##########################################################\n"
  48. link = sys.argv[1]
  49. try:
  50. usock = urllib.urlopen(link)
  51. parser = URLLister()
  52. parser.feed(usock.read())
  53. parser.close()
  54. usock.close()
  55. for url in parser.urls: print url
  56. except:
  57. print "Could not reach "+ sys.argv[1]+ " !"
  58. print "Did you remember to put an http:// before the domain name?"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement