Advertisement
ZeekoSec

Pegasus ~ [ Site crawler ]

Jan 21st, 2016
436
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.20 KB | None | 0 0
  1. import urllib2
  2. import re
  3.  
  4. def get_html(host):
  5.     headers = { 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.698.0 Safari/534.24' }
  6.     req = urllib2.Request('https://' + host, None, headers) # - Change 'https' to ssl check result
  7.     global html
  8.     html = urllib2.urlopen(req).read()
  9.     return html
  10.  
  11. # - Clean up code, remove 'emails' and figure out a way to pass it to output another way
  12.  
  13. def extract_url(host):
  14.     found = []
  15.     extracted = []
  16.     emails = []
  17.     html = get_html(host)
  18.     urls = re.findall(r'href=[\'"]?([^\'" >]+)', html)
  19.     for x in urls:
  20.         if host in x:
  21.             if 'mailto:' in x:
  22.                 emails.append(x[7:])
  23.                 found.append(x[7:])
  24.             else:
  25.                 extracted.append(x)
  26.                 found.append(x)
  27.     return found
  28.  
  29. # - /
  30.  
  31. # - Add ssl identification
  32.  
  33.  
  34. # - Using this as input until this module gets imported into framework
  35. host = raw_input('Host: ')
  36. # - /
  37.  
  38.  
  39.  
  40. # - Fix this to make it visit html of all the urls it get's from first run
  41. for x in extract_url(host):
  42.     print x
  43. # - /
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement