# Pegasus ~ [ Site crawler ]

import urllib2
import re

def get_html(host):
    """Fetch the root page of *host* over HTTPS and return its raw body.

    The request is sent to ``'https://' + host`` with a desktop-browser
    User-Agent header.  The body is also stored in the module-level global
    ``html`` (kept for backward compatibility with any code that reads it).

    Args:
        host: Host name without a scheme, e.g. ``'example.com'``.

    Returns:
        The response body exactly as returned by ``read()``.

    Raises:
        Whatever ``urllib2.urlopen`` raises for the request; nothing is
        caught here.
    """
    headers = { 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.698.0 Safari/534.24' }
    # - Change 'https' to ssl check result
    req = urllib2.Request('https://' + host, None, headers)
    global html
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Always release the underlying socket, even if read() fails.
        response.close()
    return html

# - Clean up code, remove 'emails' and figure out a way to pass it to output another way

def extract_url(host):
    """Return the href targets on *host*'s root page that mention *host*.

    The page is fetched with get_html() and every ``href=`` value is pulled
    out with a regular expression (no real HTML parsing is done).  Only
    values that contain *host* as a substring are kept, in document order
    and without de-duplication.

    Args:
        host: Host name without a scheme, e.g. ``'example.com'``.

    Returns:
        A list of strings.  ``mailto:`` links are returned as the bare
        address with the scheme stripped; every other link is returned
        unchanged.
    """
    html = get_html(host)
    found = []
    for link in re.findall(r'href=[\'"]?([^\'" >]+)', html):
        if host not in link:
            continue
        # Strip the scheme only when it really is the prefix; a plain
        # substring test would chop 7 characters off any URL that merely
        # contains 'mailto:' somewhere (e.g. in a query string).
        if link.startswith('mailto:'):
            link = link[len('mailto:'):]
        found.append(link)
    return found

# - /

# - Add ssl identification


# - Temporary interactive input; to be replaced once this module is
# - imported into the framework
host = raw_input('Host: ')
# - /


# - TODO: also visit every url returned by the first pass and extract
# - links from those pages too; for now only the first pass is printed,
# - one entry per line
for url in extract_url(host):
    print(url)
# - /