from urllib.parse import urljoin as ujoin  # Python 3 home of the old urlparse.urljoin

import requests
from bs4 import BeautifulSoup as bs


def main():
    base = input('Please Enter Url: ')  # e.g. http://www.example.com
    scraper = HrefScraper(base, base)
    # persistence() returns the shared class-level list, which grows as
    # crawl.persistence() appends to it below, so newly discovered
    # same-domain links are visited too as the loop advances.
    for link in scraper.persistence():
        # Restrict the crawl to xxx.example.com so we don't crawl the web,
        # and skip non-http(s) schemes such as mailto: or javascript:.
        if base[base.find('.') + 1:] in link and link.startswith('http'):
            crawl = HrefScraper(link, base)
            for found in crawl.persistence():
                print(found)


class HrefScraper(object):
    _list = []  # shared across instances: every link seen during the crawl

    def __init__(self, url, base):
        self.base = base  # base is for urljoin(base, href)
        self.url = url

    def requestUrl(self):
        # Request the page and return its raw body.
        getpage = requests.get(self.url)
        return getpage.content

    def hrefParser(self):
        # Begin scraping urls: collect the href of every anchor tag.
        soup = bs(self.requestUrl(), 'html.parser')
        links = []
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if href is not None:  # skip anchors without an href attribute
                links.append(href)
        return links

    def combatDuplicatesAbsolutes(self):
        # Resolve lonely filepaths against the base url and drop duplicates;
        # without this and persistence() the crawl is unpredictable.
        # (Deduplicate on the joined absolute url, not the raw href, so
        # 'page.html' and '/page.html' don't slip through as two entries.)
        cleaned = []
        for href in self.hrefParser():
            absolute = ujoin(self.base, href)
            if absolute not in cleaned:
                cleaned.append(absolute)
        return cleaned

    def persistence(self):
        # Save links in the class-level list during the life cycle of the crawl.
        for url in self.combatDuplicatesAbsolutes():
            if url not in HrefScraper._list:
                HrefScraper._list.append(url)
        return HrefScraper._list


if __name__ == '__main__':
    main()
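A minimal sketch of using HrefScraper programmatically, bypassing the input() prompt; the start URL is a placeholder, and any reachable site would do:

# Hypothetical non-interactive use: scrape one page's links directly.
start = 'http://www.example.com'  # placeholder url, not from the script above
scraper = HrefScraper(start, start)
for link in scraper.persistence():  # unique absolute links from the start page
    print(link)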