from urllib.parse import urljoin as ujoin  # Python 3 home of the old urlparse.urljoin

import requests
from bs4 import BeautifulSoup as bs


def main():
    base = input('Please Enter Url: ')  # e.g. http://www.example.com
    scraper = HrefScraper(base, base)
    # persistence() returns the shared class-level list, which grows as
    # crawl.persistence() appends to it below, so newly discovered
    # same-domain links are visited too as the loop advances.
    for link in scraper.persistence():
        # Restrict the crawl to xxx.example.com so we don't crawl the web,
        # and skip non-http(s) schemes such as mailto: or javascript:.
        if base[base.find('.') + 1:] in link and link.startswith('http'):
            crawl = HrefScraper(link, base)
            for found in crawl.persistence():
                print(found)


class HrefScraper(object):
    _list = []  # shared across instances: every link seen during the crawl

    def __init__(self, url, base):
        self.base = base  # base is for urljoin(base, href)
        self.url = url

    def requestUrl(self):
        # Request the page and return its raw body.
        getpage = requests.get(self.url)
        return getpage.content

    def hrefParser(self):
        # Begin scraping urls: collect the href of every anchor tag.
        soup = bs(self.requestUrl(), 'html.parser')
        links = []
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if href is not None:  # skip anchors without an href attribute
                links.append(href)
        return links

    def combatDuplicatesAbsolutes(self):
        # Resolve lonely filepaths against the base url and drop duplicates;
        # without this and persistence() the crawl is unpredictable.
        # (Deduplicate on the joined absolute url, not the raw href, so
        # 'page.html' and '/page.html' don't slip through as two entries.)
        cleaned = []
        for href in self.hrefParser():
            absolute = ujoin(self.base, href)
            if absolute not in cleaned:
                cleaned.append(absolute)
        return cleaned

    def persistence(self):
        # Save links in the class-level list during the life cycle of the crawl.
        for url in self.combatDuplicatesAbsolutes():
            if url not in HrefScraper._list:
                HrefScraper._list.append(url)
        return HrefScraper._list


if __name__ == '__main__':
    main()
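A minimal sketch of using HrefScraper programmatically, bypassing the input() prompt; the start URL is a placeholder, and any reachable site would do:

# Hypothetical non-interactive use: scrape one page's links directly.
start = 'http://www.example.com'  # placeholder url, not from the script above
scraper = HrefScraper(start, start)
for link in scraper.persistence():  # unique absolute links from the start page
    print(link)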