Advertisement
OrphanSec

HrefScraper.py

Mar 30th, 2015
328
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.76 KB | None | 0 0
  1. from bs4 import BeautifulSoup as bs
  2. from urlparse import urljoin as ujoin
  3. import requests
  4.  
  5. def main():
  6.     base = raw_input('Please Enter Url: ')
  7.     url = base # base is for urlparse.urljoin(base,href)
  8.     scraper = HrefScraper(url,base)
  9.     scraper.hrefParser()
  10.     x = scraper.persistence()
  11.     for i in x:
  12.         # if xxx.example.com so we dont crawl the web
  13.         if base[base.find('.')+1 :  ] in i and 'http' in i:
  14.             crawl = HrefScraper(i,base)
  15.             x = crawl.persistence()
  16.             for i in x:
  17.                 print i
  18.                
  19. class HrefScraper(object):
  20.     _list = []
  21.     def __init__(self,url,base):
  22.         self.base = base
  23.         self.url = url
  24.  
  25.     def requestUrl(self):
  26.         # request page
  27.         getpage = requests.get(self.url)
  28.         return getpage.content
  29.    
  30.     def hrefParser(self):
  31.         # beginning scraping urls
  32.         links = []
  33.         soup = bs(self.requestUrl())
  34.         anchors = soup.findAll('a')      
  35.         for items in anchors:
  36.             href = items.get('href',None)
  37.             links.append(href)
  38.         return links
  39.  
  40.     def combatDuplicatesAbsolutes(self):
  41.         # get rid of lonely filepaths and start working on duplicates
  42.         # without this and the next function crawl is unpredictable
  43.         cleaned = []
  44.         for links in self.hrefParser():
  45.             if links not in cleaned:
  46.                 cleaned.append(ujoin(self.base,links))
  47.         return cleaned
  48.  
  49.     def persistence(self):
  50.         # Save links during life cycle of crawl
  51.         for url in self.combatDuplicatesAbsolutes():
  52.             if url not in HrefScraper._list:
  53.                 HrefScraper._list.append(url)
  54.         return HrefScraper._list
  55.  
  56. if __name__=='__main__':
  57.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement