import sys

import requests as req
from bs4 import BeautifulSoup


class Crawler(object):
    """A crawler. Just a crawler."""

    # Class-level lists: shared by every Crawler instance on purpose, so the
    # recursively spawned crawlers all see the same visited/collected state.
    links = []
    innerlinks = []
    outerlinks = []
    doneLinks = []

    def __init__(self, url):
        self.url = url

    def crawl(self):
        try:
            self.getHTML()
            self.getLinks()
            self.divLinks()
            self.getTitle()
            self.doneLinks.append(self.url)
            # Snapshot the current link count: links discovered deeper in
            # the recursion are crawled by the instances that find them.
            count = len(self.links)
            for n in range(count):
                if self.links[n] not in self.doneLinks:
                    newUrl = self.links[n]
                    crawler = Crawler(newUrl)
                    crawler.crawl()
        except KeyboardInterrupt:
            sys.exit(0)
        except (req.exceptions.InvalidSchema, req.exceptions.MissingSchema):
            pass  # relative or otherwise unfetchable URL; skip it
        except TypeError:
            # getHTML() failed and left self.html as None
            print('TypeError')

    def getHTML(self):
        try:
            self.html = req.get(self.url).text
        except req.exceptions.ConnectionError as e:
            # requests.get() raises ConnectionError on network failures;
            # plain HTTPError only comes from raise_for_status().
            print("ConnectionError:", e)
            self.html = None

    def getLinks(self):
        self.bsObj = BeautifulSoup(self.html, "lxml")
        for link in self.bsObj.find_all('a'):
            if 'href' in link.attrs:
                href = link.attrs['href']
                if href not in self.links:
                    self.links.append(href)

    def divLinks(self):
        # The host part of the seed URL, e.g. "ibm.com"
        key = self.url.split('/')[2]
        for link in self.links:
            if key in link:
                if link not in self.innerlinks:
                    self.innerlinks.append(link)
            elif link not in self.outerlinks:
                self.outerlinks.append(link)

    def getTitle(self):
        self.bsObj = BeautifulSoup(self.html, "lxml")
        title_tag = self.bsObj.find('title')
        title = title_tag.text if title_tag else ""
        line = self.url + " : " + title + "\n"
        print(line)
        with open("titles", "a", encoding='utf8') as f:
            f.write(line)


c = Crawler("http://ibm.com/index.html")
c.crawl()
print(c.links)
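
# A small inspection sketch (my addition, not part of the original paste):
# because the link lists are class attributes shared by every instance,
# the results of the whole recursive crawl can be read off the seed
# crawler once crawl() returns.
print("internal links:", c.innerlinks)
print("external links:", c.outerlinks)
print("visited pages :", c.doneLinks)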