Advertisement
Guest User

Untitled

a guest
Sep 14th, 2017
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.11 KB | None | 0 0
import pickle
import sys

import requests as req
from bs4 import BeautifulSoup
  4.  
  5.  
  6.  
  7. class Crawler(object):
  8.     """Краулер. Просто краулер"""
  9.     links = []
  10.     innerlinks = []
  11.     outerlinks = []
  12.     doneLinks = []
  13.  
  14.     def __init__(self, url):
  15.  
  16.         self.url = url
  17.  
  18.     def crawl(self):
  19.         try:
  20.             self.getHTML()
  21.             self.getLinks()
  22.             self.divLinks()
  23.             self.getTitle()
  24.             self.doneLinks.append(self.url)
  25.             max = len(self.links)
  26.             for n in range(0, max):
  27.                 if self.links[n] not in self.doneLinks:
  28.                     newUrl = self.links[n]
  29.                     crawler = Crawler(newUrl)
  30.                     crawler.crawl()
  31.             del(self)
  32.  
  33.         except KeyboardInterrupt:
  34.             sys.exit(0)
  35.  
  36.         except req.exceptions.InvalidSchema:
  37.             del(self)
  38.         except req.exceptions.MissingSchema:
  39.             del(self)
  40.         except TypeError:
  41.             print('TypeError')
  42.             del(self)
  43.  
  44.  
  45.     def getHTML(self):
  46.  
  47.         try:
  48.             self.html = req.get(self.url).text
  49.  
  50.         except req.exceptions.HTTPError as e:
  51.             print ("HTTPError")
  52.             self.html = None
  53.  
  54.     def getLinks(self):
  55.  
  56.         self.bsObj = BeautifulSoup(self.html, "lxml")
  57.         tmp = 0
  58.         for link in self.bsObj.findAll('a'):
  59.             if 'href' in link.attrs:
  60.                 tmp=link.attrs['href']
  61.             if tmp not in self.links:
  62.                 self.links.append(tmp)
  63.  
  64.     def divLinks(self):
  65.         key = (self.url.split('/')[2])
  66.         for link in self.links:
  67.             if key in link:
  68.                 self.innerlinks.append(link)
  69.             else:
  70.                 self.outerlinks.append(link)
  71.  
  72.     def getTitle(self):
  73.         self.bsObj = BeautifulSoup(self.html, "lxml")
  74.         title = self.bsObj.find('title').text
  75.         str = (self.url + " : " + title + "\n")
  76.         print(str)
  77.         with open ("titles", "a", encoding='utf8') as f:
  78.             f.write(str)
  79.  
  80.  
  81.  
  82. c = Crawler("http://ibm.com/index.html")
  83. c.crawl()
  84. print (c.links)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement