import re

import pythonwhois
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://www.wikipedia.org/',
    ]

    def parse(self, response):
        for quote in response.css('body'):
            # Note: '//a' is an absolute XPath, so it matches anchors across
            # the whole document; with a single <body> this loop runs once.
            links = quote.xpath('//a[contains(@href, "https")]/@href').getall()
            yield {
                'url': response.request.url,
                # 'h1': quote.css('h1::text').getall(),
            }

        # For each external (non-wiki) https link, strip the scheme and path
        # to get the bare domain, then record any domain whose WHOIS lookup
        # raises an exception.
        for link in links:
            if 'wiki' not in link:
                domain = re.sub('https://', '', link)
                domain = domain.split('/')[0]
                try:
                    pythonwhois.get_whois(domain)
                except Exception as exception:
                    yield {
                        'domains': domain,
                        'exception': type(exception).__name__,
                    }

        # Follow every link on the page. dont_filter=True disables Scrapy's
        # duplicate-request filter, so already-visited pages are re-crawled
        # and the crawl will not terminate on its own.
        for next_page in response.css('body a::attr(href)').extract():
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)
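If the snippet is saved as a standalone file, it can be run with Scrapy's runspider command (e.g. scrapy runspider quotes_spider.py). Below is a minimal sketch of running it programmatically instead; the output filename 'domains.json' and the page-count cap are illustrative choices, not part of the original paste, and the cap is added because dont_filter=True means the crawl would otherwise never stop.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Write all yielded items to a JSON feed (filename is a placeholder).
    'FEEDS': {'domains.json': {'format': 'json'}},
    # The spider re-visits pages (dont_filter=True), so cap the crawl size.
    'CLOSESPIDER_PAGECOUNT': 50,
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes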