Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- import pythonwhois
- import re
class QuotesSpider(scrapy.Spider):
    """Crawl outward from Wikipedia and report external (non-wiki) domains
    whose WHOIS lookup fails.

    For every page visited, ``parse`` yields:
      * one item ``{'url': ...}`` recording the page URL, and
      * one item ``{'domains': ..., 'exception': ...}`` for each https link
        whose host is not a "wiki" link and whose ``pythonwhois.get_whois``
        call raises (the exception class name is recorded).
    It then re-queues every ``<a href>`` on the page back into ``parse``.
    """

    name = "quotes"
    start_urls = [
        'https://www.wikipedia.org/',
    ]

    def parse(self, response):
        """Process one downloaded page.

        Parameters
        ----------
        response : scrapy.http.Response
            The downloaded page to scan for links.

        Yields
        ------
        dict
            ``{'url': str}`` for the page itself, and
            ``{'domains': str, 'exception': str}`` per failed WHOIS lookup.
        scrapy.Request
            One follow-up request per anchor href on the page.
        """
        # Record the page that was actually fetched (after redirects).
        yield {
            'url': response.request.url,
        }

        # '//a[...]' is document-global, so query the response directly
        # instead of looping over response.css('body') first.
        links = response.xpath('//a[contains(@href, "https")]/@href').getall()
        for link in links:
            # Skip wiki-internal links; only probe external domains.
            if "wiki" in link:
                continue
            # Strip the scheme and any path, keeping the bare hostname.
            domain = re.sub('https://', '', link).split('/')[0]
            try:
                # Result is discarded on purpose: only failures are reported.
                pythonwhois.get_whois(domain)
            except Exception as exception:
                yield {'domains': domain,
                       'exception': type(exception).__name__,
                       }

        # Evaluate the selector once (the original re-ran it every iteration,
        # making this loop quadratic), then follow every href on the page.
        for next_page in response.css('body a::attr(href)').extract():
            if next_page is not None:
                # NOTE(review): dont_filter=True disables scrapy's dupe
                # filter on a self-recursive callback, so the crawl never
                # terminates — confirm this is intentional.
                yield scrapy.Request(response.urljoin(next_page),
                                     callback=self.parse,
                                     dont_filter=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement