Advertisement
Ralichet

Untitled

Apr 22nd, 2019
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.72 KB | None | 0 0
  1. import scrapy
  2. from scrapy.crawler import CrawlerProcess
  3. from scrapy.utils.log import configure_logging
  4.  
  5.  
  6. class BlogSpider(scrapy.Spider):
  7. name = 'blog'
  8. start_urls=['https://www.dinneratthezoo.com/']
  9.  
  10. def parse(self, response):
  11. for article_url in response.css('.entry-title a ::attr("href")').extract():
  12. yield response.follow(article_url, callback=self.parse_article)
  13.  
  14. def parse_article(self, response):
  15. content = response.xpath(".//div[@class='entry-content']/descendant::text()").extract()
  16. yield {'article': ''.join(content)}
  17.  
  18.  
  19. if __name__ == "__main__":
  20. LOG_ENABLED = False
  21. process = CrawlerProcess({
  22. 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
  23. })
  24. process.crawl(BlogSpider)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement