Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- import scrapy
- from scrapy.crawler import CrawlerProcess
- from scrapy.utils.log import configure_logging
class BlogSpider(scrapy.Spider):
    """Crawl dinneratthezoo.com and yield the full text of each article.

    The listing page is parsed for article links; each link is followed
    and its body text is emitted as a single joined string.
    """

    name = 'blog'
    start_urls = ['https://www.dinneratthezoo.com/']

    def parse(self, response):
        # Collect every article link on the listing page and follow it.
        links = response.css('.entry-title a ::attr("href")').extract()
        for link in links:
            yield response.follow(link, callback=self.parse_article)

    def parse_article(self, response):
        # Grab all text nodes under the article body and join them into
        # one string so the item is a flat {'article': <text>} record.
        fragments = response.xpath(".//div[@class='entry-content']/descendant::text()").extract()
        yield {'article': ''.join(fragments)}
if __name__ == "__main__":
    # NOTE: the original assigned a bare module-level `LOG_ENABLED = False`,
    # which is a no-op — Scrapy only honours it as a *setting*, so it is
    # passed inside the CrawlerProcess settings dict instead.
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_ENABLED': False,
    })
    process.crawl(BlogSpider)
    # Bug fix: crawl() only schedules the spider; without start() the
    # reactor never runs and the script exits having scraped nothing.
    process.start()
Advertisement
Add Comment
Please sign in to add a comment
Advertisement