Advertisement
yordan_filipov

Collecting articles

Dec 9th, 2022 (edited)
941
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.08 KB | None | 0 0
  1. import scrapy
  2.  
  3.  
  4. class ArticleItem(scrapy.Item):
  5.     article_title = scrapy.Field()
  6.     article_content = scrapy.Field()
  7.  
  8.  
  9. class ArticleSpider(scrapy.Spider):
  10.     name = "articles"
  11.     start_urls = [
  12.         "https://news.bg/world",
  13.         "https://news.bg/economics"
  14.     ]
  15.  
  16.     def parse(self, response):
  17.         links = response.css('ul.secondary-articles li div.topic a.title::attr(href)').extract()
  18.         for link in links:
  19.             yield scrapy.Request(link, callback=self.parse_attr)
  20.  
  21.         next_page = response.css("ul.pagination li:nth-child(3) a::attr(href)").get()
  22.         if next_page is not None:
  23.             yield response.follow(next_page, self.parse)
  24.  
  25.     def parse_attr(self, response):
  26.         item = ArticleItem()
  27.         item["article_title"] = "".join(response.xpath("//h1[@itemprop='headline']//text()").extract()).replace(":", " ")
  28.         item["article_content"] = "".join(response.css("div.article-text p ::text").extract()).strip()
  29.         f = open(f"{item['article_title']}.txt", "w")
  30.         f.write(item["article_content"])
  31.         return item
  32.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement