Advertisement
uopspop

Untitled

Dec 16th, 2019
226
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.85 KB | None | 0 0
  1. # my_sls_scraper/spiders/header_spider.py
  2. from scrapy.spiders import CrawlSpider, Rule
  3. from scrapy.linkextractors import LinkExtractor
  4.  
  5.  
  6. class HeaderSpider(CrawlSpider):
  7. name = "header_spider"
  8.  
  9. start_urls = ["https://scrapy.org"]
  10. allowed_domains = ["scrapy.org"]
  11. rules = [ # Get all links on start url
  12. Rule(
  13. link_extractor=LinkExtractor(
  14. deny=r"\?",
  15. ),
  16. follow=False,
  17. callback="parse_page",
  18. )
  19. ]
  20.  
  21. def parse_start_url(self, response):
  22. return self.parse_page(response)
  23.  
  24. def parse_page(self, response):
  25. header = response.css("h1, h2").extract_first(
  26. ) or response.css("title").extract_first() or response.url
  27. return {
  28. "header": remove_tags(header),
  29. "url": response.url,
  30. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement