Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from CrawlVerbraucherwelt.items import VerbraucherweltProdukt
- import re
- import datetime
class ProductSpyder(scrapy.Spider):
    """Crawl verbraucherwelt.de category pages and emit one item per product row.

    Flow: ``parse`` follows the links in each start page's category list,
    ``parse_product_page`` builds a ``VerbraucherweltProdukt`` for every
    product row, and a follow-up request to the product's outbound link
    records the link's status in ``link_fehlerhaft`` before the item is
    emitted.
    """

    name = "product"
    start_urls = [
        'https://www.verbraucherwelt.de/computer-elektronik/',
        'https://www.verbraucherwelt.de/garten-baumarkt/',
        'https://www.verbraucherwelt.de/haushalt-familie/',
        'https://www.verbraucherwelt.de/schoenheit-gesundheit/',
        'https://www.verbraucherwelt.de/sport-freizeit/',
    ]

    def parse(self, response):
        """Schedule a product-page request for every link in the ``ul.cats`` list."""
        for href in response.xpath('//ul[@class="cats"]//a/@href').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product_page)

    def parse_product_page(self, response):
        """Build one item per ``t-row`` and schedule a check of its outbound link.

        Yields a ``scrapy.Request`` per product (the item travels in
        ``meta['item']`` and is emitted by ``test_link`` / ``_link_failed``),
        or the item itself when the row has no link to check.
        """
        products = response.xpath(
            '//div[@class="content"]//div[@class="tov-rows"]//div[@class="t-row"]')
        category = response.xpath('//h1/text()').extract_first()
        # One timestamp per page; it does not depend on the individual row.
        last_updated = datetime.datetime.now().strftime('%d/%m/%Y')

        for x, product in enumerate(products):
            item = VerbraucherweltProdukt()
            # The leading './/' keeps the query relative to this row. A bare
            # '//' searches the whole document regardless of the context
            # node, which made every row return the first product's data.
            item['name'] = product.xpath(
                './/div[@class="t-center"]//div[@class="t-name"]/text()').extract_first()
            item['nr'] = x
            item['kategorie'] = category
            item['auszeichnung'] = product.xpath(
                'preceding-sibling::div[1][starts-with(@class,"c-line")]/text()').extract_first()
            item['last_updated'] = last_updated

            link = product.xpath('.//div[@class="t-right"]//a/@href').extract_first()
            if link is None:
                # Nothing to verify; still emit the item instead of letting
                # re.sub() raise TypeError and abort the rest of the page.
                self.logger.warning(
                    'No outbound link for product %d on %s', x, response.url)
                item['link_fehlerhaft'] = None
                yield item
                continue

            # Drop the '/ref=...' suffix, then resolve against the page URL
            # so a relative href does not make scrapy.Request raise.
            non_ref_link = response.urljoin(re.sub(r'\/ref=.*', '', link))
            yield scrapy.Request(
                non_ref_link,
                callback=self.test_link,
                errback=self._link_failed,
                meta={'item': item},
                # The item rides on this request: if the duplicate filter
                # dropped a repeated URL, the item would be lost with it.
                dont_filter=True,
            )

    def test_link(self, response):
        """Store the HTTP status of the product link on the item and emit it."""
        item = response.meta['item']
        item['link_fehlerhaft'] = response.status
        yield item

    def _link_failed(self, failure):
        """Errback for the link check: emit the item even when the request failed.

        Scrapy passes errbacks a ``Failure`` rather than a response. When the
        wrapped exception carries a response (e.g. a non-2xx status rejected
        as an HTTP error) its status code is stored; otherwise the exception
        class name is stored, since no HTTP status exists for failures such
        as DNS errors or timeouts.
        """
        item = failure.request.meta['item']
        error_response = getattr(failure.value, 'response', None)
        if error_response is not None:
            item['link_fehlerhaft'] = error_response.status
        else:
            item['link_fehlerhaft'] = type(failure.value).__name__
        yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement