Advertisement
Guest User

Untitled

a guest
Jan 17th, 2017
183
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.02 KB | None | 0 0
  1. import scrapy
  2. from CrawlVerbraucherwelt.items import VerbraucherweltProdukt
  3. import re
  4. import datetime
  5.  
  6.  
  7. class ProductSpyder(scrapy.Spider):
  8.     name = "product"
  9.     start_urls = [
  10.         'https://www.verbraucherwelt.de/computer-elektronik/',
  11.         'https://www.verbraucherwelt.de/garten-baumarkt/',
  12.         'https://www.verbraucherwelt.de/haushalt-familie/',
  13.         'https://www.verbraucherwelt.de/schoenheit-gesundheit/',
  14.         'https://www.verbraucherwelt.de/sport-freizeit/',
  15.     ]
  16.  
  17.     def parse(self, response):
  18.         for href in response.xpath('//ul[@class="cats"]//a/@href').extract():
  19.             yield scrapy.Request(response.urljoin(href),
  20.                                  callback=self.parse_product_page)
  21.  
  22.     def parse_product_page(self, response):
  23.         products = response.xpath('//div[@class="content"]//div[@class="tov-rows"]//div[@class="t-row"]')
  24.         category = response.xpath('//h1/text()').extract_first()
  25.         for x, product in enumerate(products):  #ERROR: Just gives an item for the first product
  26.             product_loader = VerbraucherweltProdukt()
  27.             product_loader['name'] = product.xpath(
  28.                 '//div[@class="t-center"]//div[@class="t-name"]/text()').extract_first()
  29.             product_loader['nr'] = x
  30.             product_loader['kategorie'] = category
  31.             link = product.xpath('//div[@class="t-right"]//a/@href').extract_first()
  32.             non_ref_link = re.sub(r'\/ref=.*', '', link)
  33.             product_loader['auszeichnung'] = product.xpath(
  34.                 'preceding-sibling::div[1][starts-with(@class,"c-line")]/text()').extract_first()
  35.             product_loader['last_updated'] = datetime.datetime.now().strftime('%d/%m/%Y')
  36.             request = scrapy.Request(non_ref_link,callback=self.test_link, errback=self.test_link)
  37.             request.meta['item'] = product_loader
  38.             yield request
  39.  
  40.     def test_link(self, response):
  41.         item = response.meta['item']
  42.         item['link_fehlerhaft'] = response.status
  43.         yield item
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement