Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- import logging
- import re
class ItemdataSpider(scrapy.Spider):
    """Spider that follows product links from a chair listing and emits one record per product.

    ``parse`` walks the listing page(s) named in ``start_urls`` and schedules
    a request for every product link it finds; ``parse_item`` turns each
    product page into a flat dict holding the SKU and a fixed set of
    spec-sheet values.
    """

    name = 'itemdata'
    allowed_domains = ['www.webstaurantstore.com']
    start_urls = ['https://www.webstaurantstore.com/42529/restaurant-chairs.html']

    # (output key, spec-sheet label) pairs, in output order. Labels are
    # compared against whitespace-stripped cell text.
    _SPEC_FIELDS = (
        ('item_Height_Style', 'Height Style'),
        ('item_color', 'Color'),
        ('item_framecolor', 'Frame Color'),
        ('item_framematerial', 'Frame Material'),
        ('item_seatmaterial', 'Seat Material'),
        ('item_style', 'Style'),
        ('item_type', 'Type'),
        ('item_usage', 'Usage'),
    )

    # Placeholder emitted when a spec label is not present on the page.
    _MISSING = 'N/A'

    def parse(self, response):
        """Yield a request per product link, then a request for the next listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            Requests whose callback is ``parse_item`` (one per product link
            that has an ``href``), followed by at most one request for the
            next listing page, handled by ``parse`` again.
        """
        items = response.xpath("//div[@class='details']/a[contains(@class, 'description')]")
        for item in items:
            link = item.xpath(".//@href").get()
            if not link:
                # An anchor without an href cannot be followed; passing None
                # to response.follow would raise instead of skipping it.
                continue
            yield response.follow(url=link, callback=self.parse_item)

        # NOTE(review): the [2] picks the *second* rel='next' anchor in
        # document order; if a page carries only one, pagination stops here.
        # Presumably the pager is rendered twice on the page - confirm.
        next_page = response.xpath("(//a[@rel='next'])[2]/@href").get()
        if next_page:
            yield response.follow(url=next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract the SKU and selected spec-sheet values from a product page.

        Args:
            response: The downloaded product page.

        Yields:
            A single dict with ``item_number`` (``None`` when no SKU span is
            found) plus one key per entry in ``_SPEC_FIELDS``; specs that are
            absent from the page are reported as ``'N/A'``.
        """
        item_number = response.xpath("//span[@itemprop='sku']/text()").get()
        item_specs = response.xpath(
            "//tr[@class='trSpecSheetRow' and not(@hidden)]/td/text()"
        ).getall()

        # Strip surrounding whitespace so labels match regardless of the
        # newlines/indentation in the markup. Whitespace-only nodes are kept
        # (as empty strings) so a value still sits directly after its label.
        cells = [text.strip() for text in item_specs]

        record = {'item_number': item_number}
        for key, label in self._SPEC_FIELDS:
            record[key] = self._spec_value(cells, label, self._MISSING)
        yield record

    @staticmethod
    def _spec_value(cells, label, default):
        """Return the cell that follows ``label`` in ``cells``, or ``default``.

        Args:
            cells: Whitespace-stripped spec-sheet cell texts in document order.
            label: Exact label text to look for.
            default: Value returned when the label is missing or is the last cell.
        """
        try:
            return cells[cells.index(label) + 1]
        except (ValueError, IndexError):
            # ValueError: label not on this page; IndexError: label has no
            # following cell.
            return default
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement