Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import scrapy
- #import time
class QuotesSpider(scrapy.Spider):
    """Crawl shop.hbsfc.co.il category pages and yield one item per product.

    Flow:
      start page -> parse (collect /category links)
                 -> parse_attr (per category: scrape, or follow pagination)
                 -> parse_single (per pagination page: scrape)

    Each scraped item is a dict: {'item_id': <str|None>, 'price': <str|None>}.
    """

    name = "products"
    BASE_URL = 'https://shop.hbsfc.co.il/'

    def start_requests(self):
        # Single entry point: the shop's front page.
        yield scrapy.Request(url=self.BASE_URL, callback=self.parse)

    def parse(self, response):
        # Collect unique category links from the front page; set() drops
        # duplicate anchors pointing at the same category.
        links = set(
            response.xpath('//a[starts-with(@href, "/category")]/@href').extract()
        )
        self.logger.info("Num of links: %d", len(links))
        for link in links:
            # response.urljoin avoids the double slash that BASE_URL + link
            # produced (BASE_URL ends with '/', link starts with '/').
            yield scrapy.Request(response.urljoin(link), callback=self.parse_attr)

    def _extract_products(self, response):
        # Shared by parse_single and parse_attr: yield one dict per
        # product anchor on the page.
        for product in response.xpath('//a[@class="ee_product_click"]'):
            yield {
                'item_id': product.xpath('@ee_list_itemid').get(),
                'price': product.xpath('@ee_list_itemprice').get(),
            }

    def parse_single(self, response):
        # A pagination sub-page: scrape only when it has no further
        # pagination links of its own (mirrors the original guard).
        pages = response.xpath('//a[contains(@class, "page-link")]/@href').extract()
        if not pages:
            for item in self._extract_products(response):
                yield item

    def parse_attr(self, response):
        # A category page: scrape directly when there is no pagination,
        # otherwise follow every pagination link.
        pages = response.xpath('//a[contains(@class, "page-link")]/@href').extract()
        self.logger.debug("pagination links: %s", pages)
        if not pages:
            for item in self._extract_products(response):
                yield item
            return
        for page in pages:
            # urljoin handles both absolute and relative hrefs; the original
            # passed the raw href, which raises ValueError for relative URLs.
            yield scrapy.Request(response.urljoin(page), callback=self.parse_single)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement