Advertisement
Guest User

Untitled

a guest
Apr 26th, 2019
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.85 KB | None | 0 0
  1. import scrapy
  2. #import time
  3.  
  4.  
  class QuotesSpider(scrapy.Spider):
      """Crawl the shop's category pages and emit one dict per listed product.

      Flow: ``start_requests`` fetches ``BASE_URL``; ``parse`` schedules every
      ``/category...`` link it finds; ``parse_attr`` either scrapes an
      unpaginated category directly or follows each pagination link to
      ``parse_single``, which scrapes that page's products.

      Each emitted item is ``{'item_id': ..., 'price': ...}``, read from the
      ``ee_list_itemid`` / ``ee_list_itemprice`` attributes of the
      ``<a class="ee_product_click">`` anchors.
      """

      name = "products"
      BASE_URL = 'https://shop.hbsfc.co.il/'

      def start_requests(self):
          """Seed the crawl with a single request for the shop's landing page."""
          yield scrapy.Request(url=self.BASE_URL, callback=self.parse)

      def parse(self, response):
          """Schedule a ``parse_attr`` request for every distinct category link."""
          # A set drops hrefs that appear several times on the same page.
          links = set(
              response.xpath('//a[starts-with(@href, "/category")]/@href').getall()
          )
          self.logger.info("Num of links: %d", len(links))
          for link in links:
              # BASE_URL ends with "/" and every matched href starts with "/",
              # so plain concatenation produced ".../​/category..." (double
              # slash). urljoin resolves the href against the response URL.
              absolute_url = response.urljoin(link)
              self.logger.debug("### outer URL: %s", absolute_url)
              yield scrapy.Request(absolute_url, callback=self.parse_attr)

      def _extract_products(self, response):
          """Yield an ``{'item_id', 'price'}`` dict per product anchor on the page."""
          for product in response.xpath('//a[@class="ee_product_click"]'):
              yield {
                  'item_id': product.xpath('@ee_list_itemid').get(),
                  'price': product.xpath('@ee_list_itemprice').get(),
              }

      def parse_single(self, response):
          """Scrape the products of one results page.

          This callback is only reached through a pagination link, so the page
          is scraped unconditionally. The previous version skipped any page
          that itself contained ``page-link`` anchors, which meant paginated
          categories produced no items at all.
          """
          return self._extract_products(response)

      def parse_attr(self, response):
          """Scrape an unpaginated category, or fan out over its page links."""
          pages = response.xpath('//a[contains(@class, "page-link")]/@href').getall()
          self.logger.debug("Pagination links: %s", pages)
          if not pages:
              # No pagination widget: everything is on this page.
              for item in self._extract_products(response):
                  yield item
              return
          # NOTE(review): as before, a paginated category page is not scraped
          # here; this assumes the page links also cover the first page --
          # confirm against the site's markup.
          for page in pages:
              self.logger.debug("Following page: %s", page)
              # response.follow accepts relative hrefs; scrapy.Request raised
              # "Missing scheme" on them.
              yield response.follow(page, callback=self.parse_single)
  50.  
  51.  
  52.  
  53.  
  54. # pages = response.xpath('//a[contains(@class, "page-link")]/@class').extract()
  55. # if (pages is [] or pages is None):
  56. # return
  57. # current_index = [i for i, s in enumerate(pages) if u'current' in s]
  58. # if (current_index is None or current_index is [] or len(current_index) == 0):
  59. # return
  60. # print "PAGING"
  61. # print "########## current_index" + str(current_index)
  62. # current_index = current_index[0]
  63. # if (len(pages) > current_index):
  64. # if(current_index is 0):
  65. # time.sleep(3)
  66. # yield scrapy.Request(response.url + '/page/1', callback=self.parse_attr)
  67. # else:
  68. # time.sleep(4)
  69. # print "Page Number: " + (current_index + 1)
  70. # yield scrapy.Request(response.url + '/page/' + (current_index + 1), callback=self.parse_attr)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement