Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
class DealsSpider(CrawlSpider):
    """Crawl Amazon.ca electronics-deals listing pages and yield deal titles.

    The spider starts from four listing URLs (pages 1 through 4), follows
    every link matched by ``//a[@id="dealTitle"]`` and hands each followed
    page to :meth:`parse_item`.
    """

    name = 'deals'
    # NOTE(review): Scrapy's HttpErrorMiddleware looks for
    # ``handle_httpstatus_all`` in ``Request.meta``; as a spider attribute it
    # appears to have no effect.  The spider-level equivalents are the
    # ``handle_httpstatus_list`` attribute or the ``HTTPERROR_ALLOW_ALL``
    # setting.  Kept as-is so the crawl's current behavior does not change --
    # confirm the intent before switching.
    handle_httpstatus_all = True
    allowed_domains = ['amazon.ca']
    # The page number must appear both in the ``ref`` tag and in the
    # ``gb_f_dealsce`` filter (``page:N``).  The filter previously hard-coded
    # ``page:2``, so every start URL asked for the same deals page.
    start_urls = [
        (
            "https://www.amazon.ca/electronics-deals-electronics-sale-tv-sale/b/"
            "ref=gbps_ftr_m-4_b14c_page_{page}"
            "?node=2055586011&nocache=1623420881512"
            "&gb_f_dealsce=dealTypes:DEAL_OF_THE_DAY%252CBEST_DEAL%252CLIGHTNING_DEAL,"
            "page:{page},sortOrder:BY_SCORE,"
            "enforcedCategories:667823011%252C2404990011%252C2690975011%252C677230011"
            "%252C7337291011%252C3379552011%252C3379595011%252C6205511011"
            "%252C680468011%252C11402068011%252C6916844011%252C677268011"
            "%252C1233055011%252C677273011%252C7204527011%252C677250011"
            "%252C2690953011%252C677252011%252C677212011%252C677226011"
            "%252C3379546011,dealsPerPage:32"
            "&pf_rd_p=cd5fdf1b-afb1-45db-8a82-940414a2b14c"
            "&pf_rd_s=merchandised-search-4&pf_rd_t=101&pf_rd_i=2055586011"
            "&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=2C6RA5DEFBW1A3KQSPEM&ie=UTF8"
        ).format(page=page)
        for page in range(1, 5)
    ]

    rules = (
        # Follow each deal-title anchor; ``follow=True`` means links are also
        # extracted from the pages reached this way.
        Rule(
            LinkExtractor(restrict_xpaths='//a[@id="dealTitle"]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a one-key item holding the followed page's title text.

        Args:
            response: The Scrapy response for a followed deal link.

        Yields:
            dict: ``{'title': <text>}`` where the text is the first match of
            ``.//h1[@id="title"]/span/text()``, or ``None`` when the page has
            no such node.
        """
        yield {
            'title': response.xpath('.//h1[@id="title"]/span/text()').get()
        }
Advertisement
Add Comment
Please, Sign In to add comment