Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class mysearchspider(Spider):
    """Crawl one category listing and follow each product link it contains.

    ``parse`` handles the category page and schedules one request per
    product; ``productpage`` handles each product page and emits the item
    that ``parse`` started.
    """

    name = "mysearchspider"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/category/234627",
    ]
    # Product URLs this spider has already requested. Declared on the class,
    # so it is shared by every instance of the spider.
    visitedURLs = set()

    def parse(self, response):
        """Yield a product-page request for every not-yet-seen product link.

        Each request carries a partially filled ``ItemScrapyCategory1``
        (only ``url`` set) in ``request.meta['item']`` so that
        ``productpage`` can finish populating it.
        """
        products = Selector(response).xpath('//*[@class="itemCell"]')
        for product in products:
            links = product.xpath('div[2]/div/a/@href').extract()
            if not links:
                # A cell without a link would otherwise raise IndexError and
                # abort processing of the remaining products on the page.
                continue
            url = links[0]
            if url in self.visitedURLs:
                continue
            # Record the URL; without this the membership test above can
            # never be true and duplicates are never filtered.
            self.visitedURLs.add(url)

            item = ItemScrapyCategory1()
            item['url'] = url
            # NOTE(review): Request() is given the href as-is; this assumes
            # the listing uses absolute URLs - confirm against the site.
            request = Request(url, callback=self.productpage)
            request.meta['item'] = item
            yield request

    def productpage(self, response):
        """Complete the item passed in ``response.meta['item']`` and yield it.

        Collects the ``dt``/``dd`` text pairs found under the ``Specs``
        fieldsets into a dict (printed for inspection only) and stores the
        synopsis image URLs on the item under ``image_urls``.
        """
        page = Selector(response)

        itemdict = {}
        for fieldset in page.xpath('//*[@id="Specs"]/fieldset'):
            for entry in fieldset.xpath('dl'):
                labels = entry.xpath('dt/text()').extract()
                values = entry.xpath('dd/text()').extract()
                if not labels or not values:
                    # Skip rows with an empty label or value instead of
                    # failing the whole page with IndexError.
                    continue
                itemdict[labels[0]] = values[0]
        # Parenthesised single-argument form prints identically on Python 2
        # and is valid syntax on Python 3.
        print(itemdict)

        item = response.meta['item']
        item['image_urls'] = page.xpath(
            '//*[@id="synopsis"]/div/div/div/a/span/img/@src'
        ).extract()
        # Hand the finished item back to Scrapy; without this the populated
        # item is discarded and never reaches the item pipelines.
        yield item
Advertisement
Add Comment
Please sign in to add a comment.