Guest User

Scrapy jump into page

a guest
Sep 14th, 2015
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.29 KB | None | 0 0
  1. class mysearchspider(Spider):
  2.     name = "mysearchspider"
  3.     allowed_domains = ["example.com"]
  4.     start_urls = [
  5.         "http://www.example.com/category/234627",
  6.     ]
  7.     visitedURLs = Set()
  8.  
  9.     def parse(self, response):
  10.         products = Selector(response).xpath('//*[@class="itemCell"]')
  11.         for product in products:
  12.             item = ItemScrapyCategory1()
  13.             item['url'] = product.xpath('div[2]/div/a/@href').extract()[0]
  14.             urls = Set([product.xpath('div[2]/div/a/@href').extract()[0]])
  15.             for url in urls:
  16.                 if url not in self.visitedURLs:
  17.                     request = Request(url, callback=self.productpage)
  18.                     request.meta['item'] = item
  19.                     yield request
  20.  
  21.     def productpage(self, response):
  22.         specs = Selector(response).xpath('//*[@id="Specs"]/fieldset')
  23.         itemdict = {}
  24.         for i in specs:
  25.             test = i.xpath('dl')
  26.             for t in test:
  27.                 name = t.xpath('dt/text()').extract()[0]
  28.                 itemdict[name] = t.xpath('dd/text()').extract()[0]
  29.         print itemdict
  30.         item = response.meta['item']
  31.         image = Selector(response).xpath('//*[@id="synopsis"]/div/div/div/a/span/img/@src').extract()
  32.         item['image_urls'] = image
Advertisement
Add Comment
Please, Sign In to add comment