Scrapy Web Crawl Items

May 26th, 2013

mobile.py (the spider):
import urlparse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from mobile.items import Website

class MobileSpider(CrawlSpider):
    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    rules = (
        # \d needs its backslash; r"/produs/d+" would match a literal "d"
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )

    # Named parse_item, as the Rule above expects; overriding parse()
    # on a CrawlSpider would break its rule handling
    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if next_page:
            yield Request(next_page[0], self.parse_item)
        sites = hxs.select('//div[@id="wrapper"]/div[@id="content"]')

        for site in sites:
            item = Website()
            # ".//" keeps each query relative to the current site node
            # instead of searching the whole document again
            item['nume'] = site.select('.//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('.//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('.//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('.//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            image_relative_urls = site.select('.//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            # extract() returns a list, so resolve each src against the page URL
            item['image_urls'] = [urlparse.urljoin(response.url, url) for url in image_relative_urls]
            item['pret'] = site.select('.//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item

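The urljoin call above is what turns the relative img/@src values into absolute URLs the image pipeline can fetch. A minimal sketch of the behaviour (the page and src values are made up):

import urlparse

page = "http://www.mobile-store.ro/produse/123"   # hypothetical product page
src = "/wp-content/uploads/phone.jpg"             # hypothetical relative src
print(urlparse.urljoin(page, src))
# http://www.mobile-store.ro/wp-content/uploads/phone.jpg
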
settings.py:
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'

# Register the subclass from pipelines.py; pointing at the stock
# ImagesPipeline would mean item_completed below is never called
ITEM_PIPELINES = ['mobile.pipelines.MyImagesPipeline']

# The images pipeline stays disabled without a storage directory;
# the path here is only an example
IMAGES_STORE = 'images'

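With the custom pipeline registered and IMAGES_STORE set, the crawl should be startable from the project root with scrapy crawl mobile (adding -o produse.json to also dump the scraped items; the filename is only an example).
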
items.py:
from scrapy.item import Item, Field

class Website(Item):
    nume = Field()
    descriere = Field()
    categorie = Field()
    brand = Field()
    pret = Field()
    url = Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()

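Website is an ordinary Scrapy Item, so the spider reads and writes its fields with dict syntax. A small sketch with invented values:

from mobile.items import Website

item = Website()
item['nume'] = [u'Nokia 3310']
item['pret'] = [u'99,00 lei']
print(item['nume'])   # [u'Nokia 3310']
print(dict(item))     # {'pret': [u'99,00 lei'], 'nume': [u'Nokia 3310']}
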
pipelines.py:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # one download request per URL collected by the spider
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # keep only the storage paths of downloads that succeeded
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
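
item_completed receives one (success, info) tuple per request yielded by get_media_requests; for successful downloads the info dict carries 'url', 'path' and 'checksum' keys. A sketch of the list comprehension against hand-made data (all values invented; real failures carry a Twisted Failure rather than a plain Exception):

results = [
    (True,  {'url': 'http://www.mobile-store.ro/img/1.jpg',
             'path': 'full/0a79c461ee.jpg',
             'checksum': 'b9628c4ab9b595f72f280b90c4fd093d'}),
    (False, Exception('download failed')),
]
image_paths = [x['path'] for ok, x in results if ok]
print(image_paths)   # ['full/0a79c461ee.jpg']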