Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- actual action script mobile.py:
- import urlparse
- from PIL import Image
- from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
- from scrapy.spider import BaseSpider
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy.contrib.loader import XPathItemLoader
- from scrapy.selector import HtmlXPathSelector
- from scrapy.http.request import Request
- from scrapy.contrib.pipeline.images import ImagesPipeline
- from mobile.items import Website
class MobileSpider(CrawlSpider):
    """Crawl mobile-store.ro and yield one ``Website`` item per content block.

    Link following is driven by ``rules``; matching responses are handed to
    ``parse_item``.  The callback is deliberately NOT called ``parse``:
    ``CrawlSpider`` implements its rule handling in its own ``parse`` method,
    so overriding it disables the rules entirely.
    """

    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]

    # NOTE(review): the original patterns were r"/produs/d+" and
    # r"/produse/d+", which only match a literal run of the letter "d".
    # The raw-string prefix suggests "\d+" (digits) was intended -- confirm
    # against the site's real URL scheme.
    rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract product data from ``response``.

        Yields a follow-up ``Request`` for the first link found in the
        ``ul.products`` list (if any), then one ``Website`` item for every
        ``#wrapper > #content`` block on the page.
        """
        hxs = HtmlXPathSelector(response)

        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if next_page:
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones, which Request would otherwise reject (missing scheme).
            yield Request(urlparse.urljoin(response.url, next_page[0]),
                          callback=self.parse_item)

        sites = hxs.select('//div[@id="wrapper"]/div[@id="content"]')
        for site in sites:
            # NOTE(review): every XPath below starts with "//", so it is
            # evaluated against the whole document rather than relative to
            # `site`.  Kept as-is; switch to ".//" if per-block scoping is
            # wanted.
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()

            # extract() returns a list of src values; each one must be joined
            # with the page URL individually (urljoin expects a string).
            image_relative_urls = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, src)
                                  for src in image_relative_urls]

            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item
- settings.py:
# Scrapy project settings for the "mobile" project.
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']

# The images pipeline stays disabled unless a storage location is configured,
# even when it is listed in ITEM_PIPELINES.
# NOTE(review): 'images' is a placeholder directory (relative to the working
# directory the crawl is started from) -- point it at the real target folder.
IMAGES_STORE = 'images'
- items.py:
- from scrapy.item import Item, Field
class Website(Item):
    """Item holding the data scraped for one product."""

    # Text fields filled by the spider.
    nume = Field()
    descriere = Field()
    categorie = Field()
    brand = Field()
    pret = Field()

    # URL of the page the item was scraped from.
    url = Field()

    # Image handling: `image_urls` is filled by the spider and read by the
    # images pipeline; `image_paths` is set by MyImagesPipeline.item_completed.
    # `images` is presumably populated by Scrapy's stock ImagesPipeline --
    # TODO confirm.
    image_urls = Field()
    images = Field()
    image_paths = Field()
- pipelines.py:
- from mobile.contrib.pipeline.images import ImagesPipeline
- from scrapy.exceptions import DropItem
- from scrapy.http import Request
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records stored paths and drops image-less items."""

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` per URL in ``item['image_urls']``."""
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        ``results`` is iterated as ``(ok, data)`` pairs; only pairs whose
        ``ok`` flag is truthy contribute their ``data['path']``.

        Raises:
            DropItem: if no download succeeded.
        """
        stored_paths = []
        for succeeded, outcome in results:
            if succeeded:
                stored_paths.append(outcome['path'])

        if not stored_paths:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item
Advertisement
Add Comment
Please, Sign In to add comment