Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib

import scrapy
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class HouzzCrawlerPipeline:
    """Pass-through item pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return *item* untouched.

        Args:
            item: The scraped item being processed.
            spider: The spider that produced the item (unused).

        Returns:
            The very same ``item`` object that was passed in.
        """
        return item
class HouzzImagePipeline(ImagesPipeline):
    """Image pipeline for an item's own images and its similar ideas.

    Downloads every URL in ``item["images_urls"]`` plus the first URL in
    the ``"images_urls"`` list of each entry in ``item["similarIdeas"]``.
    Files are stored under a three-level directory derived from the MD5
    hex digest of the image URL: ``abc/def/012/<digest>.jpg``.  The
    directory part is written back as ``"path"`` on the item or on the
    similar-idea entry that the image belongs to.
    """

    @staticmethod
    def _hashed_location(url):
        """Return ``(directory, filename)`` derived from the MD5 of *url*."""
        digest = hashlib.md5(url.encode()).hexdigest()
        directory = f"{digest[:3]}/{digest[3:6]}/{digest[6:9]}"
        return directory, f"{directory}/{digest}.jpg"

    @staticmethod
    def _record_directory(item, url, directory):
        """Store *directory* as ``"path"`` on whichever entries own *url*."""
        for idea in item.get("similarIdeas") or ():
            idea_urls = idea.get("images_urls")
            # Only the first URL of an idea is ever requested, so only
            # that one can identify the idea.
            if idea_urls and idea_urls[0] == url:
                idea["path"] = directory
        if url in (item.get("images_urls") or ()):
            # With several own images the item keeps the directory of
            # whichever one was handled last.
            item["path"] = directory

    def get_media_requests(self, item, info):
        """Yield one download request per image to fetch for *item*.

        Args:
            item: Mapping-like item; ``"images_urls"`` and
                ``"similarIdeas"`` are read if present.
            info: Pipeline bookkeeping object (unused here).

        Yields:
            ``scrapy.Request`` objects, own images first, then the first
            image of each similar idea.  Similar ideas without any image
            URL are skipped rather than raising.
        """
        for image_url in item.get("images_urls") or ():
            yield scrapy.Request(image_url)
        for idea in item.get("similarIdeas") or ():
            idea_urls = idea.get("images_urls")
            if idea_urls:
                yield scrapy.Request(idea_urls[0])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for the image at ``request.url``.

        As a side effect, when *item* is given, the directory part of the
        path is recorded as ``"path"`` on the item and/or the similar-idea
        entry whose image URL matches the request.  The write depends only
        on the URL, so repeating the call leaves the same values in place.

        Args:
            request: The download request; only ``request.url`` is used.
            response: Unused.
            info: Unused.
            item: The item the request was generated for, or ``None``.

        Returns:
            ``"<d1>/<d2>/<d3>/<md5 hex digest>.jpg"`` as a string.
        """
        directory, filename = self._hashed_location(request.url)
        if item is not None:
            self._record_directory(item, request.url, directory)
        return filename

    def item_completed(self, results, item, info):
        """Drop items for which no image download succeeded.

        Args:
            results: Iterable of ``(ok, detail)`` pairs, one per request.
            item: The item whose downloads have finished.
            info: Pipeline bookkeeping object (unused here).

        Returns:
            *item*, when at least one pair has a truthy ``ok``.

        Raises:
            DropItem: If no pair in *results* reports success.
        """
        if not any(ok for ok, _ in results):
            raise DropItem("Item contains no images")
        return item
Advertisement
Add Comment
Please, Sign In to add comment