rhat398

image_pipeline

May 13th, 2023 (edited)
918
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.86 KB | None | 0 0
  1. # Define your item pipelines here
  2. #
  3. # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  4. # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  5.  
  6.  
  7. # useful for handling different item types with a single interface
  8. from itemadapter import ItemAdapter
  9. import scrapy
  10. from scrapy.exceptions import DropItem
  11. from scrapy.pipelines.images import ImagesPipeline
  12.  
  13. import hashlib
  14.  
  15.  
  16. class HouzzCrawlerPipeline:
  17.     def process_item(self, item, spider):
  18.         return item
  19.  
  20.  
  21. class HouzzImagePipeline(ImagesPipeline):  # Inherit the ImagePipeline class
  22.     def get_media_requests(self, item, info):
  23.         for image_url in item["images_urls"]:
  24.             yield scrapy.Request(image_url)
  25.         for image_url in item["similarIdeas"]:
  26.             yield scrapy.Request(image_url["images_urls"][0])
  27.  
  28.     def file_path(self, request, response=None, info=None, *, item=None):
  29.         image_url_hash = hashlib.md5(request.url.encode()).hexdigest()
  30.         if item["similarIdeas"]:
  31.             for img in item["similarIdeas"]:
  32.                 img[
  33.                     "path"
  34.                 ] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
  35.                 image_filename = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}/{image_url_hash}.jpg"
  36.                 return image_filename
  37.         if item["images_urls"]:
  38.             item[
  39.                 "path"
  40.             ] = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}"
  41.             image_filename = f"{image_url_hash[:3]}/{image_url_hash[3:6]}/{image_url_hash[6:9]}/{image_url_hash}.jpg"
  42.             return image_filename
  43.  
  44.     def item_completed(self, results, item, info):
  45.         image_paths = [x["path"] for ok, x in results if ok]
  46.         if not image_paths:
  47.             raise DropItem("Item contains no images")
  48.         return item
  49.  
Advertisement
Add Comment
Please, Sign In to add comment