Advertisement
Guest User

Untitled

a guest
Feb 7th, 2020
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.44 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. # Define your item pipelines here
  4. #
  5. # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  6. # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  7. import string
  8. import scrapy
  9. import urllib.parse
  10. import re
  11. from scrapy.pipelines.images import ImagesPipeline
  12. import ntpath
  13. import os
  14. from PIL import Image, ImageDraw, ImageFont
  15. from scrapy.utils.project import get_project_settings
  16. import sqlite3
  17. import json
  18. from pathlib import Path
  19. import unicodedata
  20.  
  21. valid_filename_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  22. char_limit = 255
  23.  
  24.  
  25. class ShopsCrawlPipeline(object):
  26.     def process_item(self, item, spider):
  27.         return item
  28.  
  29.  
  30. class ImgPipeline(ImagesPipeline):
  31.     CONVERTED_ORIGINAL = re.compile('^full/[0-9,a-f]+.jpg$')
  32.     counter = 0
  33.  
  34.     # name information coming from the spider, in each item
  35.     # add this information to Requests() for individual images downloads
  36.     # through "meta" dict
  37.     def get_media_requests(self, item, info):
  38.         # print("get_media_requests")
  39.         if 'image_urls' in item:
  40.             return [scrapy.Request(x, meta={'item': item, 'idx': idx}, dont_filter=True) for idx, x in
  41.                     enumerate(item.get('image_urls', []))]
  42.  
  43.     def file_path(self, request, response=None, info=None):
  44.         filename = ntpath.basename(request.url)
  45.         filename = urllib.parse.unquote(filename)
  46.         _, file_extension = os.path.splitext(filename)
  47.         sku = self.clean_filename(filename=request.meta['item']['sku'])
  48.         filename = '{}_{}{}'.format(sku, request.meta['idx'], '.jpg')
  49.         # filename = '{}_{}{}'.format(request.meta['item']['sku'], request.meta['idx'], file_extension)
  50.         # if response:
  51.         #     if response.headers.get(b'Content-Type') == b'image/jpeg':
  52.         #         filename = '{}_{}{}'.format(request.meta['item']['sku'], request.meta['idx'], '.jpg')
  53.         #     elif response.headers.get(b'Content-Type') == b'image/png':
  54.         #         filename = '{}_{}{}'.format(request.meta['item']['sku'], request.meta['idx'], '.png')
  55.         #     else:
  56.         #         s = ''
  57.  
  58.         images_store = self.store.basedir + sku + '/' + filename
  59.         isdir = os.path.isdir(self.store.basedir + '/' + sku + '/')
  60.         if isdir:
  61.             s = ''
  62.  
  63.         Path(self.store.basedir + '/' + sku + '/').mkdir(parents=True, exist_ok=True)
  64.         if os.path.isfile(images_store):
  65.             os.remove(images_store)
  66.         return sku + '/' + filename
  67.  
  68.     def decorate_url(self, urla):
  69.         return '["{}"]'.format(urla)
  70.  
  71.     def item_completed(self, results, item, info):
  72.         image_paths = [x['path'] for ok, x in results if ok]
  73.         item['images'] = image_paths
  74.         return item
  75.  
  76.     @staticmethod
  77.     def clean_filename(filename, whitelist=valid_filename_chars, replace=' '):
  78.         # replace spaces
  79.         for r in replace:
  80.             filename = filename.replace(r, '_')
  81.  
  82.         # keep only valid ascii chars
  83.         cleaned_filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode()
  84.  
  85.         # keep only whitelisted chars
  86.         cleaned_filename = ''.join(c for c in cleaned_filename if c in whitelist)
  87.         if len(cleaned_filename) > char_limit:
  88.             print("Warning, filename truncated because it was over {}. Filenames may no longer be unique".format(
  89.                 char_limit))
  90.         return cleaned_filename[:char_limit]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement