Advertisement
Guest User

Untitled

a guest
Dec 5th, 2019
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.93 KB | None | 0 0
  1. import functools
  2. import glob
  3. import os
  4.  
  5. import fitz
  6. import numpy as np
  7. from fpdf import FPDF
  8. import cv2
  9. import sys
  10.  
  11.  
  12. # This function converts a pdf to a set of images
  13. # that will be temporarily saved in the disk.
  14. def pdf2img(dir="imgs/"):
  15.     pix1 = None
  16.     # we open the file given through command line
  17.     doc = fitz.open(sys.argv[1])
  18.     cpt = 0
  19.     for i in range(len(doc)):
  20.         for img in doc.getPageImageList(i):
  21.             xref = img[0]  # check if this xref was handled already?
  22.             pix = fitz.Pixmap(doc, xref)
  23.             if pix.n < 5:  # this is GRAY or RGB
  24.                 pix.writePNG(dir + str(cpt) + ".png")
  25.             else:  # CMYK needs to be converted to RGB first
  26.                 pix1.writePNG(dir + str(cpt) + ".png")
  27.                 pix1 = None  # release storage early (optional)
  28.             pix = None  # release storage early (optional)
  29.             cpt += 1
  30.             sys.stdout.write(f"\r>>> Converting to images... {i} / {len(doc)} converted slides")
  31.             sys.stdout.flush()
  32.     print()
  33.  
  34.  
  35. # Checks if the first page contains a substantial amount of red.
  36. # If so, we process it differently from the usual white slides.
  37. def fix_if_red_cover_page(img_path):
  38.     # A dumb threshold to define the bare minimum amount of red
  39.     #  pixels to have in order to consider the slide as the red ones.
  40.     threshold_probably_red_bg = 25000
  41.     img = cv2.imread(img_path)
  42.     # BGR, not RGB
  43.     red_pixels = (img[:, :, 0] == 94) & (img[:, :, 1] == 90) & (img[:, :, 2] == 245)
  44.     p = np.count_nonzero(red_pixels)
  45.     red = (94, 90, 245)
  46.     white = (253, 252, 254)
  47.     if p > threshold_probably_red_bg:
  48.         print(">>> Red background detected!")
  49.         # set to white greyish pixels
  50.         b = np.logical_and(135 <= img[:, :, 0], img[:, :, 0] <= 255)
  51.         g = np.logical_and(135 <= img[:, :, 1], img[:, :, 1] <= 255)
  52.         r = np.logical_and(135 <= img[:, :, 2], img[:, :, 2] <= 255)
  53.  
  54.         pixels_to_fix = np.logical_and.reduce((b, g, r))
  55.         img[np.where(pixels_to_fix)] = white
  56.  
  57.         # set to red the redish pixels
  58.         b = np.logical_and(89 <= img[:, :, 0], img[:, :, 0] <= 137)
  59.         g = np.logical_and(89 <= img[:, :, 1], img[:, :, 1] <= 134)
  60.         r = np.logical_and(158 <= img[:, :, 2], img[:, :, 2] <= 248)
  61.  
  62.         pixels_to_fix = np.logical_and.reduce((b, g, r))
  63.         img[np.where(pixels_to_fix)] = red
  64.  
  65.         # set to blue the bottom left corner of the slide to hide the owner's name
  66.         b = np.logical_and(0 <= img[:, :, 0], img[:, :, 0] <= 85)
  67.         g = np.logical_and(0 <= img[:, :, 1], img[:, :, 1] <= 67)
  68.         r = np.logical_and(0 <= img[:, :, 2], img[:, :, 2] <= 51)
  69.         pixels_to_fix = np.logical_and.reduce((b, g, r))
  70.         img[np.where(pixels_to_fix)] = (88, 65, 50)
  71.  
  72.     return p > threshold_probably_red_bg, img
  73.  
  74.  
  75. # Whiten the greyish pixels of an image
  76. def replace_color(path="imgs/", path_is_image=False):
  77.     imagelist = [path]
  78.     if not path_is_image:
  79.         imagelist = []
  80.         imagelist.extend(glob.glob(os.path.join(path, '*.png')))
  81.  
  82.     i = 0
  83.     start = 0
  84.     has_been_fixed, img = fix_if_red_cover_page(imagelist[0])
  85.     if has_been_fixed:
  86.         start = 1
  87.         cv2.imwrite("out/" + os.path.basename(imagelist[0]), img)
  88.  
  89.     for imagePath in imagelist[start:]:
  90.         img_hsv = cv2.imread(imagePath)
  91.         # whiten the slide
  92.         rgb = np.logical_and(
  93.                 np.logical_and(
  94.                     np.logical_and(154 <= img_hsv[:, :, 0], img_hsv[:, :, 0] <= 255),
  95.                     np.logical_and(154 <= img_hsv[:, :, 1], img_hsv[:, :, 1] <= 255)),
  96.                 np.logical_and(154 <= img_hsv[:, :, 2], img_hsv[:, :, 2] <= 255)
  97.         )
  98.         indices = np.where(rgb)
  99.         img_hsv[indices] = (255, 255, 255)
  100.  
  101.         # bye bye owner's name: whiten the bottom left area of the image (affects only the dark pixels)
  102.         bottom_left_area = img_hsv[860:890, :700, :]
  103.         b = np.logical_and(0 <= bottom_left_area[:, :, 0], bottom_left_area[:, :, 0] <= 153)
  104.         g = np.logical_and(0 <= bottom_left_area[:, :, 1], bottom_left_area[:, :, 1] <= 153)
  105.         r = np.logical_and(0 <= bottom_left_area[:, :, 2], bottom_left_area[:, :, 2] <= 153)
  106.         pixels_to_fix = np.logical_and.reduce((b, g, r))
  107.         bottom_left_area[np.where(pixels_to_fix)] = (255, 255, 255)
  108.         img_hsv[860:890, :700, :] = bottom_left_area
  109.  
  110.         cv2.imwrite("out/" + os.path.basename(imagePath), img_hsv)
  111.         i += 1
  112.         sys.stdout.write(f"\r>>> Fixing the slides... {i} / {len(imagelist)} processed")
  113.         sys.stdout.flush()
  114.     print()
  115.  
  116.  
  117. # Used further down to compare file names in order to
  118. # sort the pdf's images in a correct order.
  119. def cmp(x, y):
  120.     a = int(os.path.basename(x).split('.')[0])
  121.     b = int(os.path.basename(y).split('.')[0])
  122.     return a - b
  123.  
  124.  
  125. # Converts all processed images to a PDF
  126. def imgs2pdf(imgs="out/"):
  127.     image_directory = imgs
  128.     imagelist = []
  129.     imagelist.extend(glob.glob(os.path.join(image_directory, '*.png')))
  130.     imagelist.sort(key=functools.cmp_to_key(cmp))
  131.     pdf = FPDF(unit="pt", format=[1600, 900])
  132.     i = 0
  133.     for imagePath in imagelist:
  134.         pdf.add_page()
  135.         pdf.image(imagePath, 0, 0)
  136.         i += 1
  137.         sys.stdout.write(f"\r>>> Generating the PDF... slides {i} / {len(imagelist)} merged")
  138.         sys.stdout.flush()
  139.     print()
  140.     pdf.output(sys.argv[1] + "__processed.pdf", "F")
  141.     print(">>> PDF generated !")
  142.     print(">>> Path to the PDF:", sys.argv[1] + "__processed.pdf")
  143.  
  144.  
  145. # Cleans the working directory after the process
  146. def clean(imgs="imgs/", out="out/"):
  147.     files = glob.glob(f"{imgs}/*.png")
  148.     files += glob.glob(f"{out}/*.png")
  149.     for f in files:
  150.         os.remove(f)
  151.  
  152.  
  153. if len(sys.argv) != 2:
  154.     print("Please mention a PDF file to process.")
  155.     print("Example: python fix_slides.py ./mesPfds/slides_chimie.pdf")
  156. else:
  157.     pdf2img()
  158.     replace_color()
  159.     imgs2pdf()
  160.     clean()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement