Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import functools
- import glob
- import os
- import fitz
- import numpy as np
- from fpdf import FPDF
- import cv2
- import sys
def pdf2img(dir="imgs/"):
    """Extract the images embedded in the PDF named by ``sys.argv[1]`` as PNGs.

    Pages are visited in order and every image listed for a page is written
    to *dir* as ``<n>.png``, where ``n`` counts the images extracted so far,
    starting at 0. Progress is reported on stdout, one update per page.

    Args:
        dir: Directory receiving the PNG files. This function does not
            create it.
    """
    # The input PDF is the file given on the command line.
    doc = fitz.open(sys.argv[1])
    try:
        page_count = len(doc)
        cpt = 0
        for page_number in range(page_count):
            # NOTE(review): getPageImageList / writePNG look like the legacy
            # camelCase PyMuPDF names; confirm against the installed version.
            for img in doc.getPageImageList(page_number):
                # An xref listed on several pages is extracted once per
                # listing; no de-duplication is attempted.
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                target = os.path.join(dir, f"{cpt}.png")
                if pix.n - pix.alpha < 4:
                    # GRAY or RGB (with or without alpha): PNG-compatible.
                    pix.writePNG(target)
                else:
                    # CMYK must be converted to RGB before it can be
                    # written as PNG.
                    rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
                    rgb_pix.writePNG(target)
                    rgb_pix = None  # release storage early (optional)
                pix = None  # release storage early (optional)
                cpt += 1
            # Report one-based progress so the last update reads "N / N".
            sys.stdout.write(
                f"\r>>> Converting to images... {page_number + 1} / {page_count} converted slides"
            )
            sys.stdout.flush()
        print()
    finally:
        doc.close()
def fix_if_red_cover_page(img_path):
    """Detect a red cover slide and, if found, clean up its colours.

    The image counts as a red cover when more than 25000 of its pixels are
    exactly BGR (94, 90, 245). In that case three recolouring passes are
    applied in order, each one computed on the result of the previous one:

    1. pixels whose three channels are all in [135, 255] become
       (253, 252, 254), i.e. greyish pixels are whitened;
    2. pixels in the reddish BGR box [89..137, 89..134, 158..248] become
       the reference red (94, 90, 245);
    3. pixels in the dark BGR box [0..85, 0..67, 0..51] become
       (88, 65, 50); per the original author this hides the owner's name.
       The mask is evaluated over the whole image, not a sub-region.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A ``(is_red_cover, img)`` tuple: whether the red threshold was
        exceeded, and the (possibly recoloured) image array.
    """
    # A crude threshold: the bare minimum number of reference-red pixels
    # needed to treat the slide as a red cover.
    min_red_pixels = 25000
    # OpenCV channel order is BGR, not RGB.
    cover_red = (94, 90, 245)
    cover_white = (253, 252, 254)

    img = cv2.imread(img_path)
    blue, green, red = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    exact_red_count = np.count_nonzero((blue == 94) & (green == 90) & (red == 245))
    is_red_cover = exact_red_count > min_red_pixels

    if is_red_cover:
        print(">>> Red background detected!")
        # (inclusive lower BGR bound, inclusive upper BGR bound, new colour)
        recolor_passes = (
            ((135, 135, 135), (255, 255, 255), cover_white),
            ((89, 89, 158), (137, 134, 248), cover_red),
            ((0, 0, 0), (85, 67, 51), (88, 65, 50)),
        )
        for lower, upper, replacement in recolor_passes:
            # Recomputed on every pass so each one sees the previous result.
            in_box = np.all((img >= lower) & (img <= upper), axis=2)
            img[in_box] = replacement

    return is_red_cover, img
def _numeric_stem_key(image_path):
    """Sort key placing numeric file stems first, in numeric order."""
    stem = os.path.splitext(os.path.basename(image_path))[0]
    if stem.isdigit():
        return (0, int(stem), "")
    return (1, 0, stem)


def _whiten_slide(img):
    """Whiten the light pixels of a BGR slide and the dark ones in its corner."""
    # Any pixel whose three channels are all >= 154 becomes pure white.
    img[np.all(img >= 154, axis=2)] = (255, 255, 255)
    # Bye bye owner's name: whiten the dark pixels (all channels <= 153) of
    # the bottom-left area. The slice is a view, so this edits img in place.
    corner = img[860:890, :700]
    corner[np.all(corner <= 153, axis=2)] = (255, 255, 255)
    return img


def replace_color(path="imgs/", path_is_image=False):
    """Whiten the greyish background of the slides and save them in ``out/``.

    The first slide is handed to ``fix_if_red_cover_page``; when it is
    reported as a red cover, the image returned by that function is saved
    as-is. Every other slide goes through the generic whitening. Results are
    written to ``out/`` under the same file name, and progress is reported
    on stdout.

    Args:
        path: Directory containing the ``*.png`` slides, or the path of a
            single image when *path_is_image* is true.
        path_is_image: Treat *path* as one image file instead of a directory.
    """
    if path_is_image:
        imagelist = [path]
    else:
        # glob gives no ordering guarantee; sort numerically so that the
        # cover-page check really looks at the first slide.
        imagelist = sorted(
            glob.glob(os.path.join(path, '*.png')), key=_numeric_stem_key
        )

    if not imagelist:
        # Nothing to process; indexing imagelist[0] would raise IndexError.
        print(">>> No image to fix.")
        return

    # cv2.imwrite does not create a missing output directory.
    os.makedirs("out", exist_ok=True)

    total = len(imagelist)
    done = 0
    has_been_fixed, cover = fix_if_red_cover_page(imagelist[0])
    if has_been_fixed:
        cv2.imwrite(os.path.join("out", os.path.basename(imagelist[0])), cover)
        # Count the cover so the progress display can reach total / total.
        done = 1

    for image_path in imagelist[done:]:
        slide = cv2.imread(image_path)  # BGR pixel data
        cv2.imwrite(
            os.path.join("out", os.path.basename(image_path)),
            _whiten_slide(slide),
        )
        done += 1
        sys.stdout.write(f"\r>>> Fixing the slides... {done} / {total} processed")
        sys.stdout.flush()
    print()
def cmp(x, y):
    """Compare two image paths by the integer at the start of their file name.

    The part of each base name before the first ``.`` is parsed as an
    integer. The result is negative, zero or positive, which makes the
    function usable with ``functools.cmp_to_key`` to sort the slides in
    page order.
    """
    first, second = (int(os.path.basename(p).split('.')[0]) for p in (x, y))
    return first - second
def imgs2pdf(imgs="out/"):
    """Merge every processed ``*.png`` slide of *imgs* into a single PDF.

    Slides are ordered with ``cmp`` (numeric file name), each one is placed
    at the origin of its own 1600 x 900 pt page, and the document is saved
    as ``sys.argv[1] + "__processed.pdf"``. Progress and the output path are
    printed on stdout.

    Args:
        imgs: Directory containing the processed PNG slides.
    """
    slides = sorted(
        glob.glob(os.path.join(imgs, '*.png')),
        key=functools.cmp_to_key(cmp),
    )
    total = len(slides)

    pdf = FPDF(unit="pt", format=[1600, 900])
    for count, slide in enumerate(slides, start=1):
        pdf.add_page()
        pdf.image(slide, 0, 0)
        sys.stdout.write(f"\r>>> Generating the PDF... slides {count} / {total} merged")
        sys.stdout.flush()
    print()

    target = sys.argv[1] + "__processed.pdf"
    pdf.output(target, "F")
    print(">>> PDF generated !")
    print(">>> Path to the PDF:", target)
def clean(imgs="imgs/", out="out/"):
    """Delete the ``*.png`` files found directly inside *imgs* and *out*.

    Both directories are listed before anything is removed. Files with
    another extension and the directories themselves are left in place.

    Args:
        imgs: Directory holding the images extracted from the PDF.
        out: Directory holding the processed images.
    """
    leftovers = [
        png
        for folder in (imgs, out)
        for png in glob.glob(f"{folder}/*.png")
    ]
    for png in leftovers:
        os.remove(png)
def main():
    """Run the whole slide-fixing pipeline on the PDF given on the command line.

    Expects exactly one argument, the PDF path. The steps are: extract the
    images, fix their colours, merge them back into a PDF, then remove the
    temporary PNG files.

    Returns:
        The process exit status: 0 on success, 1 when the argument is missing.
    """
    if len(sys.argv) != 2:
        print("Please mention a PDF file to process.")
        print("Example: python fix_slides.py ./mesPfds/slides_chimie.pdf")
        return 1

    # The pipeline steps write into these directories but do not create them.
    for folder in ("imgs", "out"):
        os.makedirs(folder, exist_ok=True)

    try:
        pdf2img()
        replace_color()
        imgs2pdf()
    finally:
        # Always remove the temporary PNGs: leftovers from a failed run would
        # otherwise be picked up and merged into the next PDF.
        clean()
    return 0


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement