# Slide fixer: extracts the slide images of a PDF, cleans up their colours and rebuilds a PDF.

import functools
import glob
import os

import fitz
import numpy as np
from fpdf import FPDF
import cv2
import sys


# Converts a pdf to a set of images that are temporarily saved on disk.
def pdf2img(dir="imgs/"):
    """Extract every image referenced by the input PDF's pages to PNG files.

    The PDF path is taken from ``sys.argv[1]``. Images are numbered in the
    order the pages list them and written to ``<dir>0.png``, ``<dir>1.png``,
    and so on.

    Args:
        dir: String prefix prepended to each file name. It is concatenated
            as-is, so it must end with a path separator to designate a
            directory.
    """
    # we open the file given through command line
    doc = fitz.open(sys.argv[1])
    try:
        page_count = len(doc)
        cpt = 0
        for i in range(page_count):
            for img in doc.getPageImageList(i):
                # NOTE: an image referenced by several pages is extracted once
                # per reference (xrefs are not de-duplicated).
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.n >= 5:
                    # CMYK needs to be converted to RGB before it can be
                    # written as PNG; below 5 components it is GRAY or RGB.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(dir + str(cpt) + ".png")
                pix = None  # release storage early (optional)
                cpt += 1
                # i is a 0-based page index, so report i + 1 pages handled.
                sys.stdout.write(
                    f"\r>>> Converting to images... {i + 1} / {page_count} converted slides")
                sys.stdout.flush()
    finally:
        # Release the document even when an extraction step fails.
        doc.close()
    print()


# Detects a red cover slide and, when found, normalises its colours.
# Red covers are processed differently from the usual white slides.
def fix_if_red_cover_page(img_path):
    """Recolour the image at ``img_path`` if it looks like a red cover slide.

    The image counts as a red cover when more than 25000 of its pixels are
    exactly BGR (94, 90, 245). In that case three recolouring passes are
    applied in order, each one evaluated on the result of the previous one.

    Args:
        img_path: Path of the image to load with ``cv2.imread``.

    Returns:
        A ``(was_fixed, img)`` tuple: whether the red cover was detected, and
        the loaded image (recoloured when detected, untouched otherwise).
    """
    # A crude lower bound on the number of exact-red pixels a cover must have.
    min_red_pixel_count = 25000
    # Colours are in OpenCV channel order: BGR, not RGB.
    cover_red = (94, 90, 245)
    cover_white = (253, 252, 254)

    img = cv2.imread(img_path)

    def bgr_between(lower, upper):
        # Mask of the pixels whose B, G and R values each lie within the
        # inclusive [lower, upper] bounds, computed on the current `img`.
        per_channel = [
            (lower[c] <= img[:, :, c]) & (img[:, :, c] <= upper[c])
            for c in range(3)
        ]
        return per_channel[0] & per_channel[1] & per_channel[2]

    exact_red_count = np.count_nonzero(bgr_between(cover_red, cover_red))
    is_red_cover = exact_red_count > min_red_pixel_count
    if not is_red_cover:
        return is_red_cover, img

    print(">>> Red background detected!")

    # Light, greyish pixels become the cover's white.
    img[bgr_between((135, 135, 135), (255, 255, 255))] = cover_white

    # Reddish pixels are snapped to the exact cover red.
    img[bgr_between((89, 89, 158), (137, 134, 248))] = cover_red

    # Dark pixels are repainted with a fixed dark blue; this is meant to hide
    # the owner's name. The mask covers the whole image, not only a corner.
    img[bgr_between((0, 0, 0), (85, 67, 51))] = (88, 65, 50)

    return is_red_cover, img


# Whiten the greyish pixels of the slides and hide the owner's name.
def replace_color(path="imgs/", path_is_image=False, out="out/"):
    """Clean up the slide images and write the results to ``out``.

    The first slide is passed to ``fix_if_red_cover_page``; if it reports a
    red cover, the image it returns is written as-is. Every other slide gets
    its light pixels (all channels >= 154) set to pure white and the dark
    pixels (all channels <= 153) of rows 860-889 / columns 0-699 whitened.
    That region presumably targets the owner's name on 1600x900 slides
    (TODO confirm); on smaller images the slice is simply clipped.

    Args:
        path: Directory containing the ``*.png`` slides, or a single image
            path when ``path_is_image`` is true.
        path_is_image: Treat ``path`` as one image instead of a directory.
        out: Directory receiving the processed images (created if missing).
            Each image keeps its original base name.
    """
    if path_is_image:
        imagelist = [path]
    else:
        # glob gives no ordering guarantee; sort so that the cover check
        # really runs on the first slide (0.png).
        imagelist = sorted(glob.glob(os.path.join(path, '*.png')),
                           key=_slide_sort_key)

    if not imagelist:
        print(">>> No slides found to fix.")
        return

    if out:
        # cv2.imwrite does not create missing directories.
        os.makedirs(out, exist_ok=True)

    total = len(imagelist)
    processed = 0
    has_been_fixed, cover = fix_if_red_cover_page(imagelist[0])
    if has_been_fixed:
        cv2.imwrite(os.path.join(out, os.path.basename(imagelist[0])), cover)
        # The cover counts as processed so the progress can reach the total.
        processed = 1

    for image_path in imagelist[processed:]:
        img = cv2.imread(image_path)  # BGR channel order

        # whiten the slide: light pixels become pure white
        img[np.all(img >= 154, axis=2)] = (255, 255, 255)

        # bye bye owner's name: whiten the dark pixels of the bottom left
        # area (the slice is a view, so `img` is modified in place)
        bottom_left_area = img[860:890, :700, :]
        bottom_left_area[np.all(bottom_left_area <= 153, axis=2)] = (255, 255, 255)

        cv2.imwrite(os.path.join(out, os.path.basename(image_path)), img)
        processed += 1
        sys.stdout.write(f"\r>>> Fixing the slides... {processed} / {total} processed")
        sys.stdout.flush()
    print()


def _slide_sort_key(path):
    """Sort key: numerically named files by their number, others after by name."""
    stem = os.path.basename(path).split('.')[0]
    if stem.isdecimal():
        return (0, int(stem), "")
    return (1, 0, stem)


# Comparison function used to put the slide images back in page order:
# files are compared by the integer found before the first dot of their name.
def cmp(x, y):
    """Return the difference between the numeric file names of ``x`` and ``y``.

    Negative when ``x`` comes first, zero when equal, positive otherwise.
    Raises ``ValueError`` if either name does not start with an integer.
    """
    def slide_number(path):
        name_before_dot, _, _ = os.path.basename(path).partition('.')
        return int(name_before_dot)

    return slide_number(x) - slide_number(y)


# Merges all the processed images into a single PDF
def imgs2pdf(imgs="out/"):
    """Build ``<sys.argv[1]>__processed.pdf`` from the PNG files found in ``imgs``.

    The images are ordered with ``cmp`` (numeric file name) and each one is
    placed at the top-left corner of its own 1600x900 pt page.

    Args:
        imgs: Directory containing the processed ``*.png`` slides.
    """
    slide_paths = sorted(
        glob.glob(os.path.join(imgs, '*.png')),
        key=functools.cmp_to_key(cmp),
    )
    total = len(slide_paths)

    pdf = FPDF(unit="pt", format=[1600, 900])
    for merged, slide_path in enumerate(slide_paths, start=1):
        pdf.add_page()
        pdf.image(slide_path, 0, 0)
        sys.stdout.write(f"\r>>> Generating the PDF... slides {merged} / {total} merged")
        sys.stdout.flush()
    print()

    # The result is written next to the input, with a suffix appended.
    output_path = sys.argv[1] + "__processed.pdf"
    pdf.output(output_path, "F")
    print(">>> PDF generated !")
    print(">>> Path to the PDF:", output_path)

# Removes the temporary images once the processing is over
def clean(imgs="imgs/", out="out/"):
    """Delete every ``*.png`` file found directly in ``imgs`` and in ``out``.

    Args:
        imgs: Directory holding the images extracted from the PDF.
        out: Directory holding the processed images.
    """
    # Collect everything first, then delete: both directories are listed
    # before any file is removed.
    leftovers = [
        png
        for directory in (imgs, out)
        for png in glob.glob(f"{directory}/*.png")
    ]
    for png in leftovers:
        os.remove(png)


# Script entry: expects exactly one argument, the path of the PDF to process.
if len(sys.argv) != 2:
    print("Please mention a PDF file to process.")
    print("Example: python fix_slides.py ./mesPfds/slides_chimie.pdf")
else:
    try:
        pdf2img()
        replace_color()
        imgs2pdf()
    finally:
        # Always remove the temporary PNGs, even when a step fails: leftover
        # files would be picked up by the globs of the next run.
        clean()