descale.py - Clean a scanned bi-level image

#!/usr/bin/python3

import os
import sys
import traceback
import math
import numpy as np
import itertools
import html
import argparse
from PIL import Image


VERSION = '1.0.0.0'


class Error(Exception):
  """Base class of the error types defined by this script."""

class ArgError(Error):
  """Error reported by main() with an 'Error: ' prefix and result 2."""

class FileError(Error):
  """Error reported by main() with result 3."""

class DataError(Error):
  """Error reported by main() with result 5."""

class CmdError(Error):
  """Error reported by main() with result 4."""


def bilevel_array(M):
  """Threshold the array M to the two values 0 and 255.

  The threshold is taken from a coarse 10-bin histogram instead of the
  median: the median was found to work poorly on severely distressed
  images, and wide bins resist the spikes caused by quantisation.

  Returns a uint8 array shaped like M, holding 255 wherever the input
  is at or above the threshold and 0 elsewhere.
  """
  counts, bin_edges = np.histogram(M, bins=10)
  # The tallest bin is the primary mode.
  primary = int(np.argmax(counts))
  # The secondary mode maximises (count x distance from the primary
  # mode), so a tall bin right next to the primary one does not win.
  weighted = counts * np.abs(np.arange(len(counts)) - primary)
  secondary = int(np.argmax(weighted))
  dark_bin, light_bin = sorted((primary, secondary))
  # Halfway between the outer edges of the two modes: the lower edge
  # of the dark bin and the upper edge of the light bin.
  midpoint = 0.5 * (bin_edges[dark_bin] + bin_edges[light_bin + 1])
  threshold = np.uint8(np.rint(midpoint))
  mask = np.greater_equal(M, threshold)
  return 255 * mask.astype(np.uint8)


def trim_array(M, bgvalue=None):
  """Strip the background-only border from the array M.

  bgvalue is the pixel value regarded as background; when omitted, the
  top-left pixel M[0, 0] is used.

  Returns a tuple (T, trimmed): T is the cropped view of M and trimmed
  is a dict giving the number of rows or columns removed from the
  'top', 'bottom', 'left' and 'right' sides.
  """
  if bgvalue is None:
    bgvalue = M[0, 0]

  def is_background(strip):
    return np.all(np.equal(strip, bgvalue))

  # Rows: shrink from the bottom first, then from the top.
  total_rows = M.shape[0]
  row_end = total_rows
  while row_end > 0 and is_background(M[row_end - 1]):
    row_end -= 1
  row_start = 0
  while row_start < row_end and is_background(M[row_start]):
    row_start += 1
  M = M[row_start:row_end]

  # Columns of what remains: shrink from the right, then from the left.
  total_cols = M.shape[1]
  col_end = total_cols
  while col_end > 0 and is_background(M[:, col_end - 1]):
    col_end -= 1
  col_start = 0
  while col_start < col_end and is_background(M[:, col_start]):
    col_start += 1
  M = M[:, col_start:col_end]

  trimmed = {
    'top': row_start,
    'bottom': total_rows - row_end,
    'left': col_start,
    'right': total_cols - col_end,
  }
  return M, trimmed


def runs_metrics(edges):
  """Measure the runs delimited by a sequence of edge positions.

  edges holds boundary positions (expected to be increasing, as built
  by descale_array); consecutive entries delimit one run and the final
  entry marks the extent of the axis.  A "single" is a run of length 1;
  every other run is a "multiple run".

  Returns a dict with:
    'num_singles' - the total number of single runs;
    'max_srl'     - the longest streak of consecutive single runs;
    'min_mrl'     - the shortest multiple run (0 if there are none);
    'mr_indices'  - the starting edge position of each multiple run;
    'mr_lengths'  - the length of each multiple run;
    'ar_lengths'  - the multiple-run lengths augmented to swallow the
                    adjacent singles (these may be fractional).
  """
  e = np.asarray(edges)
  # Run lengths
  d = e[1:] - e[:-1]
  # Find the single runs.
  d1 = np.equal(d, 1)
  # Find the groups of single runs: t1[i + 1] is set where a streak of
  # singles ends at run i, so b1 holds the split points just after
  # each streak.
  # (The boundaries will also be used in the augmentation step later.)
  t1 = np.pad(np.logical_and(d1, ~np.pad(d1[1:], [(0, 1)])), [(1, 0)])
  b1 = np.nonzero(t1)[0]
  # The array of lengths of runs of singles may include a spurious
  # zero, but that is fine for the max() function. What is important
  # is that b1 is still good for the augmentation step.
  s1 = [sum(seg) for seg in np.split(d1, b1)]
  max_srl = max(s1)
  num_singles = sum(s1)
  # Find the minimum multiple run length (of unaugmented runs).
  dm = np.delete(d, np.nonzero(d1)[0])
  min_mrl = min(dm) if len(dm) else 0
  # The augmented run lengths are the original run lengths of
  # two or more, expanded to swallow the adjacent single runs.
  # Augmentation can result in non-integer run lengths.
  # A streak of singles between two multiple runs is shared half and
  # half between them; a streak at either end of the axis goes wholly
  # to the nearest multiple run.
  # (np.asfarray was removed in NumPy 2.0; an explicit float copy also
  # guarantees that the in-place additions below never touch d.)
  d_float = np.array(d, dtype=np.float64)
  z = []
  u = 0
  for span_d, span_s1 in zip(np.split(d_float, b1), s1):
    # Each span is a series of multiple runs followed by its singles.
    s = span_d[:len(span_d) - span_s1]
    if len(s):
      s[0] += u
      s[-1] += 0.5 * span_s1
      z.append(s)
      u = 0.5 * span_s1
    else:
      u += span_s1
  arls = np.array(list(itertools.chain.from_iterable(z)))
  if len(arls):
    arls[-1] += u
  # The multiple-run indices each reference the start
  # of a run of duplicate entries before augmentation.
  mrixs = np.delete(e, np.nonzero(d1)[0])[:-1]
  # (The final edge, representing the image extent, is omitted.)
  result = {
    'num_singles': num_singles,
    'max_srl': max_srl,
    'min_mrl': min_mrl,
    'mr_indices': mrixs,
    'mr_lengths': dm,
    'ar_lengths': arls,
  }
  return result


def multiruns_scores(metrics):
  """Score candidate numbers of divisions for one image axis.

  metrics is a dict as returned by runs_metrics(); its 'mr_indices',
  'mr_lengths' and 'ar_lengths' entries are used.

  Every division count from the number of multiple runs up to an upper
  limit is tried.  The score of a candidate is the sum of:
    - the number of sample centres that land on a single-run pixel;
    - the norm of the distances between the rescaled augmented run
      edges and their nearest integers, rounded to 4 places;
    - a small term (at most 0.001) that grows with the division count,
      so that otherwise equal candidates favour fewer divisions.

  Returns a list of (score, ndivs) tuples in increasing ndivs order;
  lower scores are better.  The list is empty when there are no
  multiple runs.
  """
  mrixs = np.asarray(metrics['mr_indices'])
  mrls = np.asarray(metrics['mr_lengths'])
  arls = np.asarray(metrics['ar_lengths'])
  scores = []
  if not len(arls):
    # Without any multiple run there is nothing to divide.
    return scores
  h = int(round(sum(arls)))
  mindivs = len(arls)
  maxdivs = min(h // 2, math.ceil(5 * h / (min(arls))))
  # Augmented run edges, starting from zero.
  ares = np.pad(np.cumsum(arls), [(1, 0)])
  # Mark every pixel that belongs to a single run.
  # (The np.bool alias was removed in NumPy 1.24; use the builtin.)
  singles = np.ones(h, dtype=bool)
  for start, stop in zip(mrixs, mrixs + mrls):
    singles[start:stop] = False
  for ndivs in range(mindivs, maxdivs + 1):
    m = ndivs * ares / h
    sum_e2 = np.linalg.norm(m - np.rint(m))
    sr_clx_score = 0
    for i in range(ndivs):
      # Centre of the i-th division, in pixels.
      x = int(round(h * (i + 0.5) / ndivs))
      err = singles[x] if x < h else 1
      sr_clx_score += err
    res_score = 0.001 * ndivs / maxdivs
    edge_score = round(sum_e2, 4)
    score = sr_clx_score + res_score + edge_score
    scores.append((score, ndivs))
  return scores


def descale_array(M):
  """Undo a nearest-neighbour enlargement of the image array M.

  The rows and the columns are each split into runs of identical
  lines.  When both axes look scaled (single-line runs cover less than
  half the axis and every streak of them is shorter than the shortest
  multiple run), the best division counts are picked with
  multiruns_scores() and one row or column is sampled from the centre
  of each division.  Otherwise M is returned unchanged.
  """
  h, w = M.shape[0], M.shape[1]

  def run_edges(extent, line):
    # An edge follows index i when the next line differs from line i,
    # or when line i is the last one; the leading edge is always 0.
    found = [0]
    found.extend(
        i + 1 for i in range(extent)
        if i + 1 >= extent or np.any(np.not_equal(line(i + 1), line(i))))
    return np.array(found)

  def looks_scaled(metrics, extent):
    return (2 * metrics['num_singles'] < extent
            and metrics['max_srl'] < metrics['min_mrl'])

  rm = runs_metrics(run_edges(h, lambda y: M[y]))
  cm = runs_metrics(run_edges(w, lambda x: M[:, x]))

  if not (looks_scaled(rm, h) and looks_scaled(cm, w)):
    return M

  # The image appears to be scaled.
  # Keep the five best division counts of each axis.
  row_candidates = sorted(multiruns_scores(rm))[:5]
  col_candidates = sorted(multiruns_scores(cm))[:5]
  # Among their combinations, favour those which preserve the aspect
  # ratio: the penalty is the absolute log of the aspect change.
  aspect = w / h
  ranked = sorted(
      (rs + cs + abs(math.log((nc / nr) / aspect)), nr, nc)
      for (rs, nr), (cs, nc)
      in itertools.product(row_candidates, col_candidates))
  _, nr, nc = ranked[0]
  # Sample the centre line of every division, clamped to the image.
  row_centres = np.rint(h * (np.arange(nr) + 0.5) / nr)
  col_centres = np.rint(w * (np.arange(nc) + 0.5) / nc)
  rixs = np.int32(np.minimum(h - 1, row_centres))
  cixs = np.int32(np.minimum(w - 1, col_centres))
  sampled_rows = M[rixs]
  return sampled_rows[:, cixs]


def load_image_as_array(filename, bilevel=False):
  """Load the image file named filename into a NumPy array.

  With bilevel set, the image is converted to 8-bit greyscale and
  thresholded by bilevel_array(), giving an array of 0 and 255 values.

  Otherwise the pixel data is returned in the image's own mode, except
  that a paletted image is expanded to RGB (or RGBA when it carries
  transparency): the raw data of a paletted image is palette indices,
  which would lose their colours once separated from the palette.

  The image file is closed before returning.
  """
  with Image.open(filename) as src:
    im = src
    if bilevel:
      if im.mode == 'P':
        # Avoid the annoying warning about a transparent colour
        # in a paletted image.
        im = im.convert('LA')
      if im.mode != 'L':
        im = im.convert('L')
      return bilevel_array(np.asarray(im))
    if im.mode == 'P':
      im = im.convert('RGBA' if 'transparency' in im.info else 'RGB')
    # The array must be built while the file is still open, because
    # Pillow reads the pixel data lazily.
    return np.asarray(im)


def descale_trimmed_array(M, trimmed=None):
  """Descale the trimmed array M and rescale its margins to match.

  trimmed is the dict of 'top', 'bottom', 'left' and 'right' counts
  removed by trim_array(); all zero when omitted.

  Returns a tuple (M1, margins): M1 is the result of descale_array(M)
  and margins holds the trimmed counts multiplied by the size ratio of
  M1 to M along the matching axis, rounded to whole pixels.  When M has
  no rows or no columns (an entirely background image) there is no
  ratio to apply, and the counts along that axis are kept as they are
  rather than dividing by zero.
  """
  if trimmed is None:
    trimmed = dict(top=0, bottom=0, left=0, right=0)
  h0, w0 = M.shape[:2]
  M1 = descale_array(M)
  h1, w1 = M1.shape[:2]

  def rescale(count, new_extent, old_extent):
    if not old_extent:
      return int(count)
    return int(round(count * new_extent / old_extent))

  margins = {
    'top': rescale(trimmed['top'], h1, h0),
    'bottom': rescale(trimmed['bottom'], h1, h0),
    'left': rescale(trimmed['left'], w1, w0),
    'right': rescale(trimmed['right'], w1, w0),
  }
  return M1, margins


def pad_array(M, margins, padvalue):
  """Surround the array M with a border of padvalue pixels.

  margins is either a dict of 'top', 'bottom', 'left' and 'right'
  pixel counts, or a single count applied to all four sides.
  padvalue is one pixel: a scalar for a 2-D M, or a per-channel
  sequence when M has a trailing channel axis.  It is cast to M's
  dtype, so that padding never changes the dtype of the image (a plain
  Python int would otherwise promote the whole result, even with zero
  margins).

  Returns a new array; M itself is not modified.
  """
  if not isinstance(margins, dict):
    x = margins
    margins = dict(top=x, bottom=x, left=x, right=x)
  pixel = np.asarray(padvalue, dtype=M.dtype)
  # A 1 x 1 image of the pad pixel, repeated to build each border.
  pad2d = np.expand_dims(pixel, axis=(0, 1))
  # Left and right borders are as tall as M.
  padcol = np.repeat(pad2d, M.shape[0], axis=0)
  padl = np.repeat(padcol, margins['left'], axis=1)
  padr = np.repeat(padcol, margins['right'], axis=1)
  M = np.hstack((padl, M, padr))
  # Top and bottom borders are as wide as the widened M.
  padrow = np.repeat(pad2d, M.shape[1], axis=1)
  padt = np.repeat(padrow, margins['top'], axis=0)
  padb = np.repeat(padrow, margins['bottom'], axis=0)
  M = np.vstack((padt, M, padb))
  return M


def get_arguments():
  """Parse the command line and return the argparse namespace.

  The namespace carries 'bilevel' and 'invert' (booleans), 'filename'
  and 'outfilename' (strings), plus 'help' and 'version', which are
  always False on return.

  -h/--help and -V/--version print to standard output and exit with
  status 0 without requiring the positional arguments.  They are
  handled by argparse's own actions, so they are also recognised when
  bundled with other short options (for example -bV).  Invalid usage
  makes argparse exit with status 2.
  """

  cmd = os.path.basename(sys.argv[0])

  parser = argparse.ArgumentParser(
    prog=cmd,
    add_help=False,
    description="Reduces a nearest-neighbour scaled image."
  )

  parser.add_argument(
      "-h", "--help",
      action="help",
      help="Display this message and exit.")
  parser.add_argument(
      "-b", "--bilevel",
      dest="bilevel", action="store_true",
      help="Reduce the colours to just black and white.")
  parser.add_argument(
      "-i", "--invert",
      dest="invert", action="store_true",
      help="Invert colours.")
  parser.add_argument(
      "-V", "--version",
      action="version", version=VERSION,
      help="Display version and exit.")

  parser.add_argument(
      "filename", metavar="IMAGE-IN",
      type=str,
      help=("The scaled bitmapped image to reduce."))

  parser.add_argument(
      "outfilename", metavar="IMG-OUT",
      type=str,
      help=("The reduced image file to create."))

  # The help and version actions store nothing themselves; keep the
  # 'help' and 'version' attributes on the returned namespace.
  parser.set_defaults(help=False, version=False)

  return parser.parse_args()


def _invert_array(M):
  """Return a copy of the pixel array M with its colours inverted.

  Boolean arrays are negated and 8-bit arrays are subtracted from 255.
  When an 8-bit array has a trailing axis of 2 or 4 channels, the last
  channel is taken to be alpha and is left as it is.  Any other dtype
  raises DataError.
  """
  if M.dtype == np.bool_:
    return np.logical_not(M)
  if M.dtype != np.uint8:
    raise DataError('Cannot invert pixel data of type ' + str(M.dtype))
  inverted = 255 - M
  if M.ndim == 3 and M.shape[2] in (2, 4):
    inverted[..., -1] = M[..., -1]
  return inverted


def main():
  """Run the command-line program and return its result code.

  The input image is loaded, trimmed of its background border (the
  background being the top-left pixel), descaled, padded back with
  proportionally reduced margins, optionally inverted, and saved.

  Result codes: 0 on success, 2 for ArgError, 3 for FileError or an
  operating-system I/O error, 4 for CmdError, 5 for DataError, and 1
  for any other exception (reported with its traceback).  Messages go
  to standard error, prefixed with the command name.
  """

  result = 0
  err_msg = ''

  cmd = os.path.basename(sys.argv[0])

  try:

    args = get_arguments()

    M = load_image_as_array(args.filename, args.bilevel)
    bgpixel = M[0, 0]
    M, trimmed = trim_array(M, bgpixel)
    M, margins = descale_trimmed_array(M, trimmed)
    M = pad_array(M, margins, bgpixel)
    if args.invert:
      # Inverting last keeps the analysis above independent of the
      # option; the padding is inverted along with the content.
      M = _invert_array(M)
    im = Image.fromarray(M)
    if args.bilevel:
      im = im.convert('1')
    im.save(args.outfilename)

  except ArgError as E:
    err_msg = 'Error: ' + str(E)
    result = 2
  except FileError as E:
    err_msg = str(E)
    result = 3
  except CmdError as E:
    err_msg = str(E)
    result = 4
  except DataError as E:
    err_msg = str(E)
    result = 5
  except OSError as E:
    # Missing, unreadable or unwritable image files are ordinary file
    # errors, not program faults: report them without a traceback.
    err_msg = str(E)
    result = 3
  except Exception:
    err_lines = traceback.format_exc().splitlines()
    err_msg = 'Unhandled exception:\n' + '\n'.join(err_lines)
    result = 1

  if err_msg != '':
    print(cmd + ': ' + err_msg, file=sys.stderr)

  return result


if __name__ == '__main__':
  # Hand main()'s result code to the shell as the exit status;
  # otherwise the process would report success even after an error.
  sys.exit(main())