// PDF highlight PoC

package com.csc.etl.kerne.fuldmagter;

import java.awt.Color;
import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.LinkedList;

import com.lowagie.text.pdf.BaseFont;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PRTokeniser;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfStamper;

/**
 * This proof of concept program will parse a PDF file
 * with OCR data and mark all instances of the substring 'A'
 *
 * The goal of the final program is to mask social security numbers
 * from PDF files that must be made publicly available,
 * but it could equally well be used to search and highlight specific text
 *
 * @author LasseL
 */
public class PdfCprCensurPoc {

  // modify the paths to make it work locally
  String templatePath = "c:/tmp/akter/";
  String inputPath = templatePath + "org/";
  String outputPath = templatePath + "m/";

  String blockString = "A";

  private void censorPdf(File pdf) throws Exception {
    //Reads in the pdf Template
    PdfReader reader = new PdfReader(pdf.getAbsolutePath());
    String outputFile = outputPath + "demo." + pdf.getName();
    System.out.println("Outputting to " + outputFile);
    PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(outputFile));
    MetaParser meta = null; // a simple howebrew parser

    // iterate over all pages
    for (int page = 1; page < reader.getNumberOfPages(); page++) {
      PdfContentByte pagewriter = stamper.getOverContent(page);

      BaseFont bf = null;
      // parse the OCR data on the page and censor stuff
      // examples of data:
      //    Tj, 1, 0, 0, 1, 71.299, 220.099, Tm, 109, Tz, OPBaseFont2, 10, Tf, <String>
      //    Tj, 1, 0, 0, 1, 71.75, 253.449, Tm, 101, Tz, <String>
      //    Tj, 1, 0, 0, 1, 71.75, 175.949, Tm, <String>
      //    Tj, ET, 0, 0, 0, rg, 0, 0, 0, RG, q, 1, 0, 0, 1, 0, 0, cm, 597, 0, 0, 839, 0, 0, cm, ImagePart_3, Do, Q, BT, 0, 0, 0, rg, 0, 0, 0, RG, 1, 0, 0, 1, 223.9, 290.35, Tm, 103, Tz, 3, Tr, OPBaseFont1, 14, Tf
      PRTokeniser tok = new PRTokeniser(reader.getPageContent(page));
      while (tok.nextToken()) {
        // parse the metadata until we get a string
        if (tok.getTokenType() != PRTokeniser.TK_STRING) {
          if (meta == null) meta = new MetaParser();
          meta.next(tok.getStringValue());
        } else {
          // string found
          String text = tok.getStringValue();
          System.out.println(text +"\n\t" + "x : " + meta.x + " y: " + meta.y + " font " + meta.font + " " + meta.fontsize + " error: " + meta.error);
          System.out.println(meta.allData);

          // change font if it was changed in metadata
          if (meta.font != null) {
            bf = findFontInReader(reader, pagewriter, meta.font, meta.fontsize);
          }

          // censor this line?
          if (text.contains(blockString)) {
            float blockWidth = 0f;
            float blockHeight = 0f;
            float stringWidth = 0f;

            // find the size of the string we a blocking in the current font
            blockHeight = bf.getAscent(blockString); // doesn't work (why?)
            blockWidth = pagewriter.getEffectiveStringWidth(blockString, false);
            if (blockWidth <= 0f) {
              blockWidth = 10f;
              meta.error = true;
            }
            if (blockHeight <= 0f) {
              blockHeight = 10f; // always
              meta.error = true;
            }

            // only block the first A on each line for now
            String substr = text.substring(0, text.indexOf(blockString));
            if (substr != null && substr.length() > 0) {
              // MAIN PROBLEM -- HOW TO GET THE CORRECT POSITION FOR THE SUBSTRING I AM LOOKING FOR!
              stringWidth = pagewriter.getEffectiveStringWidth(substr, false);
            }

            // paint a rectangle over the text we wish to block
            if (meta.error) pagewriter.setColorStroke(Color.red);
            else pagewriter.setColorStroke(Color.black);
            pagewriter.rectangle(meta.x + stringWidth, meta.y, blockWidth, blockHeight);
            pagewriter.stroke();

            // paint some debug info
            // a blue dot to indicate the point where I calculate my offset from
            pagewriter.setColorStroke(Color.blue);
            pagewriter.circle(meta.x, meta.y, 2f);
            pagewriter.stroke();

            // a text with the calculated width and the metadata that was parsed
            pagewriter.saveState();
            pagewriter.setFontAndSize(bf, 5);
            pagewriter.beginText();
            pagewriter.setColorStroke(Color.blue);
            pagewriter.moveText(meta.x, meta.y - 5f);
            pagewriter.showText(stringWidth+"px -- meta: " + meta.allData);
            pagewriter.endText();
            pagewriter.stroke();
            pagewriter.restoreState();

            // TODO remove text from PDF / replace with X
          }

          // reset and get ready for a new line
          meta = null;
        }
      }
    }

    stamper.setFormFlattening(true);
    stamper.close();

  }

  private BaseFont findFontInReader(PdfReader reader, PdfContentByte pagewriter, String font, float size) {
    ArrayList documentFonts = BaseFont.getDocumentFonts(reader);
    for (Object ofont : documentFonts) {
      Object[] fontinfo = (Object[]) ofont;
      Object fontname = fontinfo[0];
      PRIndirectReference ref = (PRIndirectReference) fontinfo[1];
      PdfDictionary fontdict = (PdfDictionary) reader.getPdfObject(ref.getNumber());
      PdfName subname = fontdict.getAsName(PdfName.NAME);
      if (subname != null && subname.toString().equals("/" + font)) {
        BaseFont bf = BaseFont.createFont(ref);
        pagewriter.setFontAndSize(bf, size);
        return bf;
      }
    }
    return null;
  }

  private class MetaParser {
    public float x;
    public float y;
    public String font;
    public float fontsize = 8f;
    private boolean lastwasfont = false;
    int idx = 0;
    LinkedList<String> allData = new LinkedList<String>();
    public boolean error = false;

    public void next(String value) {
      allData.add(value);
      switch (idx) {
        case 5: x = parseF(value); break;
        case 6: y = parseF(value); break;
        default:
          if (value.contains("Font")) {
            font = value;
            lastwasfont = true;
          }
          else if (lastwasfont) {
            fontsize = parseF(value);
            lastwasfont = false;
          }
      }
      idx++;
    }

    private float parseF(String value) {
      try {
        return Float.valueOf(value);
      } catch (NumberFormatException nfe) {
        error = true;
        nfe.printStackTrace(); // TODO
        return 20f;
      }
    }
  }

  public static void main(String[] args) {
    try {
      new PdfCprCensurPoc().go();
    } catch (Exception e) {
      System.err.println("Error, bye");
      e.printStackTrace();
      if (e.getCause() != null) e.getCause().printStackTrace();
    }
  }

  private void go() throws Exception {
    File folder = new File(inputPath);
    File[] listFiles = folder.listFiles();
    for (File pdf : listFiles) {
      if (!pdf.isFile() || !pdf.getName().toLowerCase().endsWith(".pdf")) continue;
      System.out.println(pdf.getName());
      censorPdf(pdf);
      // break; // just one file for now
    }
  }

}