package com.csc.etl.kerne.fuldmagter; import java.awt.Color; import java.io.File; import java.io.FileOutputStream; import java.util.ArrayList; import java.util.LinkedList; import com.lowagie.text.pdf.BaseFont; import com.lowagie.text.pdf.PRIndirectReference; import com.lowagie.text.pdf.PRTokeniser; import com.lowagie.text.pdf.PdfContentByte; import com.lowagie.text.pdf.PdfDictionary; import com.lowagie.text.pdf.PdfName; import com.lowagie.text.pdf.PdfReader; import com.lowagie.text.pdf.PdfStamper; /** * This proof of concept program will parse a PDF file * with OCR data and mark all instances of the substring 'A' * * The goal of the final program is to mask social security numbers * from PDF files that must be made publicly available, * but it could equally well be used to search and highlight specific text * * @author LasseL */ public class PdfCprCensurPoc { // modify the paths to make it work locally String templatePath = "c:/tmp/akter/"; String inputPath = templatePath + "org/"; String outputPath = templatePath + "m/"; String blockString = "A"; private void censorPdf(File pdf) throws Exception { //Reads in the pdf Template PdfReader reader = new PdfReader(pdf.getAbsolutePath()); String outputFile = outputPath + "demo." + pdf.getName(); System.out.println("Outputting to " + outputFile); PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(outputFile)); MetaParser meta = null; // a simple howebrew parser // iterate over all pages for (int page = 1; page < reader.getNumberOfPages(); page++) { PdfContentByte pagewriter = stamper.getOverContent(page); BaseFont bf = null; // parse the OCR data on the page and censor stuff // examples of data: // Tj, 1, 0, 0, 1, 71.299, 220.099, Tm, 109, Tz, OPBaseFont2, 10, Tf, // Tj, 1, 0, 0, 1, 71.75, 253.449, Tm, 101, Tz, // Tj, 1, 0, 0, 1, 71.75, 175.949, Tm, // Tj, ET, 0, 0, 0, rg, 0, 0, 0, RG, q, 1, 0, 0, 1, 0, 0, cm, 597, 0, 0, 839, 0, 0, cm, ImagePart_3, Do, Q, BT, 0, 0, 0, rg, 0, 0, 0, RG, 1, 0, 0, 1, 223.9, 290.35, Tm, 103, Tz, 3, Tr, OPBaseFont1, 14, Tf PRTokeniser tok = new PRTokeniser(reader.getPageContent(page)); while (tok.nextToken()) { // parse the metadata until we get a string if (tok.getTokenType() != PRTokeniser.TK_STRING) { if (meta == null) meta = new MetaParser(); meta.next(tok.getStringValue()); } else { // string found String text = tok.getStringValue(); System.out.println(text +"\n\t" + "x : " + meta.x + " y: " + meta.y + " font " + meta.font + " " + meta.fontsize + " error: " + meta.error); System.out.println(meta.allData); // change font if it was changed in metadata if (meta.font != null) { bf = findFontInReader(reader, pagewriter, meta.font, meta.fontsize); } // censor this line? if (text.contains(blockString)) { float blockWidth = 0f; float blockHeight = 0f; float stringWidth = 0f; // find the size of the string we a blocking in the current font blockHeight = bf.getAscent(blockString); // doesn't work (why?) blockWidth = pagewriter.getEffectiveStringWidth(blockString, false); if (blockWidth <= 0f) { blockWidth = 10f; meta.error = true; } if (blockHeight <= 0f) { blockHeight = 10f; // always meta.error = true; } // only block the first A on each line for now String substr = text.substring(0, text.indexOf(blockString)); if (substr != null && substr.length() > 0) { // MAIN PROBLEM -- HOW TO GET THE CORRECT POSITION FOR THE SUBSTRING I AM LOOKING FOR! stringWidth = pagewriter.getEffectiveStringWidth(substr, false); } // paint a rectangle over the text we wish to block if (meta.error) pagewriter.setColorStroke(Color.red); else pagewriter.setColorStroke(Color.black); pagewriter.rectangle(meta.x + stringWidth, meta.y, blockWidth, blockHeight); pagewriter.stroke(); // paint some debug info // a blue dot to indicate the point where I calculate my offset from pagewriter.setColorStroke(Color.blue); pagewriter.circle(meta.x, meta.y, 2f); pagewriter.stroke(); // a text with the calculated width and the metadata that was parsed pagewriter.saveState(); pagewriter.setFontAndSize(bf, 5); pagewriter.beginText(); pagewriter.setColorStroke(Color.blue); pagewriter.moveText(meta.x, meta.y - 5f); pagewriter.showText(stringWidth+"px -- meta: " + meta.allData); pagewriter.endText(); pagewriter.stroke(); pagewriter.restoreState(); // TODO remove text from PDF / replace with X } // reset and get ready for a new line meta = null; } } } stamper.setFormFlattening(true); stamper.close(); } private BaseFont findFontInReader(PdfReader reader, PdfContentByte pagewriter, String font, float size) { ArrayList documentFonts = BaseFont.getDocumentFonts(reader); for (Object ofont : documentFonts) { Object[] fontinfo = (Object[]) ofont; Object fontname = fontinfo[0]; PRIndirectReference ref = (PRIndirectReference) fontinfo[1]; PdfDictionary fontdict = (PdfDictionary) reader.getPdfObject(ref.getNumber()); PdfName subname = fontdict.getAsName(PdfName.NAME); if (subname != null && subname.toString().equals("/" + font)) { BaseFont bf = BaseFont.createFont(ref); pagewriter.setFontAndSize(bf, size); return bf; } } return null; } private class MetaParser { public float x; public float y; public String font; public float fontsize = 8f; private boolean lastwasfont = false; int idx = 0; LinkedList allData = new LinkedList(); public boolean error = false; public void next(String value) { allData.add(value); switch (idx) { case 5: x = parseF(value); break; case 6: y = parseF(value); break; default: if (value.contains("Font")) { font = value; lastwasfont = true; } else if (lastwasfont) { fontsize = parseF(value); lastwasfont = false; } } idx++; } private float parseF(String value) { try { return Float.valueOf(value); } catch (NumberFormatException nfe) { error = true; nfe.printStackTrace(); // TODO return 20f; } } } public static void main(String[] args) { try { new PdfCprCensurPoc().go(); } catch (Exception e) { System.err.println("Error, bye"); e.printStackTrace(); if (e.getCause() != null) e.getCause().printStackTrace(); } } private void go() throws Exception { File folder = new File(inputPath); File[] listFiles = folder.listFiles(); for (File pdf : listFiles) { if (!pdf.isFile() || !pdf.getName().toLowerCase().endsWith(".pdf")) continue; System.out.println(pdf.getName()); censorPdf(pdf); // break; // just one file for now } } }