package com.csc.etl.kerne.fuldmagter;
import java.awt.Color;
import java.io.File;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import com.lowagie.text.pdf.BaseFont;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PRTokeniser;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfStamper;
/**
* This proof of concept program parses PDF files containing OCR text
* and draws a box around the substring 'A' (for now only the first
* occurrence in each text string).
*
* The goal of the final program is to mask social security numbers
* from PDF files that must be made publicly available,
* but it could equally well be used to search and highlight specific text
*
* @author LasseL
*/
public class PdfCprCensurPoc {
    // modify the paths to make it work locally
    String templatePath = "c:/tmp/akter/";
    String inputPath = templatePath + "org/";
    String outputPath = templatePath + "m/";
    // the substring that gets boxed in every text string that contains it
    String blockString = "A";

    /**
     * Reads one PDF, draws the debug/censor overlay on every page and writes
     * the result to {@code outputPath + "demo." + <original file name>}.
     *
     * @param pdf the input PDF file
     * @throws Exception if the file cannot be read, parsed or written
     */
    private void censorPdf(File pdf) throws Exception {
        // Reads in the pdf template
        PdfReader reader = new PdfReader(pdf.getAbsolutePath());
        try {
            String outputFile = outputPath + "demo." + pdf.getName();
            System.out.println("Outputting to " + outputFile);
            FileOutputStream out = new FileOutputStream(outputFile);
            try {
                PdfStamper stamper = new PdfStamper(reader, out);
                // Pages are numbered from 1, so the last page is number
                // getNumberOfPages() itself (the loop used to stop one short).
                int pageCount = reader.getNumberOfPages();
                for (int page = 1; page <= pageCount; page++) {
                    censorPage(reader, stamper, page);
                }
                stamper.setFormFlattening(true);
                stamper.close();
            } finally {
                // Make sure the file handle is released even when parsing fails;
                // closing an already closed FileOutputStream is harmless.
                out.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Tokenises the content stream of a single page and marks every text
     * string that contains {@link #blockString}.
     *
     * examples of the operand/operator data seen between two strings:
     * Tj, 1, 0, 0, 1, 71.299, 220.099, Tm, 109, Tz, OPBaseFont2, 10, Tf, &lt;String&gt;
     * Tj, 1, 0, 0, 1, 71.75, 253.449, Tm, 101, Tz, &lt;String&gt;
     * Tj, 1, 0, 0, 1, 71.75, 175.949, Tm, &lt;String&gt;
     * Tj, ET, 0, 0, 0, rg, 0, 0, 0, RG, q, 1, 0, 0, 1, 0, 0, cm, 597, 0, 0, 839, 0, 0, cm, ImagePart_3, Do, Q, BT, 0, 0, 0, rg, 0, 0, 0, RG, 1, 0, 0, 1, 223.9, 290.35, Tm, 103, Tz, 3, Tr, OPBaseFont1, 14, Tf
     *
     * @param reader  the opened source document
     * @param stamper the stamper whose over-content receives the overlay
     * @param page    1-based page number
     * @throws Exception if the page content cannot be read or tokenised
     */
    private void censorPage(PdfReader reader, PdfStamper stamper, int page) throws Exception {
        PdfContentByte pagewriter = stamper.getOverContent(page);
        // Font (and size) most recently selected on pagewriter; null until the
        // first font named in the content stream has been resolved.
        BaseFont bf = null;
        float fontSize = 0f;
        // a simple homebrew parser; started fresh for every page and after
        // every string, so it is never null when a string token arrives
        MetaParser meta = new MetaParser();

        PRTokeniser tok = new PRTokeniser(reader.getPageContent(page));
        while (tok.nextToken()) {
            // parse the metadata until we get a string
            if (tok.getTokenType() != PRTokeniser.TK_STRING) {
                meta.next(tok.getStringValue());
                continue;
            }

            // string found
            String text = tok.getStringValue();
            System.out.println(text +"\n\t" + "x : " + meta.x + " y: " + meta.y + " font " + meta.font + " " + meta.fontsize + " error: " + meta.error);
            System.out.println(meta.allData);

            // change font if it was changed in metadata; when the lookup
            // fails pagewriter keeps its previous font, and so do we
            if (meta.font != null) {
                BaseFont found = findFontInReader(reader, pagewriter, meta.font, meta.fontsize);
                if (found != null) {
                    bf = found;
                    fontSize = meta.fontsize;
                }
            }

            // censor this line?
            if (text.contains(blockString)) {
                if (bf == null) {
                    // Without a font neither the width nor the height of the
                    // text can be measured, so report it instead of crashing.
                    System.err.println("No font resolved for text on page " + page
                            + ", cannot mark: " + text);
                } else {
                    markFirstOccurrence(pagewriter, bf, fontSize, meta, text);
                }
            }

            // reset and get ready for a new line
            meta = new MetaParser();
        }
    }

    /**
     * Draws a rectangle over the first occurrence of {@link #blockString} in
     * {@code text}, plus some debug output (a blue dot at the parsed text
     * origin and a small label with the computed offset and parsed metadata).
     * The rectangle is red when any measurement or parse step had to fall
     * back to a default value, black otherwise.
     *
     * @param pagewriter over-content of the page; {@code bf} must already be
     *                   selected on it so that string widths can be measured
     * @param bf         the font the text is assumed to be drawn in
     * @param fontSize   the size {@code bf} was selected with
     * @param meta       parsed position data for this string; its error flag
     *                   is set when a fallback size is used
     * @param text       the text string, known to contain {@link #blockString}
     */
    private void markFirstOccurrence(PdfContentByte pagewriter, BaseFont bf, float fontSize,
            MetaParser meta, String text) {
        // find the size of the string we are blocking in the current font
        float blockWidth = pagewriter.getEffectiveStringWidth(blockString, false);
        if (blockWidth <= 0f) {
            blockWidth = 10f;
            meta.error = true;
        }

        // getAscent() reports 1/1000 glyph units, not points, so ask for the
        // value scaled to the font size instead.
        float blockHeight = bf.getAscentPoint(blockString, fontSize);
        if (blockHeight <= 0f) {
            // NOTE(review): fonts read back from a document appear not to
            // expose per-glyph boxes (presumably why the height used to be 0
            // every time); the font descriptor ascent is the next best source.
            blockHeight = bf.getFontDescriptor(BaseFont.ASCENT, fontSize);
        }
        if (blockHeight <= 0f) {
            blockHeight = 10f;
            meta.error = true;
        }

        // only block the first A on each line for now
        float stringWidth = 0f;
        String prefix = text.substring(0, text.indexOf(blockString));
        if (prefix.length() > 0) {
            // MAIN PROBLEM -- HOW TO GET THE CORRECT POSITION FOR THE SUBSTRING I AM LOOKING FOR!
            stringWidth = pagewriter.getEffectiveStringWidth(prefix, false);
        }

        // paint a rectangle over the text we wish to block
        if (meta.error) {
            pagewriter.setColorStroke(Color.red);
        } else {
            pagewriter.setColorStroke(Color.black);
        }
        pagewriter.rectangle(meta.x + stringWidth, meta.y, blockWidth, blockHeight);
        pagewriter.stroke();

        // paint some debug info
        // a blue dot to indicate the point where I calculate my offset from
        pagewriter.setColorStroke(Color.blue);
        pagewriter.circle(meta.x, meta.y, 2f);
        pagewriter.stroke();

        // a text with the calculated width and the metadata that was parsed;
        // saveState/restoreState keeps the 5pt debug font from leaking into
        // later width measurements
        pagewriter.saveState();
        pagewriter.setFontAndSize(bf, 5);
        pagewriter.beginText();
        pagewriter.setColorStroke(Color.blue);
        pagewriter.moveText(meta.x, meta.y - 5f);
        pagewriter.showText(stringWidth+"px -- meta: " + meta.allData);
        pagewriter.endText();
        pagewriter.stroke();
        pagewriter.restoreState();
        // TODO remove text from PDF / replace with X
    }

    /**
     * Looks up a document font by the name stored under {@code /Name} in its
     * font dictionary and, when found, selects it on {@code pagewriter}.
     *
     * @param reader     the document to search
     * @param pagewriter content the font is selected on
     * @param font       font name as it appears in the content stream, without
     *                   the leading slash
     * @param size       font size to select
     * @return the matching font, or {@code null} when no document font has
     *         that name (pagewriter is left untouched in that case)
     */
    private BaseFont findFontInReader(PdfReader reader, PdfContentByte pagewriter, String font, float size) {
        // raw type: that is what this iText API returns; each entry is an
        // Object[] of { font name, indirect reference to the font dictionary }
        ArrayList documentFonts = BaseFont.getDocumentFonts(reader);
        String wanted = "/" + font;
        for (Object ofont : documentFonts) {
            Object[] fontinfo = (Object[]) ofont;
            PRIndirectReference ref = (PRIndirectReference) fontinfo[1];
            Object resolved = reader.getPdfObject(ref.getNumber());
            if (!(resolved instanceof PdfDictionary)) {
                continue; // dangling or unexpected reference, nothing to compare
            }
            PdfName subname = ((PdfDictionary) resolved).getAsName(PdfName.NAME);
            if (subname != null && subname.toString().equals(wanted)) {
                BaseFont bf = BaseFont.createFont(ref);
                pagewriter.setFontAndSize(bf, size);
                return bf;
            }
        }
        return null;
    }

    /**
     * Collects the tokens seen between two text strings and picks out the
     * values this program needs. The tokens at index 5 and 6 are taken as
     * the x and y position; a token containing "Font" is taken as the font
     * name and the token right after it as the font size.
     */
    private static class MetaParser {
        public float x;
        public float y;
        public String font;
        public float fontsize = 8f;
        private boolean lastwasfont = false;
        int idx = 0;
        LinkedList<String> allData = new LinkedList<String>();
        // set when a value could not be parsed or a fallback size was used
        public boolean error = false;

        /**
         * Feeds the next token to the parser.
         *
         * @param value string value of the token
         */
        public void next(String value) {
            allData.add(value);
            switch (idx) {
                case 5:
                    x = parseF(value);
                    break;
                case 6:
                    y = parseF(value);
                    break;
                default:
                    if (value.contains("Font")) {
                        font = value;
                        lastwasfont = true;
                    } else if (lastwasfont) {
                        fontsize = parseF(value);
                        lastwasfont = false;
                    }
            }
            idx++;
        }

        /**
         * Parses a float, flagging an error and returning 20 when the value
         * is not a number.
         */
        private float parseF(String value) {
            try {
                return Float.parseFloat(value);
            } catch (NumberFormatException nfe) {
                error = true;
                nfe.printStackTrace(); // TODO
                return 20f;
            }
        }
    }

    public static void main(String[] args) {
        try {
            new PdfCprCensurPoc().go();
        } catch (Exception e) {
            System.err.println("Error, bye");
            e.printStackTrace();
            if (e.getCause() != null) e.getCause().printStackTrace();
        }
    }

    /**
     * Processes every {@code .pdf} file found directly in {@link #inputPath}.
     *
     * @throws Exception if the input folder cannot be listed or a file fails
     */
    private void go() throws Exception {
        File folder = new File(inputPath);
        File[] listFiles = folder.listFiles();
        if (listFiles == null) {
            // listFiles() returns null when the path is not a readable directory
            throw new IllegalStateException("Cannot list input folder: " + folder.getAbsolutePath());
        }
        for (File pdf : listFiles) {
            if (!pdf.isFile() || !pdf.getName().toLowerCase().endsWith(".pdf")) continue;
            System.out.println(pdf.getName());
            censorPdf(pdf);
            // break; // just one file for now
        }
    }
}