Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.csc.etl.kerne.fuldmagter;
- import java.awt.Color;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.util.ArrayList;
- import java.util.LinkedList;
- import com.lowagie.text.pdf.BaseFont;
- import com.lowagie.text.pdf.PRIndirectReference;
- import com.lowagie.text.pdf.PRTokeniser;
- import com.lowagie.text.pdf.PdfContentByte;
- import com.lowagie.text.pdf.PdfDictionary;
- import com.lowagie.text.pdf.PdfName;
- import com.lowagie.text.pdf.PdfReader;
- import com.lowagie.text.pdf.PdfStamper;
- /**
- * This proof of concept program will parse a PDF file
- * with OCR data and mark all instances of the substring 'A'
- *
- * The goal of the final program is to mask social security numbers
- * from PDF files that must be made publicly available,
- * but it could equally well be used to search and highlight specific text
- *
- * @author LasseL
- */
- public class PdfCprCensurPoc {
- // modify the paths to make it work locally
- String templatePath = "c:/tmp/akter/";
- String inputPath = templatePath + "org/";
- String outputPath = templatePath + "m/";
- String blockString = "A";
- private void censorPdf(File pdf) throws Exception {
- //Reads in the pdf Template
- PdfReader reader = new PdfReader(pdf.getAbsolutePath());
- String outputFile = outputPath + "demo." + pdf.getName();
- System.out.println("Outputting to " + outputFile);
- PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(outputFile));
- MetaParser meta = null; // a simple howebrew parser
- // iterate over all pages
- for (int page = 1; page < reader.getNumberOfPages(); page++) {
- PdfContentByte pagewriter = stamper.getOverContent(page);
- BaseFont bf = null;
- // parse the OCR data on the page and censor stuff
- // examples of data:
- // Tj, 1, 0, 0, 1, 71.299, 220.099, Tm, 109, Tz, OPBaseFont2, 10, Tf, <String>
- // Tj, 1, 0, 0, 1, 71.75, 253.449, Tm, 101, Tz, <String>
- // Tj, 1, 0, 0, 1, 71.75, 175.949, Tm, <String>
- // Tj, ET, 0, 0, 0, rg, 0, 0, 0, RG, q, 1, 0, 0, 1, 0, 0, cm, 597, 0, 0, 839, 0, 0, cm, ImagePart_3, Do, Q, BT, 0, 0, 0, rg, 0, 0, 0, RG, 1, 0, 0, 1, 223.9, 290.35, Tm, 103, Tz, 3, Tr, OPBaseFont1, 14, Tf
- PRTokeniser tok = new PRTokeniser(reader.getPageContent(page));
- while (tok.nextToken()) {
- // parse the metadata until we get a string
- if (tok.getTokenType() != PRTokeniser.TK_STRING) {
- if (meta == null) meta = new MetaParser();
- meta.next(tok.getStringValue());
- } else {
- // string found
- String text = tok.getStringValue();
- System.out.println(text +"\n\t" + "x : " + meta.x + " y: " + meta.y + " font " + meta.font + " " + meta.fontsize + " error: " + meta.error);
- System.out.println(meta.allData);
- // change font if it was changed in metadata
- if (meta.font != null) {
- bf = findFontInReader(reader, pagewriter, meta.font, meta.fontsize);
- }
- // censor this line?
- if (text.contains(blockString)) {
- float blockWidth = 0f;
- float blockHeight = 0f;
- float stringWidth = 0f;
- // find the size of the string we a blocking in the current font
- blockHeight = bf.getAscent(blockString); // doesn't work (why?)
- blockWidth = pagewriter.getEffectiveStringWidth(blockString, false);
- if (blockWidth <= 0f) {
- blockWidth = 10f;
- meta.error = true;
- }
- if (blockHeight <= 0f) {
- blockHeight = 10f; // always
- meta.error = true;
- }
- // only block the first A on each line for now
- String substr = text.substring(0, text.indexOf(blockString));
- if (substr != null && substr.length() > 0) {
- // MAIN PROBLEM -- HOW TO GET THE CORRECT POSITION FOR THE SUBSTRING I AM LOOKING FOR!
- stringWidth = pagewriter.getEffectiveStringWidth(substr, false);
- }
- // paint a rectangle over the text we wish to block
- if (meta.error) pagewriter.setColorStroke(Color.red);
- else pagewriter.setColorStroke(Color.black);
- pagewriter.rectangle(meta.x + stringWidth, meta.y, blockWidth, blockHeight);
- pagewriter.stroke();
- // paint some debug info
- // a blue dot to indicate the point where I calculate my offset from
- pagewriter.setColorStroke(Color.blue);
- pagewriter.circle(meta.x, meta.y, 2f);
- pagewriter.stroke();
- // a text with the calculated width and the metadata that was parsed
- pagewriter.saveState();
- pagewriter.setFontAndSize(bf, 5);
- pagewriter.beginText();
- pagewriter.setColorStroke(Color.blue);
- pagewriter.moveText(meta.x, meta.y - 5f);
- pagewriter.showText(stringWidth+"px -- meta: " + meta.allData);
- pagewriter.endText();
- pagewriter.stroke();
- pagewriter.restoreState();
- // TODO remove text from PDF / replace with X
- }
- // reset and get ready for a new line
- meta = null;
- }
- }
- }
- stamper.setFormFlattening(true);
- stamper.close();
- }
- private BaseFont findFontInReader(PdfReader reader, PdfContentByte pagewriter, String font, float size) {
- ArrayList documentFonts = BaseFont.getDocumentFonts(reader);
- for (Object ofont : documentFonts) {
- Object[] fontinfo = (Object[]) ofont;
- Object fontname = fontinfo[0];
- PRIndirectReference ref = (PRIndirectReference) fontinfo[1];
- PdfDictionary fontdict = (PdfDictionary) reader.getPdfObject(ref.getNumber());
- PdfName subname = fontdict.getAsName(PdfName.NAME);
- if (subname != null && subname.toString().equals("/" + font)) {
- BaseFont bf = BaseFont.createFont(ref);
- pagewriter.setFontAndSize(bf, size);
- return bf;
- }
- }
- return null;
- }
- private class MetaParser {
- public float x;
- public float y;
- public String font;
- public float fontsize = 8f;
- private boolean lastwasfont = false;
- int idx = 0;
- LinkedList<String> allData = new LinkedList<String>();
- public boolean error = false;
- public void next(String value) {
- allData.add(value);
- switch (idx) {
- case 5: x = parseF(value); break;
- case 6: y = parseF(value); break;
- default:
- if (value.contains("Font")) {
- font = value;
- lastwasfont = true;
- }
- else if (lastwasfont) {
- fontsize = parseF(value);
- lastwasfont = false;
- }
- }
- idx++;
- }
- private float parseF(String value) {
- try {
- return Float.valueOf(value);
- } catch (NumberFormatException nfe) {
- error = true;
- nfe.printStackTrace(); // TODO
- return 20f;
- }
- }
- }
- public static void main(String[] args) {
- try {
- new PdfCprCensurPoc().go();
- } catch (Exception e) {
- System.err.println("Error, bye");
- e.printStackTrace();
- if (e.getCause() != null) e.getCause().printStackTrace();
- }
- }
- private void go() throws Exception {
- File folder = new File(inputPath);
- File[] listFiles = folder.listFiles();
- for (File pdf : listFiles) {
- if (!pdf.isFile() || !pdf.getName().toLowerCase().endsWith(".pdf")) continue;
- System.out.println(pdf.getName());
- censorPdf(pdf);
- // break; // just one file for now
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement