Advertisement
Guest User

PDF highlight PoC

a guest
Apr 8th, 2011
428
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 7.69 KB | None | 0 0
  1. package com.csc.etl.kerne.fuldmagter;
  2.  
  3. import java.awt.Color;
  4. import java.io.File;
  5. import java.io.FileOutputStream;
  6. import java.util.ArrayList;
  7. import java.util.LinkedList;
  8.  
  9. import com.lowagie.text.pdf.BaseFont;
  10. import com.lowagie.text.pdf.PRIndirectReference;
  11. import com.lowagie.text.pdf.PRTokeniser;
  12. import com.lowagie.text.pdf.PdfContentByte;
  13. import com.lowagie.text.pdf.PdfDictionary;
  14. import com.lowagie.text.pdf.PdfName;
  15. import com.lowagie.text.pdf.PdfReader;
  16. import com.lowagie.text.pdf.PdfStamper;
  17.  
  18. /**
  19.  * This proof of concept program will parse a PDF file
  20.  * with OCR data and mark all instances of the substring 'A'
  21.  *
  22.  * The goal of the final program is to mask social security numbers
  23.  * from PDF files that must be made publicly available,
  24.  * but it could equally well be used to search and highlight specific text
  25.  *
  26.  * @author LasseL
  27.  */
  28. public class PdfCprCensurPoc {
  29.  
  30.   // modify the paths to make it work locally
  31.   String templatePath = "c:/tmp/akter/";
  32.   String inputPath = templatePath + "org/";
  33.   String outputPath = templatePath + "m/";
  34.  
  35.   String blockString = "A";
  36.  
  37.   private void censorPdf(File pdf) throws Exception {
  38.     //Reads in the pdf Template
  39.     PdfReader reader = new PdfReader(pdf.getAbsolutePath());
  40.     String outputFile = outputPath + "demo." + pdf.getName();
  41.     System.out.println("Outputting to " + outputFile);
  42.     PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(outputFile));
  43.     MetaParser meta = null; // a simple howebrew parser
  44.  
  45.     // iterate over all pages
  46.     for (int page = 1; page < reader.getNumberOfPages(); page++) {
  47.       PdfContentByte pagewriter = stamper.getOverContent(page);
  48.      
  49.       BaseFont bf = null;
  50.       // parse the OCR data on the page and censor stuff
  51.       // examples of data:
  52.       //    Tj, 1, 0, 0, 1, 71.299, 220.099, Tm, 109, Tz, OPBaseFont2, 10, Tf, <String>
  53.       //    Tj, 1, 0, 0, 1, 71.75, 253.449, Tm, 101, Tz, <String>
  54.       //    Tj, 1, 0, 0, 1, 71.75, 175.949, Tm, <String>
  55.       //    Tj, ET, 0, 0, 0, rg, 0, 0, 0, RG, q, 1, 0, 0, 1, 0, 0, cm, 597, 0, 0, 839, 0, 0, cm, ImagePart_3, Do, Q, BT, 0, 0, 0, rg, 0, 0, 0, RG, 1, 0, 0, 1, 223.9, 290.35, Tm, 103, Tz, 3, Tr, OPBaseFont1, 14, Tf
  56.       PRTokeniser tok = new PRTokeniser(reader.getPageContent(page));
  57.       while (tok.nextToken()) {
  58.         // parse the metadata until we get a string
  59.         if (tok.getTokenType() != PRTokeniser.TK_STRING) {
  60.           if (meta == null) meta = new MetaParser();
  61.           meta.next(tok.getStringValue());
  62.         } else {
  63.           // string found
  64.           String text = tok.getStringValue();
  65.           System.out.println(text +"\n\t" + "x : " + meta.x + " y: " + meta.y + " font " + meta.font + " " + meta.fontsize + " error: " + meta.error);
  66.           System.out.println(meta.allData);
  67.          
  68.           // change font if it was changed in metadata
  69.           if (meta.font != null) {
  70.             bf = findFontInReader(reader, pagewriter, meta.font, meta.fontsize);
  71.           }
  72.  
  73.           // censor this line?
  74.           if (text.contains(blockString)) {
  75.             float blockWidth = 0f;
  76.             float blockHeight = 0f;
  77.             float stringWidth = 0f;
  78.            
  79.             // find the size of the string we a blocking in the current font
  80.             blockHeight = bf.getAscent(blockString); // doesn't work (why?)
  81.             blockWidth = pagewriter.getEffectiveStringWidth(blockString, false);
  82.             if (blockWidth <= 0f) {
  83.               blockWidth = 10f;
  84.               meta.error = true;
  85.             }
  86.             if (blockHeight <= 0f) {
  87.               blockHeight = 10f; // always
  88.               meta.error = true;
  89.             }
  90.  
  91.             // only block the first A on each line for now
  92.             String substr = text.substring(0, text.indexOf(blockString));
  93.             if (substr != null && substr.length() > 0) {
  94.               // MAIN PROBLEM -- HOW TO GET THE CORRECT POSITION FOR THE SUBSTRING I AM LOOKING FOR!
  95.               stringWidth = pagewriter.getEffectiveStringWidth(substr, false);
  96.             }
  97.  
  98.             // paint a rectangle over the text we wish to block
  99.             if (meta.error) pagewriter.setColorStroke(Color.red);
  100.             else pagewriter.setColorStroke(Color.black);
  101.             pagewriter.rectangle(meta.x + stringWidth, meta.y, blockWidth, blockHeight);
  102.             pagewriter.stroke();
  103.            
  104.             // paint some debug info
  105.             // a blue dot to indicate the point where I calculate my offset from
  106.             pagewriter.setColorStroke(Color.blue);
  107.             pagewriter.circle(meta.x, meta.y, 2f);
  108.             pagewriter.stroke();
  109.            
  110.             // a text with the calculated width and the metadata that was parsed
  111.             pagewriter.saveState();
  112.             pagewriter.setFontAndSize(bf, 5);
  113.             pagewriter.beginText();
  114.             pagewriter.setColorStroke(Color.blue);
  115.             pagewriter.moveText(meta.x, meta.y - 5f);
  116.             pagewriter.showText(stringWidth+"px -- meta: " + meta.allData);
  117.             pagewriter.endText();
  118.             pagewriter.stroke();
  119.             pagewriter.restoreState();
  120.            
  121.             // TODO remove text from PDF / replace with X
  122.           }
  123.  
  124.           // reset and get ready for a new line
  125.           meta = null;
  126.         }
  127.       }
  128.     }
  129.    
  130.     stamper.setFormFlattening(true);
  131.     stamper.close();
  132.    
  133.   }
  134.  
  135.   private BaseFont findFontInReader(PdfReader reader, PdfContentByte pagewriter, String font, float size) {
  136.     ArrayList documentFonts = BaseFont.getDocumentFonts(reader);
  137.     for (Object ofont : documentFonts) {
  138.       Object[] fontinfo = (Object[]) ofont;
  139.       Object fontname = fontinfo[0];
  140.       PRIndirectReference ref = (PRIndirectReference) fontinfo[1];
  141.       PdfDictionary fontdict = (PdfDictionary) reader.getPdfObject(ref.getNumber());
  142.       PdfName subname = fontdict.getAsName(PdfName.NAME);
  143.       if (subname != null && subname.toString().equals("/" + font)) {
  144.         BaseFont bf = BaseFont.createFont(ref);
  145.         pagewriter.setFontAndSize(bf, size);
  146.         return bf;
  147.       }
  148.     }
  149.     return null;
  150.   }
  151.  
  152.   private class MetaParser {
  153.     public float x;
  154.     public float y;
  155.     public String font;
  156.     public float fontsize = 8f;
  157.     private boolean lastwasfont = false;
  158.     int idx = 0;
  159.     LinkedList<String> allData = new LinkedList<String>();
  160.     public boolean error = false;
  161.    
  162.     public void next(String value) {
  163.       allData.add(value);
  164.       switch (idx) {
  165.         case 5: x = parseF(value); break;
  166.         case 6: y = parseF(value); break;
  167.         default:
  168.           if (value.contains("Font")) {
  169.             font = value;
  170.             lastwasfont = true;
  171.           }
  172.           else if (lastwasfont) {
  173.             fontsize = parseF(value);
  174.             lastwasfont = false;
  175.           }
  176.       }
  177.       idx++;
  178.     }
  179.  
  180.     private float parseF(String value) {
  181.       try {
  182.         return Float.valueOf(value);
  183.       } catch (NumberFormatException nfe) {
  184.         error = true;
  185.         nfe.printStackTrace(); // TODO
  186.         return 20f;
  187.       }
  188.     }
  189.   }
  190.  
  191.   public static void main(String[] args) {
  192.     try {
  193.       new PdfCprCensurPoc().go();
  194.     } catch (Exception e) {
  195.       System.err.println("Error, bye");
  196.       e.printStackTrace();
  197.       if (e.getCause() != null) e.getCause().printStackTrace();
  198.     }
  199.   }
  200.  
  201.   private void go() throws Exception {
  202.     File folder = new File(inputPath);
  203.     File[] listFiles = folder.listFiles();
  204.     for (File pdf : listFiles) {
  205.       if (!pdf.isFile() || !pdf.getName().toLowerCase().endsWith(".pdf")) continue;
  206.       System.out.println(pdf.getName());
  207.       censorPdf(pdf);
  208.       // break; // just one file for now
  209.     }
  210.   }
  211.  
  212. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement