Advertisement
Guest User

Untitled

a guest
Jul 17th, 2019
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.32 KB | None | 0 0
  1. package com.rks.git.pdf.parser.build;
  2.  
  3. import java.io.File;
  4. import java.io.IOException;
  5. import java.util.List;
  6.  
  7. import org.apache.pdfbox.pdmodel.PDDocument;
  8. import technology.tabula.ObjectExtractor;
  9. import technology.tabula.Page;
  10. import technology.tabula.RectangularTextContainer;
  11. import technology.tabula.Table;
  12. import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
  13.  
  14. public class PDFParserMaster {
  15.  
  16. public static void main(String[] args) throws IOException {
  17. // TODO Auto-generated method stub
  18. final String FILENAME="C:\\Users\\Rishu\\Downloads\\PDF Parser Data Test.pdf";
  19.  
  20. PDDocument pd = PDDocument.load(new File(FILENAME));
  21.  
  22. int totalPages = pd.getNumberOfPages();
  23. System.out.println("Total Pages in Document: "+totalPages);
  24.  
  25. ObjectExtractor oe = new ObjectExtractor(pd);
  26. SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
  27. Page page = oe.extract(1);
  28.  
  29. // extract text from the table after detecting
  30. List<Table> table = sea.extract(page);
  31. for(Table tables: table) {
  32. List<List<RectangularTextContainer>> rows = tables.getRows();
  33.  
  34. for(int i=0; i<rows.size(); i++) {
  35.  
  36. List<RectangularTextContainer> cells = rows.get(i);
  37.  
  38. for(int j=0; j<cells.size(); j++) {
  39. System.out.print(cells.get(j).getText()+"|");
  40. }
  41.  
  42. System.out.println();
  43. }
  44. }
  45.  
  46. }
  47.  
  48. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement