Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.rks.git.pdf.parser.build;
- import java.io.File;
- import java.io.IOException;
- import java.util.List;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import technology.tabula.ObjectExtractor;
- import technology.tabula.Page;
- import technology.tabula.RectangularTextContainer;
- import technology.tabula.Table;
- import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
- public class PDFParserMaster {
- public static void main(String[] args) throws IOException {
- // TODO Auto-generated method stub
- final String FILENAME="C:\\Users\\Rishu\\Downloads\\PDF Parser Data Test.pdf";
- PDDocument pd = PDDocument.load(new File(FILENAME));
- int totalPages = pd.getNumberOfPages();
- System.out.println("Total Pages in Document: "+totalPages);
- ObjectExtractor oe = new ObjectExtractor(pd);
- SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
- Page page = oe.extract(1);
- // extract text from the table after detecting
- List<Table> table = sea.extract(page);
- for(Table tables: table) {
- List<List<RectangularTextContainer>> rows = tables.getRows();
- for(int i=0; i<rows.size(); i++) {
- List<RectangularTextContainer> cells = rows.get(i);
- for(int j=0; j<cells.size(); j++) {
- System.out.print(cells.get(j).getText()+"|");
- }
- System.out.println();
- }
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement