Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package poijartest;
- import java.awt.Color;
- import java.awt.image.BufferedImage;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.Closeable;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.lang.reflect.Method;
- import java.net.URL;
- import java.nio.charset.Charset;
- import java.util.ArrayList;
- import java.util.Iterator;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import javax.imageio.ImageIO;
- import org.apache.poi.ddf.EscherComplexProperty;
- import org.apache.poi.ddf.EscherOptRecord;
- import org.apache.poi.ddf.EscherProperty;
- import org.apache.poi.hpsf.ClassID;
- import org.apache.poi.hslf.usermodel.HSLFSlideShow;
- import org.apache.poi.hssf.usermodel.HSSFClientAnchor;
- import org.apache.poi.hssf.usermodel.HSSFObjectData;
- import org.apache.poi.hssf.usermodel.HSSFPatriarch;
- import org.apache.poi.hssf.usermodel.HSSFPicture;
- import org.apache.poi.hssf.usermodel.HSSFPictureData;
- import org.apache.poi.hssf.usermodel.HSSFShape;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFSimpleShape;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.openxml4j.opc.PackagePart;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.poifs.filesystem.Ole10Native;
- import org.apache.poi.poifs.filesystem.Ole10NativeException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.sl.usermodel.AutoShape;
- import org.apache.poi.sl.usermodel.ShapeType;
- import org.apache.poi.sl.usermodel.Slide;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.ss.usermodel.WorkbookFactory;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.xssf.usermodel.XSSFDrawing;
- import org.apache.poi.xssf.usermodel.XSSFPicture;
- import org.apache.poi.xssf.usermodel.XSSFPictureData;
- import org.apache.poi.xssf.usermodel.XSSFShape;
- import org.apache.poi.xssf.usermodel.XSSFSheet;
- import org.apache.poi.xssf.usermodel.XSSFWorkbook;
- import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTPicture;
- /**
- * Tested with POI 3.16-beta1
- *
- * 17.12.2014: original version for
- * http://apache-poi.1045710.n5.nabble.com/How-to-get-the-full-file-name-of-a-picture-in-xls-file-td5717205.html
- *
- * 17.12.2016: added sample/dummy data for
- * http://stackoverflow.com/questions/41101012/how-to-export-embeded-file-which-from-excel-using-poi
- */
- public class EmbeddedReader {
- private File excel_file;
- private ImageReader image_reader;
- public static void main(String[] args) throws Exception {
- File sample = new File("bla.xls");
- getSampleEmbedded(sample);
- ImageReader ir = new ImageReader(sample);
- for (EmbeddedData ed : ir.embeddings) {
- System.out.println(ed.filename);
- FileOutputStream fos = new FileOutputStream(ed.filename);
- IOUtils.copy(ed.is, fos);
- fos.close();
- }
- ir.close();
- }
- static void getSampleEmbedded(File sample) throws IOException {
- HSSFWorkbook wb = new HSSFWorkbook();
- int storageId = wb.addOlePackage(getSamplePPT(), "dummy.ppt", "dummy.ppt", "dummy.ppt");
- int picId = wb.addPicture(getSamplePng(), HSSFPicture.PICTURE_TYPE_PNG);
- HSSFSheet sheet = wb.createSheet();
- HSSFPatriarch pat = sheet.createDrawingPatriarch();
- HSSFClientAnchor anc = pat.createAnchor(0, 0, 0, 0, 1, 1, 3, 6);
- HSSFObjectData od = pat.createObjectData(anc, storageId, picId);
- od.setNoFill(true);
- wb.write(sample);
- wb.close();
- }
- static byte[] getSamplePng() throws IOException {
- ClassLoader cl = Thread.currentThread().getContextClassLoader();
- URL imgUrl = cl.getResource("javax/swing/plaf/metal/icons/ocean/directory.gif");
- BufferedImage img = ImageIO.read(imgUrl);
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- ImageIO.write(img, "PNG", bos);
- return bos.toByteArray();
- }
- static byte[] getSamplePPT() throws IOException {
- HSLFSlideShow ppt = new HSLFSlideShow();
- Slide<?,?> slide = ppt.createSlide();
- AutoShape<?,?> sh1 = slide.createAutoShape();
- sh1.setShapeType(ShapeType.STAR_32);
- sh1.setAnchor(new java.awt.Rectangle(50, 50, 100, 200));
- sh1.setFillColor(Color.red);
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- ppt.write(bos);
- ppt.close();
- POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(bos.toByteArray()));
- poifs.getRoot().setStorageClsid(ClassID.PPT_SHOW);
- bos.reset();
- poifs.writeFilesystem(bos);
- poifs.close();
- return bos.toByteArray();
- }
- public EmbeddedReader(String excel_path) throws IOException {
- excel_file = new File(excel_path);
- image_reader = new ImageReader(excel_file);
- }
- public String[] get_file_names() {
- ArrayList<String> file_names = new ArrayList<String>();
- for (EmbeddedData ed : image_reader.embeddings) {
- file_names.add(ed.filename);
- }
- return file_names.toArray(new String[file_names.size()]);
- }
- public InputStream get_stream(String file_name) {
- InputStream input_stream = null;
- for (EmbeddedData ed : image_reader.embeddings) {
- if(file_name.equals(ed.filename)) {
- input_stream = ed.is;
- break;
- }
- }
- return input_stream;
- }
- static class ImageReader implements Closeable {
- EmbeddedExtractor extractors[] = {
- new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
- };
- List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
- Workbook wb;
- public ImageReader(File excelfile) throws IOException {
- try {
- wb = WorkbookFactory.create(excelfile);
- Sheet receiptImages = wb.getSheet("Receipt images");
- if (wb instanceof XSSFWorkbook) {
- addSheetPicsAndEmbedds((XSSFSheet)receiptImages);
- } else {
- addAllEmbedds((HSSFWorkbook)wb);
- addSheetPics((HSSFSheet)receiptImages);
- }
- } catch (Exception e) {
- // todo: error handling
- }
- }
- protected void addSheetPicsAndEmbedds(XSSFSheet sheet) throws IOException {
- if (sheet == null) return;
- XSSFDrawing draw = sheet.createDrawingPatriarch();
- for (XSSFShape shape : draw.getShapes()) {
- if (!(shape instanceof XSSFPicture)) continue;
- XSSFPicture picture = (XSSFPicture)shape;
- XSSFPictureData pd = picture.getPictureData();
- PackagePart pp = pd.getPackagePart();
- CTPicture ctPic = picture.getCTPicture();
- String filename = null;
- try {
- filename = ctPic.getNvPicPr().getCNvPr().getName();
- } catch (Exception e) {}
- if (filename == null || "".equals(filename)) {
- filename = new File(pp.getPartName().toString()).getName();
- }
- EmbeddedData ed = new EmbeddedData();
- ed.filename = fileNameWithoutPath(filename);
- ed.is = pp.getInputStream();
- embeddings.add(ed);
- }
- }
- protected void addAllEmbedds(HSSFWorkbook hwb) throws IOException {
- for (HSSFObjectData od : hwb.getAllEmbeddedObjects()) {
- String alternativeName = getAlternativeName(od);
- if (od.hasDirectoryEntry()) {
- DirectoryNode src = (DirectoryNode)od.getDirectory();
- for (EmbeddedExtractor ee : extractors) {
- if (ee.canExtract(src)) {
- EmbeddedData ed = ee.extract(src);
- if (ed.filename == null || ed.filename.startsWith("MBD") || alternativeName != null) {
- ed.filename = alternativeName;
- }
- ed.filename = fileNameWithoutPath(ed.filename);
- ed.source = "object";
- embeddings.add(ed);
- break;
- }
- }
- }
- }
- }
- protected String getAlternativeName(HSSFShape shape) {
- EscherOptRecord eor = reflectEscherOptRecord(shape);
- if (eor == null) return null;
- for (EscherProperty ep : eor.getEscherProperties()) {
- if ("groupshape.shapename".equals(ep.getName()) && ep.isComplex()) {
- return new String(((EscherComplexProperty)ep).getComplexData(),
- Charset.forName("UTF-16LE"));
- }
- }
- return null;
- }
- protected void addSheetPics(HSSFSheet sheet) {
- if (sheet == null) return;
- int picIdx=0;
- int emfIdx = 0;
- HSSFPatriarch patriarch = sheet.getDrawingPatriarch();
- if (patriarch == null) return;
- // Loop through the objects
- for (HSSFShape shape : patriarch.getChildren()) {
- if (!(shape instanceof HSSFPicture)) {
- continue;
- }
- HSSFPicture picture = (HSSFPicture) shape;
- if (picture.getShapeType() != HSSFSimpleShape.OBJECT_TYPE_PICTURE) continue;
- HSSFPictureData pd = picture.getPictureData();
- byte pictureBytes[] = pd.getData();
- int pictureBytesOffset = 0;
- int pictureBytesLen = pictureBytes.length;
- String filename = picture.getFileName();
- // try to find an alternative name
- if (filename == null || "".equals(filename)) {
- filename = getAlternativeName(picture);
- }
- // default to dummy name
- if (filename == null || "".equals(filename)) {
- filename = "picture"+(picIdx++);
- }
- filename = filename.trim();
- // check for emf+ embedded pdf (poor mans style :( )
- // Mac Excel 2011 embeds pdf files with this method.
- boolean validFile = true;
- if (pd.getFormat() == Workbook.PICTURE_TYPE_EMF) {
- validFile = false;
- int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes());
- if (idxStart != -1) {
- int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes());
- if (idxEnd != -1) {
- pictureBytesOffset = idxStart;
- pictureBytesLen = idxEnd-idxStart+6;
- validFile = true;
- }
- } else {
- // This shape was not a Mac Excel 2011 embedded pdf file.
- // So this is a shape related to a regular embedded object
- // Lets update the object filename with the shapes filename
- // if the object filename is of format ARGF1234.pdf
- EmbeddedData ed_obj = embeddings.get(emfIdx);
- Pattern pattern = Pattern.compile("^[A-Z0-9]{8}\\.[pdfPDF]{3}$");
- Matcher matcher = pattern.matcher(ed_obj.filename);
- if(matcher.matches()) {
- ed_obj.filename = filename;
- }
- emfIdx += 1;
- }
- }
- EmbeddedData ed = new EmbeddedData();
- ed.filename = fileNameWithoutPath(filename);
- ed.is = new ByteArrayInputStream(pictureBytes, pictureBytesOffset, pictureBytesLen);
- if(fileNotInEmbeddings(ed.filename) && validFile) {
- embeddings.add(ed);
- }
- }
- }
- private static EscherOptRecord reflectEscherOptRecord(HSSFShape shape) {
- try {
- Method m = HSSFShape.class.getDeclaredMethod("getOptRecord");
- m.setAccessible(true);
- return (EscherOptRecord)m.invoke(shape);
- } catch (Exception e) {
- // todo: log ... well actually "should not happen" ;)
- return null;
- }
- }
- private String fileNameWithoutPath(String filename) {
- int last_index = filename.lastIndexOf("\\");
- return filename.substring(last_index + 1);
- }
- private boolean fileNotInEmbeddings(String filename) {
- boolean exists = true;
- for(EmbeddedData ed : embeddings) {
- if(ed.filename.equals(filename)) {
- exists = false;
- }
- }
- return exists;
- }
- public void close() throws IOException {
- Iterator<EmbeddedData> ed = embeddings.iterator();
- while (ed.hasNext()) {
- ed.next().is.close();
- }
- wb.close();
- }
- }
- static class EmbeddedData {
- String filename;
- InputStream is;
- String source;
- }
- static abstract class EmbeddedExtractor {
- abstract boolean canExtract(DirectoryNode dn);
- abstract EmbeddedData extract(DirectoryNode dn) throws IOException;
- protected EmbeddedData extractFS(DirectoryNode dn, String filename) throws IOException {
- assert(canExtract(dn));
- POIFSFileSystem dest = new POIFSFileSystem();
- copyNodes(dn, dest.getRoot());
- EmbeddedData ed = new EmbeddedData();
- ed.filename = filename;
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- dest.writeFilesystem(bos);
- dest.close();
- ed.is = new ByteArrayInputStream(bos.toByteArray());
- return ed;
- }
- }
- static class Ole10Extractor extends EmbeddedExtractor {
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return ClassID.OLE10_PACKAGE.equals(clsId);
- }
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- try {
- Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
- EmbeddedData ed = new EmbeddedData();
- ed.filename = new File(ole10.getFileName()).getName();
- ed.is = new ByteArrayInputStream(ole10.getDataBuffer());
- return ed;
- } catch (Ole10NativeException e) {
- throw new IOException(e);
- }
- }
- }
- static class PdfExtractor extends EmbeddedExtractor {
- static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (PdfClassID.equals(clsId)
- || dn.hasEntry("CONTENTS"));
- }
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = new EmbeddedData();
- ed.is = dn.createDocumentInputStream("CONTENTS");
- ed.filename = dn.getName()+".pdf";
- return ed;
- }
- }
- static class WordExtractor extends EmbeddedExtractor {
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassID.WORD95.equals(clsId)
- || ClassID.WORD97.equals(clsId)
- || dn.hasEntry("WordDocument"));
- }
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- return extractFS(dn, dn.getName()+".doc");
- }
- }
- static class ExcelExtractor extends EmbeddedExtractor {
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassID.EXCEL95.equals(clsId)
- || ClassID.EXCEL97.equals(clsId)
- || dn.hasEntry("Workbook") /*...*/);
- }
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- return extractFS(dn, dn.getName()+".xls");
- }
- }
- static class FsExtractor extends EmbeddedExtractor {
- public boolean canExtract(DirectoryNode dn) {
- return true;
- }
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- return extractFS(dn, dn.getName()+".dat");
- }
- }
- private static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
- for (Entry e : src) {
- if (e instanceof DirectoryNode) {
- DirectoryNode srcDir = (DirectoryNode)e;
- DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
- destDir.setStorageClsid(srcDir.getStorageClsid());
- copyNodes(srcDir, destDir);
- } else {
- InputStream is = src.createDocumentInputStream(e);
- dest.createDocument(e.getName(), is);
- is.close();
- }
- }
- }
- /**
- * Knuth-Morris-Pratt Algorithm for Pattern Matching
- * Finds the first occurrence of the pattern in the text.
- */
- private static int indexOf(byte[] data, int offset, byte[] pattern) {
- int[] failure = computeFailure(pattern);
- int j = 0;
- if (data.length == 0) return -1;
- for (int i = offset; i < data.length; i++) {
- while (j > 0 && pattern[j] != data[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == data[i]) { j++; }
- if (j == pattern.length) {
- return i - pattern.length + 1;
- }
- }
- return -1;
- }
- /**
- * Computes the failure function using a boot-strapping process,
- * where the pattern is matched against itself.
- */
- private static int[] computeFailure(byte[] pattern) {
- int[] failure = new int[pattern.length];
- int j = 0;
- for (int i = 1; i < pattern.length; i++) {
- while (j > 0 && pattern[j] != pattern[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == pattern[i]) {
- j++;
- }
- failure[i] = j;
- }
- return failure;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement