Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
package com.example.tika;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.springframework.core.io.InputStreamResource;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

- public class TikaAnalysis {
- public static String detectDocTypeUsingDetector(InputStream stream) throws Exception {
- Detector detector = new DefaultDetector();
- Metadata metadata = new Metadata();
- MediaType mediaType = detector.detect(stream, metadata);
- return mediaType.toString();
- }
- public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
- Tika tika = new Tika();
- String mediaType = tika.detect(stream);
- return mediaType;
- }
- public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- parser.parse(stream, handler, metadata, context);
- return handler.toString();
- }
- public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
- Tika tika = new Tika();
- String content = tika.parseToString(stream);
- return content;
- }
- public static String parseToHTML(Path path) throws IOException, SAXException, TikaException {
- InputStream stream = Files.newInputStream(path);
- ContentHandler handler = new ToXMLContentHandler();
- AutoDetectParser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
- try {
- parser.parse(stream, handler, metadata);
- return handler.toString();
- } catch (Exception e){
- e.printStackTrace();
- }
- return "unlucky";
- }
- public static Metadata extractMetadatatUsingParser(Path stream) throws IOException, SAXException, TikaException {
- TikaInputStream inputStream = TikaInputStream.get(stream);
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- parser.parse(inputStream, handler, metadata, context);
- return metadata;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement