Advertisement
Guest User

Untitled

a guest
Mar 17th, 2019
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 5 2.98 KB | None | 0 0
  1. package com.example.tika;
  2.  
  3. import org.apache.tika.Tika;
  4. import org.apache.tika.detect.DefaultDetector;
  5. import org.apache.tika.detect.Detector;
  6. import org.apache.tika.exception.TikaException;
  7. import org.apache.tika.io.TikaInputStream;
  8. import org.apache.tika.metadata.Metadata;
  9. import org.apache.tika.mime.MediaType;
  10. import org.apache.tika.parser.AutoDetectParser;
  11. import org.apache.tika.parser.ParseContext;
  12. import org.apache.tika.parser.Parser;
  13. import org.apache.tika.sax.BodyContentHandler;
  14. import org.apache.tika.sax.ToXMLContentHandler;
  15. import org.springframework.core.io.InputStreamResource;
  16. import org.xml.sax.ContentHandler;
  17. import org.xml.sax.SAXException;
  18.  
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.nio.file.Files;
  22. import java.nio.file.Path;
  23.  
  24. public class TikaAnalysis {
  25.  
  26.     public static String detectDocTypeUsingDetector(InputStream stream) throws Exception {
  27.         Detector detector = new DefaultDetector();
  28.         Metadata metadata = new Metadata();
  29.  
  30.         MediaType mediaType = detector.detect(stream, metadata);
  31.         return mediaType.toString();
  32.     }
  33.  
  34.     public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
  35.  
  36.         Tika tika = new Tika();
  37.         String mediaType = tika.detect(stream);
  38.         return mediaType;
  39.     }
  40.  
  41.     public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
  42.         Parser parser = new AutoDetectParser();
  43.         ContentHandler handler = new BodyContentHandler();
  44.         Metadata metadata = new Metadata();
  45.         ParseContext context = new ParseContext();
  46.  
  47.         parser.parse(stream, handler, metadata, context);
  48.         return handler.toString();
  49.     }
  50.  
  51.     public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
  52.  
  53.         Tika tika = new Tika();
  54.         String content = tika.parseToString(stream);
  55.         return content;
  56.     }
  57.  
  58.     public static String parseToHTML(Path path) throws IOException, SAXException, TikaException {
  59.         InputStream stream = Files.newInputStream(path);
  60.         ContentHandler handler = new ToXMLContentHandler();
  61.  
  62.         AutoDetectParser parser = new AutoDetectParser();
  63.         Metadata metadata = new Metadata();
  64.         try  {
  65.             parser.parse(stream, handler, metadata);
  66.             return handler.toString();
  67.         } catch (Exception e){
  68.             e.printStackTrace();
  69.         }
  70.         return "unlucky";
  71.     }
  72.  
  73.     public static Metadata extractMetadatatUsingParser(Path stream) throws IOException, SAXException, TikaException {
  74.         TikaInputStream inputStream = TikaInputStream.get(stream);
  75.         Parser parser = new AutoDetectParser();
  76.         ContentHandler handler = new BodyContentHandler();
  77.         Metadata metadata = new Metadata();
  78.         ParseContext context = new ParseContext();
  79.  
  80.         parser.parse(inputStream, handler, metadata, context);
  81.         return metadata;
  82.     }
  83. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement