// Suggestion Syntax Service

package com.aurora.intelligence;

import com.aurora.intelligence.scheduling.*;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Spring service that runs text through a Stanford CoreNLP pipeline and converts the
 * annotated sentences into the project's {@code Word}, {@code Sentence} and
 * {@code AnalyzedText} objects.
 *
 * <p>The pipeline and the worker pool are created once in {@link #setup()} and the pool is
 * released in {@link #shutdown()}. {@link #analyze(String, List, String)} fans the sentences
 * of one text out to the worker pool and blocks until all of them have been processed.
 */
@Service
public class SyntaxService {

    /** Size of the fixed thread pool used to process sentences in {@code analyze}. */
    @Value("${analyzerThreads}")
    private int analyzerThreads;

    @Autowired
    private DictionaryService dictionaryService;

    private static final Logger LOGGER = LoggerFactory.getLogger(SyntaxService.class);

    // NOTE(review): not referenced anywhere in this class as shown, and "/" is listed twice
    // (the second entry may have been meant to be a different character) — confirm before use.
    private static final List<String> PUNCTUATIONAL_MARKS = Arrays
            .asList(".", "?", "!", ":", ";", "-", "—", "(", ")", "[", "]", "...", "“", "”", "/", "/");

    /**
     * Applied to the raw input of {@code analyze}. As a Java regex this matches a literal
     * {@code /}, exactly one non-alphanumeric character, and another literal {@code /}.
     * NOTE(review): the slashes look like JavaScript regex delimiters copied into Java; the
     * original intent may have been different. Behaviour is kept unchanged — confirm.
     */
    private static final Pattern SLASH_WRAPPED_SYMBOL = Pattern.compile("/[^A-Za-z0-9]/");

    /** Everything that is not an ASCII letter or a space; removed by {@link #clearPunctuation}. */
    private static final Pattern NON_LETTER_OR_SPACE = Pattern.compile("[^a-zA-Z ]");

    /** Matches noun phrases inside a "(word TAG)(word TAG)..." projection; has 7 capture groups. */
    private static final Pattern NOUN_PHRASE_PATTERN = Pattern.compile(
            "((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)(\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))");

    /** Matches verb phrases inside a "(word TAG)(word TAG)..." projection; has 3 capture groups. */
    private static final Pattern VERB_PHRASE_PATTERN = Pattern.compile(
            "((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))+)");

    /** Number of regex groups (starting at group 0) read from each noun-phrase match. */
    private static final int NOUN_PHRASE_GROUPS = 8;

    /** Number of regex groups (starting at group 0) read from each verb-phrase match. */
    private static final int VERB_PHRASE_GROUPS = 4;

    private StanfordCoreNLP coreNLP;
    private ExecutorService textExecutor;

    /**
     * Builds the CoreNLP pipeline and the sentence worker pool once the Spring-injected
     * fields are available.
     */
    @PostConstruct
    public void setup() {
        Properties properties = new Properties();
        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
        properties.setProperty("threads", "8");
        coreNLP = new StanfordCoreNLP(properties);
        textExecutor = Executors.newFixedThreadPool(analyzerThreads);
    }

    /**
     * Stops the sentence worker pool when the bean is destroyed so its threads do not
     * outlive the application context. Already submitted tasks are allowed to finish.
     */
    @PreDestroy
    public void shutdown() {
        if (textExecutor != null) {
            textExecutor.shutdown();
        }
    }

    /**
     * Annotates {@code text} and returns one {@code Word} per token carrying the token text
     * and its named-entity tag.
     *
     * <p>Sentences are walked sequentially so that the returned list is in document order
     * and is never mutated from more than one thread.
     *
     * @param text the text to annotate
     * @return the words of all sentences, in the order the pipeline produced them
     */
    public List<Word> NERAnalysis(String text) {
        List<Word> result = new ArrayList<>();

        int sentenceCounter = 0;
        for (CoreMap coreSentence : getCoreSentences(text)) {
            LOGGER.info("processing sentence {}", sentenceCounter);
            sentenceCounter++;

            for (CoreLabel token : coreSentence.get(CoreAnnotations.TokensAnnotation.class)) {
                Word word = new Word();
                word.setValue(token.get(CoreAnnotations.TextAnnotation.class));
                word.setNamedEntityType(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
                result.add(word);
            }
        }

        return result;
    }

    /**
     * Analyzes {@code text} without custom annotations and without a statement id.
     *
     * @param text the text to analyze
     * @return the analyzed sentences
     * @throws InvalidBeanPassedForDBProcedure declared for callers; not thrown directly here
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public AnalyzedText analyze(String text) throws InvalidBeanPassedForDBProcedure, InterruptedException {
        return analyze(text, new ArrayList<>(), null);
    }

    /**
     * Analyzes {@code text}: every sentence found by the pipeline is processed on the worker
     * pool and this method blocks until all of them are done.
     *
     * <p>Sentences are returned in the order the pipeline produced them. A sentence whose
     * processing fails with a runtime exception is logged and left out of the result instead
     * of blocking the caller forever.
     *
     * @param text              the text to analyze
     * @param customAnnotations regex-based annotations used to override named-entity types;
     *                          may be {@code null}
     * @param statementId       identifier used only in log messages; may be {@code null}
     * @return the analyzed sentences
     * @throws InvalidBeanPassedForDBProcedure declared for callers; not thrown directly here
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public AnalyzedText analyze(String text, List<com.aurora.intelligence.scheduling.Annotation> customAnnotations, String statementId)
            throws InvalidBeanPassedForDBProcedure, InterruptedException {

        String cleanedText = SLASH_WRAPPED_SYMBOL.matcher(text).replaceAll("");
        LOGGER.info("new text : {}", cleanedText);

        List<CoreMap> coreSentences = getCoreSentences(cleanedText);
        int total = coreSentences.size();

        // One slot per sentence: each worker writes only its own slot, so no locking is
        // needed and the final list keeps document order regardless of completion order.
        Sentence[] slots = new Sentence[total];
        CountDownLatch latch = new CountDownLatch(total);

        for (int i = 0; i < total; i++) {
            final int position = i;
            final CoreMap coreSentence = coreSentences.get(i);

            Runnable task = () -> {
                try {
                    Sentence sentence = buildSentence(coreSentence, customAnnotations, statementId);
                    LOGGER.info("{} has {} / {}", statementId, total - latch.getCount() + 1, total);
                    // Store the result BEFORE counting down so the waiting thread sees it.
                    slots[position] = sentence;
                } catch (RuntimeException e) {
                    LOGGER.error("Processing sentence " + position + " for " + statementId + " failed", e);
                } finally {
                    // Always release the latch, otherwise a single failure hangs analyze().
                    latch.countDown();
                }
            };
            textExecutor.submit(task);
        }

        latch.await();
        LOGGER.info("Processing sentences done");

        List<Sentence> sentences = new ArrayList<>(total);
        for (Sentence sentence : slots) {
            if (sentence != null) {
                sentences.add(sentence);
            }
        }

        AnalyzedText result = new AnalyzedText();
        result.setSentences(sentences);
        return result;
    }

    /**
     * Converts one annotated CoreNLP sentence into a {@code Sentence}: sentiment, index,
     * words, noun/verb phrases, custom named-entity overrides and frequency analysis.
     */
    private Sentence buildSentence(CoreMap coreSentence,
                                   List<com.aurora.intelligence.scheduling.Annotation> customAnnotations,
                                   String statementId) {
        Sentence sentence = new Sentence();

        Tree tree = coreSentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
        int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
        sentence.setSentimentScore(sentiment);
        sentence.setIndex(coreSentence.get(CoreAnnotations.IndexAnnotation.class));

        long startTime = System.currentTimeMillis();
        List<CoreLabel> tokens = new ArrayList<>(coreSentence.get(CoreAnnotations.TokensAnnotation.class));
        double time = calculateTime(startTime, System.currentTimeMillis());
        LOGGER.info("Tokenization for {} took {} seconds", statementId, time);

        List<Word> words = new ArrayList<>(tokens.size());
        StringBuilder sentenceText = new StringBuilder();
        // The projection is "((word TAG)(word TAG)...)" and is what the phrase regexes scan.
        StringBuilder projection = new StringBuilder("(");

        for (CoreLabel token : tokens) {
            String wordValue = token.get(CoreAnnotations.TextAnnotation.class);
            String nameEntity = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            String partOfSpeech = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            String gender = token.get(MachineReadingAnnotations.GenderAnnotation.class);

            Word word = new Word();
            if (gender != null && !gender.isEmpty()) {
                word.setGender(gender);
            }
            word.setValue(wordValue);
            word.setNamedEntityType(nameEntity);
            word.setLexicalCategory(LexicalCategory.getInstanceFromCodeValue(partOfSpeech));
            words.add(word);

            sentenceText.append(wordValue).append(' ');
            projection.append('(').append(wordValue).append(' ').append(partOfSpeech).append(')');
        }
        projection.append(')');

        String strSentence = sentenceText.toString();
        String syntacticProjection = projection.toString();
        LOGGER.info("Processing sentence {} for {}", strSentence, statementId);

        sentence.setWords(words);
        sentence.setNounPhrases(extractPhrase(NOUN_PHRASE_PATTERN, syntacticProjection, NOUN_PHRASE_GROUPS));
        sentence.setVerbPhrases(extractPhrase(VERB_PHRASE_PATTERN, syntacticProjection, VERB_PHRASE_GROUPS));

        if (customAnnotations != null) {
            applyCustomAnnotations(coreSentence.get(CoreAnnotations.TextAnnotation.class), words, customAnnotations);
        }

        try {
            startTime = System.currentTimeMillis();
            sentence.setFrequencyAnalysis(dictionaryService.frequencyAnalysis(strSentence.trim(),
                    sentence.getNounsAssembledPhrases(), sentence.getVerbsAssembledPhrases()));
            time = calculateTime(startTime, System.currentTimeMillis());

            LOGGER.info("Frequency analysys for {} took {} seconds", statementId, time);
        } catch (Exception e) {
            // Best effort: the sentence is still returned without frequency data, but the
            // cause is logged instead of being dropped.
            LOGGER.error("Freqency analysis for " + statementId + " failed", e);
        }

        return sentence;
    }

    /**
     * Overrides the named-entity type of every word that is matched by one of the custom
     * annotation patterns (case-insensitive) in {@code sentenceText}.
     *
     * <p>The first capture group of a pattern is used as the matched text; a pattern without
     * any capture group falls back to the whole match. Annotations with an empty or invalid
     * pattern are skipped.
     */
    private void applyCustomAnnotations(String sentenceText, List<Word> words,
                                        List<com.aurora.intelligence.scheduling.Annotation> customAnnotations) {
        for (com.aurora.intelligence.scheduling.Annotation customAnnotation : customAnnotations) {
            if (customAnnotation.getPattern() == null || customAnnotation.getPattern().equals("")) {
                continue;
            }

            Pattern pattern;
            try {
                pattern = Pattern.compile(customAnnotation.getPattern(), Pattern.CASE_INSENSITIVE);
            } catch (IllegalArgumentException e) {
                // PatternSyntaxException is an IllegalArgumentException: skip the bad pattern
                // rather than failing the whole sentence.
                LOGGER.error("Invalid custom annotation pattern " + customAnnotation.getPattern(), e);
                continue;
            }

            Matcher matcher = pattern.matcher(sentenceText);
            while (matcher.find()) {
                String matchGroup = matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
                if (matchGroup == null) {
                    // Group 1 exists in the pattern but did not take part in this match.
                    continue;
                }

                for (String matchedWord : matchGroup.split(" ")) {
                    for (int j = 0; j < words.size(); j++) {
                        Word w = words.get(j);
                        boolean previousWordMatches = j > 0
                                && matchedWord.contains(words.get(j - 1).getValue());
                        boolean nextWordMatches = j < words.size() - 1
                                && matchedWord.contains(words.get(j + 1).getValue());
                        boolean wordLength = w.getValue().length() > 1;
                        boolean wordsAreEqual = w.getValue().equals(matchedWord);
                        // A matched word that the tokenizer split into several tokens still
                        // counts when a neighbouring token is part of the same matched word.
                        boolean separatedWordsMatch = matchedWord.contains(w.getValue())
                                && (previousWordMatches || nextWordMatches);
                        if (wordLength && (wordsAreEqual || separatedWordsMatch)) {
                            w.setNamedEntityType(customAnnotation.getKey());
                        }
                    }
                }
            }
        }
    }

    /** Returns the elapsed time between two millisecond timestamps, in seconds. */
    private double calculateTime(long startTime, long endTime) {
        return (double) (endTime - startTime) / 1000;
    }

    /**
     * Finds every match of {@code pattern} in the projection and, for each match, splits the
     * text of groups {@code 0 .. groups-1} into "(word TAG)" chunks, each returned as the
     * list produced by splitting the chunk on a space.
     *
     * <p>NOTE(review): iteration starts at group 0, which {@code java.util.regex} defines as
     * the whole match, so each matched token is emitted once from group 0 and again from its
     * own capture group. This is kept as-is because consumers of the phrases are not visible
     * here — confirm before changing it.
     *
     * @param pattern             compiled phrase pattern
     * @param syntacticProjection the "((word TAG)(word TAG)...)" string of one sentence
     * @param groups              number of groups to read, counted from group 0
     * @return one entry per match; each entry is the list of [word, tag] chunks
     */
    private List<List<List<String>>> extractPhrase(Pattern pattern, String syntacticProjection, int groups) {
        List<List<List<String>>> result = new ArrayList<>();
        Matcher m = pattern.matcher(syntacticProjection);

        while (m.find()) {
            List<List<String>> completePhrase = new ArrayList<>();
            for (int i = 0; i < groups; i++) {
                String phrase = m.group(i);
                if (phrase == null || phrase.isEmpty()) {
                    continue;
                }

                // "(a DT)(b NN)" -> "a DT)b NN)" -> ["a DT", "b NN"]
                String[] chunks = phrase.replaceAll("\\(", "").split("\\)");
                for (String chunk : chunks) {
                    completePhrase.add(Arrays.asList(chunk.split(" ")));
                }
            }
            result.add(completePhrase);
        }

        return result;
    }

    /**
     * Runs the pipeline over {@code text} and returns its sentence annotations.
     *
     * @return the annotated sentences; an empty list when the pipeline attached none
     */
    private List<CoreMap> getCoreSentences(String text) {
        Annotation document = new Annotation(text);
        coreNLP.annotate(document);
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        // Guard against a missing annotation so callers can iterate without a null check.
        return sentences != null ? sentences : new ArrayList<>();
    }

    // TODO: IMPROVE punctuation clear
    /**
     * Removes every character that is not an ASCII letter or a space.
     *
     * @param text the text to clean
     * @return {@code text} with digits, punctuation and non-ASCII characters removed
     */
    public static String clearPunctuation(String text) {
        return NON_LETTER_OR_SPACE.matcher(text).replaceAll("");
    }
}