Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package com.aurora.intelligence;
import com.aurora.intelligence.scheduling.*;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
- @Service
- public class SyntaxService {
- @Value("${analyzerThreads}")
- private int analyzerThreads;
- @Autowired
- private DictionaryService dictionaryService;
- private static final Logger LOGGER = LoggerFactory.getLogger(SyntaxService.class);
- private static final List<String> PUNCTUATIONAL_MARKS = Arrays
- .asList(".", "?", "!", ":", ";", "-", "—", "(", ")", "[", "]", "...", "“", "”", "/", "/");
- private StanfordCoreNLP coreNLP;
- private ExecutorService textExecutor;
- @PostConstruct
- public void setup() {
- Properties properties = new Properties();
- properties.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
- properties.setProperty("threads", "8");
- coreNLP = new StanfordCoreNLP(properties);
- textExecutor = Executors.newFixedThreadPool(analyzerThreads);
- }
- public List<Word> NERAnalysis(String text) {
- List<Word> result = new ArrayList<>();
- List<CoreMap> coreSentences = getCoreSentences(text);
- int sentenceCounter = 0;
- coreSentences.parallelStream().forEach(cs -> {
- LOGGER.info("processing sentence " + sentenceCounter);
- for (CoreLabel token : cs.get(CoreAnnotations.TokensAnnotation.class)) {
- Word word = new Word();
- String wordValue = token.get(CoreAnnotations.TextAnnotation.class);
- String nameEntity = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
- word.setValue(wordValue);
- word.setNamedEntityType(nameEntity);
- result.add(word);
- }
- });
- return result;
- }
- public AnalyzedText analyze(String text) throws InvalidBeanPassedForDBProcedure, InterruptedException {
- return analyze(text, new ArrayList<>(), null);
- }
- public AnalyzedText analyze(String text, List<com.aurora.intelligence.scheduling.Annotation> customAnnotations, String statementId)
- throws InvalidBeanPassedForDBProcedure, InterruptedException {
- text = text.replaceAll("/[^A-Za-z0-9]/", "");
- LOGGER.info("new text : " + text);
- String nounPhrasePattern = "((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)(\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))";
- String verbPhrasePattern = "((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))+)";
- AnalyzedText result = new AnalyzedText();
- List<Sentence> sentences = new ArrayList<>();
- List<CoreMap> coreSentences = getCoreSentences(text);
- CountDownLatch latch = new CountDownLatch(coreSentences.size());
- coreSentences.forEach(cs -> {
- textExecutor.submit(new Runnable() {
- @Override
- public void run() {
- Sentence sentence = new Sentence();
- List<Word> words = new ArrayList<>();
- String strSentence = "";
- Tree tree = cs.get(SentimentCoreAnnotations.AnnotatedTree.class);
- int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
- sentence.setSentimentScore(sentiment);
- sentence.setIndex(cs.get(CoreAnnotations.IndexAnnotation.class));
- String syntacticProjection = "(";
- long startTime = System.currentTimeMillis();
- List<CoreLabel> tokens = new ArrayList<>(cs.get(CoreAnnotations.TokensAnnotation.class));
- long endTime = System.currentTimeMillis();
- double time = calculateTime(startTime, endTime);
- LOGGER.info("Tokenization for " + statementId + " took " + time + " seconds");
- for (CoreLabel token : tokens) {
- String wordValue = token.get(CoreAnnotations.TextAnnotation.class);
- strSentence += wordValue + " ";
- String nameEntity = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
- String partOfSpeech = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
- String gender = token.get(MachineReadingAnnotations.GenderAnnotation.class);
- Word word = new Word();
- if (gender != null && !gender.isEmpty()) {
- word.setGender(gender);
- }
- word.setValue(wordValue);
- word.setNamedEntityType(nameEntity);
- word.setLexicalCategory(LexicalCategory.getInstanceFromCodeValue(partOfSpeech));
- words.add(word);
- syntacticProjection += "(" + wordValue + " " + partOfSpeech + ")";
- }
- LOGGER.info("Processing sentence " + strSentence + " for " + statementId);
- syntacticProjection += ")";
- sentence.setWords(words);
- sentence.setNounPhrases(extractPhrase(nounPhrasePattern, syntacticProjection, 8));
- sentence.setVerbPhrases(extractPhrase(verbPhrasePattern, syntacticProjection, 4));
- if (customAnnotations != null) {
- String sentenceString = cs.get(CoreAnnotations.TextAnnotation.class);
- for (com.aurora.intelligence.scheduling.Annotation customAnnotation : customAnnotations) {
- if (customAnnotation.getPattern() == null || customAnnotation.getPattern().equals("")) {
- continue;
- }
- Pattern pattern = Pattern.compile(customAnnotation.getPattern(), Pattern.CASE_INSENSITIVE);
- Matcher matcher = pattern.matcher(sentenceString);
- while (matcher.find()) {
- String matchGroup = matcher.group(1);
- for (String matchedWord : matchGroup.split(" ")) {
- for (int j = 0; j < words.size(); j++) {
- Word w = words.get(j);
- boolean previousWordMatches = j > 0
- && matchedWord.contains(words.get(j - 1).getValue());
- boolean nextWordMatches = j < words.size() - 1
- && matchedWord.contains(words.get(j + 1).getValue());
- boolean wordLength = w.getValue().length() > 1;
- boolean wordsAreEqual = w.getValue().equals(matchedWord);
- boolean separatedWordsMatch = matchedWord.contains(w.getValue())
- && (previousWordMatches || nextWordMatches);
- if (wordLength && (wordsAreEqual || separatedWordsMatch)) {
- w.setNamedEntityType(customAnnotation.getKey());
- }
- }
- }
- }
- }
- }
- try {
- startTime = System.currentTimeMillis();
- sentence.setFrequencyAnalysis(dictionaryService.frequencyAnalysis(strSentence.trim(),
- sentence.getNounsAssembledPhrases(), sentence.getVerbsAssembledPhrases()));
- endTime = System.currentTimeMillis();
- time = calculateTime(startTime, endTime);
- LOGGER.info("Frequency analysys for " + statementId + " took " + time + " seconds");
- } catch (Exception e) {
- LOGGER.error("Freqency analysis for " + statementId + "failed");
- }
- LOGGER.info(statementId + " has " + (coreSentences.size() - latch.getCount() + 1)
- + " / " + coreSentences.size());
- latch.countDown();
- sentences.add(sentence);
- }
- });
- });
- try {
- latch.await();
- LOGGER.info("Processing sentences done");
- result.setSentences(sentences);
- return result;
- } catch (InterruptedException e) {
- throw e;
- }
- }
- private double calculateTime(long startTime, long endTime) {
- return (double) (endTime - startTime) / 1000;
- }
- private List<List<List<String>>> extractPhrase(String pattern, String syntacticProjection, int groups) {
- Pattern p = Pattern.compile(pattern);
- Matcher m = p.matcher(syntacticProjection);
- List<List<List<String>>> result = new ArrayList<>();
- while (m.find()) {
- List<List<String>> completePhrase = new ArrayList<>();
- for (int i = 0; i < groups; i++) {
- String phrase = m.group(i);
- if (phrase != null && !phrase.isEmpty()) {
- phrase = phrase.replaceAll("\\(", "");
- String[] chunks = phrase.split("\\)");
- for (String chunk : chunks) {
- List<String> word = Arrays.asList(chunk.split(" "));
- completePhrase.add(word);
- }
- }
- }
- result.add(completePhrase);
- }
- return result;
- }
- private List<CoreMap> getCoreSentences(String text) {
- Annotation document = new Annotation(text);
- coreNLP.annotate(document);
- return document.get(CoreAnnotations.SentencesAnnotation.class);
- }
- // TODO: IMPROVE punctuation clear
- public static String clearPunctuation(String text) {
- return text.replaceAll("[^a-zA-Z ]", "");
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement