Advertisement
viraco4a

Suggestion Syntax Service

Apr 15th, 2019
161
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.42 KB | None | 0 0
  1. package com.aurora.intelligence;
  2.  
  import com.aurora.intelligence.scheduling.*;
  import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
  import edu.stanford.nlp.ling.CoreAnnotations;
  import edu.stanford.nlp.ling.CoreLabel;
  import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
  import edu.stanford.nlp.pipeline.Annotation;
  import edu.stanford.nlp.pipeline.StanfordCoreNLP;
  import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
  import edu.stanford.nlp.trees.Tree;
  import edu.stanford.nlp.util.CoreMap;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;
  import org.springframework.beans.factory.annotation.Autowired;
  import org.springframework.beans.factory.annotation.Value;
  import org.springframework.stereotype.Service;

  import javax.annotation.PostConstruct;
  import javax.annotation.PreDestroy;
  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.List;
  import java.util.Properties;
  import java.util.concurrent.CountDownLatch;
  import java.util.concurrent.ExecutorService;
  import java.util.concurrent.Executors;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  import java.util.regex.PatternSyntaxException;
  29.  
  30. @Service
  31. public class SyntaxService {
  32.  
  33. @Value("${analyzerThreads}")
  34. private int analyzerThreads;
  35.  
  36. @Autowired
  37. private DictionaryService dictionaryService;
  38.  
  39. private static final Logger LOGGER = LoggerFactory.getLogger(SyntaxService.class);
  40. private static final List<String> PUNCTUATIONAL_MARKS = Arrays
  41. .asList(".", "?", "!", ":", ";", "-", "—", "(", ")", "[", "]", "...", "“", "”", "/", "/");
  42.  
  43. private StanfordCoreNLP coreNLP;
  44. private ExecutorService textExecutor;
  45.  
  46. @PostConstruct
  47. public void setup() {
  48. Properties properties = new Properties();
  49. properties.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
  50. properties.setProperty("threads", "8");
  51. coreNLP = new StanfordCoreNLP(properties);
  52. textExecutor = Executors.newFixedThreadPool(analyzerThreads);
  53. }
  54.  
  55. public List<Word> NERAnalysis(String text) {
  56. List<Word> result = new ArrayList<>();
  57.  
  58. List<CoreMap> coreSentences = getCoreSentences(text);
  59.  
  60. int sentenceCounter = 0;
  61. coreSentences.parallelStream().forEach(cs -> {
  62. LOGGER.info("processing sentence " + sentenceCounter);
  63.  
  64. for (CoreLabel token : cs.get(CoreAnnotations.TokensAnnotation.class)) {
  65. Word word = new Word();
  66. String wordValue = token.get(CoreAnnotations.TextAnnotation.class);
  67. String nameEntity = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
  68.  
  69. word.setValue(wordValue);
  70. word.setNamedEntityType(nameEntity);
  71. result.add(word);
  72. }
  73. });
  74.  
  75. return result;
  76. }
  77.  
  78. public AnalyzedText analyze(String text) throws InvalidBeanPassedForDBProcedure, InterruptedException {
  79. return analyze(text, new ArrayList<>(), null);
  80. }
  81.  
  82. public AnalyzedText analyze(String text, List<com.aurora.intelligence.scheduling.Annotation> customAnnotations, String statementId)
  83. throws InvalidBeanPassedForDBProcedure, InterruptedException {
  84.  
  85. text = text.replaceAll("/[^A-Za-z0-9]/", "");
  86. LOGGER.info("new text : " + text);
  87. String nounPhrasePattern = "((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)((?:\\([^\\(\\)\\s]+\\sJJ[RS]?\\)|\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))*)((?:\\([^\\(\\)\\s]+\\sDT\\))*?)(\\([^\\(\\)\\s]+\\sNN[SP]?S?\\))";
  88. String verbPhrasePattern = "((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))*)((?:\\([^\\(\\)\\s]+\\sIN\\))?)((?:\\([^\\(\\)\\s]+\\sVB[GDN]?\\)|\\([^\\(\\)\\s]+\\sRB\\))+)";
  89.  
  90. AnalyzedText result = new AnalyzedText();
  91. List<Sentence> sentences = new ArrayList<>();
  92.  
  93. List<CoreMap> coreSentences = getCoreSentences(text);
  94.  
  95. CountDownLatch latch = new CountDownLatch(coreSentences.size());
  96.  
  97. coreSentences.forEach(cs -> {
  98. textExecutor.submit(new Runnable() {
  99. @Override
  100. public void run() {
  101. Sentence sentence = new Sentence();
  102. List<Word> words = new ArrayList<>();
  103. String strSentence = "";
  104.  
  105. Tree tree = cs.get(SentimentCoreAnnotations.AnnotatedTree.class);
  106.  
  107. int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
  108. sentence.setSentimentScore(sentiment);
  109. sentence.setIndex(cs.get(CoreAnnotations.IndexAnnotation.class));
  110.  
  111. String syntacticProjection = "(";
  112.  
  113. long startTime = System.currentTimeMillis();
  114.  
  115. List<CoreLabel> tokens = new ArrayList<>(cs.get(CoreAnnotations.TokensAnnotation.class));
  116.  
  117. long endTime = System.currentTimeMillis();
  118. double time = calculateTime(startTime, endTime);
  119. LOGGER.info("Tokenization for " + statementId + " took " + time + " seconds");
  120.  
  121. for (CoreLabel token : tokens) {
  122. String wordValue = token.get(CoreAnnotations.TextAnnotation.class);
  123. strSentence += wordValue + " ";
  124.  
  125. String nameEntity = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
  126. String partOfSpeech = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
  127. String gender = token.get(MachineReadingAnnotations.GenderAnnotation.class);
  128.  
  129. Word word = new Word();
  130.  
  131. if (gender != null && !gender.isEmpty()) {
  132. word.setGender(gender);
  133. }
  134. word.setValue(wordValue);
  135. word.setNamedEntityType(nameEntity);
  136. word.setLexicalCategory(LexicalCategory.getInstanceFromCodeValue(partOfSpeech));
  137. words.add(word);
  138. syntacticProjection += "(" + wordValue + " " + partOfSpeech + ")";
  139. }
  140.  
  141. LOGGER.info("Processing sentence " + strSentence + " for " + statementId);
  142. syntacticProjection += ")";
  143.  
  144. sentence.setWords(words);
  145. sentence.setNounPhrases(extractPhrase(nounPhrasePattern, syntacticProjection, 8));
  146. sentence.setVerbPhrases(extractPhrase(verbPhrasePattern, syntacticProjection, 4));
  147.  
  148. if (customAnnotations != null) {
  149. String sentenceString = cs.get(CoreAnnotations.TextAnnotation.class);
  150. for (com.aurora.intelligence.scheduling.Annotation customAnnotation : customAnnotations) {
  151.  
  152. if (customAnnotation.getPattern() == null || customAnnotation.getPattern().equals("")) {
  153. continue;
  154. }
  155. Pattern pattern = Pattern.compile(customAnnotation.getPattern(), Pattern.CASE_INSENSITIVE);
  156.  
  157. Matcher matcher = pattern.matcher(sentenceString);
  158.  
  159. while (matcher.find()) {
  160. String matchGroup = matcher.group(1);
  161. for (String matchedWord : matchGroup.split(" ")) {
  162. for (int j = 0; j < words.size(); j++) {
  163. Word w = words.get(j);
  164. boolean previousWordMatches = j > 0
  165. && matchedWord.contains(words.get(j - 1).getValue());
  166. boolean nextWordMatches = j < words.size() - 1
  167. && matchedWord.contains(words.get(j + 1).getValue());
  168. boolean wordLength = w.getValue().length() > 1;
  169. boolean wordsAreEqual = w.getValue().equals(matchedWord);
  170. boolean separatedWordsMatch = matchedWord.contains(w.getValue())
  171. && (previousWordMatches || nextWordMatches);
  172. if (wordLength && (wordsAreEqual || separatedWordsMatch)) {
  173. w.setNamedEntityType(customAnnotation.getKey());
  174. }
  175. }
  176. }
  177. }
  178. }
  179. }
  180.  
  181. try {
  182. startTime = System.currentTimeMillis();
  183. sentence.setFrequencyAnalysis(dictionaryService.frequencyAnalysis(strSentence.trim(),
  184. sentence.getNounsAssembledPhrases(), sentence.getVerbsAssembledPhrases()));
  185.  
  186. endTime = System.currentTimeMillis();
  187. time = calculateTime(startTime, endTime);
  188.  
  189. LOGGER.info("Frequency analysys for " + statementId + " took " + time + " seconds");
  190. } catch (Exception e) {
  191. LOGGER.error("Freqency analysis for " + statementId + "failed");
  192. }
  193.  
  194. LOGGER.info(statementId + " has " + (coreSentences.size() - latch.getCount() + 1)
  195. + " / " + coreSentences.size());
  196. latch.countDown();
  197. sentences.add(sentence);
  198.  
  199. }
  200. });
  201. });
  202.  
  203. try {
  204. latch.await();
  205. LOGGER.info("Processing sentences done");
  206. result.setSentences(sentences);
  207.  
  208. return result;
  209. } catch (InterruptedException e) {
  210. throw e;
  211. }
  212. }
  213.  
  214. private double calculateTime(long startTime, long endTime) {
  215. return (double) (endTime - startTime) / 1000;
  216. }
  217.  
  218. private List<List<List<String>>> extractPhrase(String pattern, String syntacticProjection, int groups) {
  219. Pattern p = Pattern.compile(pattern);
  220. Matcher m = p.matcher(syntacticProjection);
  221. List<List<List<String>>> result = new ArrayList<>();
  222. while (m.find()) {
  223.  
  224. List<List<String>> completePhrase = new ArrayList<>();
  225. for (int i = 0; i < groups; i++) {
  226. String phrase = m.group(i);
  227. if (phrase != null && !phrase.isEmpty()) {
  228.  
  229. phrase = phrase.replaceAll("\\(", "");
  230.  
  231. String[] chunks = phrase.split("\\)");
  232.  
  233. for (String chunk : chunks) {
  234.  
  235. List<String> word = Arrays.asList(chunk.split(" "));
  236. completePhrase.add(word);
  237. }
  238. }
  239. }
  240.  
  241. result.add(completePhrase);
  242. }
  243.  
  244. return result;
  245. }
  246.  
  247.  
  248. private List<CoreMap> getCoreSentences(String text) {
  249. Annotation document = new Annotation(text);
  250. coreNLP.annotate(document);
  251. return document.get(CoreAnnotations.SentencesAnnotation.class);
  252. }
  253.  
  254. // TODO: IMPROVE punctuation clear
  255. public static String clearPunctuation(String text) {
  256. return text.replaceAll("[^a-zA-Z ]", "");
  257. }
  258. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement