Guest User

Untitled

a guest
Mar 19th, 2018
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.56 KB | None | 0 0
  1. using edu.stanford.nlp.pipeline;
  2. using java.util;
  3. using Services.NLPService.Configuration;
  4. using Microsoft.ApplicationInsights;
  5. using Microsoft.Extensions.Options;
  6. using System;
  7. using System.Collections.Concurrent;
  8. using System.IO;
  9. using Microsoft.Extensions.Logging;
  10.  
  namespace XApp.Services.NLPService.StanfordNLP
  {
      /// <summary>
      /// Builds and caches <see cref="StanfordCoreNLP"/> pipelines, one per distinct annotator list.
      /// The basic pipeline (tokenize, cleanxml, ssplit) is loaded eagerly from the constructor;
      /// any other annotator combination is loaded on first request and then reused.
      /// </summary>
      public class AnnotationFactory : IDisposable
      {
          public const string AnnotatorPosKey = "pos";
          public const string AnnotatorLemmaKey = "lemma";
          public const string AnnotatorRegexnerKey = "regexner";
          //public const string AnnotatorParseKey = "parse";
          public const string AnnotatorParseKey = "depparse";

          // Prefix of every annotator list; it doubles as the prefix of the cache key.
          private const string BasicAnnotatorKeywords = "tokenize, cleanxml, ssplit, ";

          // The current directory is process-wide state, so the temporary directory switch done
          // while loading models must never interleave between two concurrent loads.
          private static readonly object ModelLoadLock = new object();

          private readonly NLPParserConfig _nlpParserConfig;
          private readonly ConcurrentDictionary<string, Lazy<StanfordCoreNLP>> _stanfordCoreNLPConInstances =
              new ConcurrentDictionary<string, Lazy<StanfordCoreNLP>>();
          private readonly TelemetryClient _telemetryClient;
          private readonly ILogger _logger;

          /// <summary>
          /// Stores the dependencies and immediately loads the basic pipeline.
          /// </summary>
          /// <param name="nlpParserConfig">Options wrapper holding the parser configuration.</param>
          /// <param name="telemetryClient">Client used to record model-load and dispose metrics.</param>
          /// <param name="logger">Logger for load / dispose progress messages.</param>
          /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
          public AnnotationFactory(
              IOptions<NLPParserConfig> nlpParserConfig,
              TelemetryClient telemetryClient,
              ILogger<AnnotationFactory> logger)
          {
              if (nlpParserConfig == null)
              {
                  throw new ArgumentNullException(nameof(nlpParserConfig));
              }

              if (telemetryClient == null)
              {
                  throw new ArgumentNullException(nameof(telemetryClient));
              }

              if (logger == null)
              {
                  throw new ArgumentNullException(nameof(logger));
              }

              _logger = logger;
              _logger.LogWarning($"* StanfordNLP Annotation Factory initializing started at {DateTime.Now.ToLongTimeString()}");

              // NOTE(review): this recolors the console for the whole process and is never reset;
              // it looks like a debugging aid - confirm whether it is still wanted.
              System.Console.BackgroundColor = ConsoleColor.Red;

              _nlpParserConfig = nlpParserConfig.Value;
              _telemetryClient = telemetryClient;

              //initialize basic instance
              GetAnnotator();
          }

          /// <summary>
          /// Returns the cached pipeline for the basic annotators (tokenize, cleanxml, ssplit)
          /// plus the requested extra annotators, loading it on first use.
          /// Passing no arguments (or <c>null</c>) yields the basic pipeline only.
          /// </summary>
          /// <param name="annotatorKeywordParams">
          /// Extra annotator names, e.g. <see cref="AnnotatorPosKey"/>. The order matters:
          /// the joined list is used verbatim as the cache key and as the "annotators" property.
          /// </param>
          /// <returns>The pipeline configured with the resulting annotator list.</returns>
          public StanfordCoreNLP GetAnnotator(params string[] annotatorKeywordParams)
          {
              // An explicit null argument is treated like "no extra annotators" instead of
              // letting string.Join throw.
              string[] extraAnnotators = annotatorKeywordParams ?? new string[0];
              string annotatorKeywords = BasicAnnotatorKeywords + string.Join(", ", extraAnnotators);

              // GetOrAdd may invoke the value factory more than once under contention, but only one
              // Lazy is stored; the expensive load itself runs when .Value is first read.
              Lazy<StanfordCoreNLP> cachedPipeline = _stanfordCoreNLPConInstances.GetOrAdd(
                  annotatorKeywords,
                  key => new Lazy<StanfordCoreNLP>(() => LoadPipeline(key)));

              return cachedPipeline.Value;
          }

          /// <summary>
          /// Creates a new pipeline for the given annotator list and records how long it took.
          /// </summary>
          /// <param name="annotatorKeywords">Comma-separated annotator list.</param>
          /// <returns>The newly constructed pipeline.</returns>
          private StanfordCoreNLP LoadPipeline(string annotatorKeywords)
          {
              var stopwatch = System.Diagnostics.Stopwatch.StartNew();
              _logger.LogWarning($"* NLP Model loading started at {DateTime.Now.ToLongTimeString()}");

              StanfordCoreNLP pipeline;
              lock (ModelLoadLock)
              {
                  string originalDir = Environment.CurrentDirectory;
                  string modelDataDir = Path.GetFullPath(Path.Combine(originalDir, "wwwroot", "stanfordnlp-models"));

                  // The pipeline is constructed with the models folder as current directory -
                  // presumably so relative paths such as "custom-ner/..." resolve against it.
                  Directory.SetCurrentDirectory(modelDataDir);
                  try
                  {
                      Properties props = GetAnnotatorSettings();
                      props.setProperty("annotators", annotatorKeywords);

                      pipeline = new StanfordCoreNLP(props);
                      //System.Console.WriteLine($"Annotation timings: {_pipeline.timingInformation()}");
                  }
                  finally
                  {
                      // Always restore the directory, even when model loading throws; otherwise the
                      // next load would resolve the models folder relative to the wrong location.
                      Directory.SetCurrentDirectory(originalDir);
                  }
              }

              stopwatch.Stop();

              // TotalSeconds, not Seconds: Seconds is only the 0-59 component of the elapsed time
              // and under-reports any load that takes a minute or longer.
              double elapsedSeconds = stopwatch.Elapsed.TotalSeconds;
              _telemetryClient.TrackMetric($"* StanfordNLP models loaded with: {annotatorKeywords}", elapsedSeconds);
              _logger.LogWarning($"* NLP Models loaded with \"{annotatorKeywords}\" ! - {elapsedSeconds:F1} seconds. Ended on: {DateTime.Now.ToLongTimeString()}");

              return pipeline;
          }

          /// <summary>
          /// Builds the property set shared by every pipeline; the caller adds the
          /// "annotators" property afterwards.
          /// </summary>
          /// <returns>A new <see cref="Properties"/> instance with ssplit, ner and regexner settings.</returns>
          private Properties GetAnnotatorSettings()
          {
              //TODO:Refactor > Cleaner way to set properties
              //Properties propsX = edu.stanford.nlp.util.PropertiesUtils.asProperties(
              // //"annotators", "tokenize,ssplit,pos,depparse",
              // "depparse.model", edu.stanford.nlp.parser.nndep.DependencyParser.DEFAULT_MODEL
              //);

              var props = new Properties();

              //---- ssplit
              //Split sentences at and only at newlines.
              //StanfordCoreNLP will treat the input as already tokenized and one sentence per line,
              //only separating words on whitespace.
              bool eolOnly = _nlpParserConfig.SentenceSplitEndOfLineOnly.HasValue && _nlpParserConfig.SentenceSplitEndOfLineOnly.Value;
              // Explicit literals instead of ToString().ToLower(), which depends on the current culture.
              props.setProperty("ssplit.eolonly", eolOnly ? "true" : "false");
              //props.setProperty("ssplit.newLineIsSentenceBreak", "always");

              //---- pos annotator settings
              //props.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/wsj-0-18-caseless-left3words-distsim.tagger");
              //props.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger");
              //default:
              //props.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");

              //---- depparse
              //depparse.model > edu/stanford/nlp/models/parser/nndep/english_UD.gz

              //---- parse annotator settings,
              //Whether to print verbose messages while parsing.
              //props.setProperty("parse.debug", "true"); //>default false

              //!!Number of threads to use for parsing.
              //props.setProperty("parse.nthreads", "2"); //> default=1

              //default model
              //props.setProperty("parse.model", "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
              //There is a much faster and more memory efficient parser available in the shift reduce parser.
              //It takes quite a while to load, and the download is much larger, which is the main reason it is not the default.
              //props.setProperty("parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
              //If set to a positive number, the annotator parses only sentences of length at most this number (in terms of number of tokens).
              //props.setProperty("parse.maxlen", "25");
              //at the output
              //props.setProperty("parse.keepPunct", "false"); //>punct(think-2, .-9)

              //Generate dependency representations of the sentence, stored under the three Dependencies annotations mentioned in the introduction.
              //parse.buildgraphs > true

              //---- ner annotator settings
              props.setProperty("ner.useSUTime", "false");
              props.setProperty("ner.applyNumericClassifiers", "false");

              //---- regexner annotator settings
              //!!!! new StanfordCoreNLP() is slow (3+ min.) because of en-phrase-noun.txt
              props.setProperty("regexner.mapping", "custom-ner/en-phrase-noun.txt;custom-ner/en-phrase-verb.txt;custom-ner/en-phrase-adjective.txt;custom-ner/en-phrase-adverb.txt");
              //props.setProperty("regexner.mapping", "custom-ner/en-phrase-verb.txt;custom-ner/en-phrase-adjective.txt;custom-ner/en-phrase-adverb.txt");
              //props.setProperty("regexner.mapping", "custom-ner/en-phrase-adverb.txt");

              //Together with regexner it also tries to load the standard NER definitions. / above, only regexner was loaded
              // _pipeline.addAnnotator(new TokensRegexNERAnnotator("custom-ner/en-phrase-noun.txt;custom-ner/en-phrase-verb.txt;custom-ner/en-phrase-adjective.txt;custom-ner/en-phrase-adverb.txt", true));
              //props.setProperty("regexner.posmatchtype", "MATCH_ALL_TOKENS");
              props.setProperty("regexner.ignorecase", "true");
              props.setProperty("regexner.verbose", "true");
              //A "normalized" column was added to every line of the custom NER files
              //e.g.: assert (hers|herself|him|himself|hiss...) VerbPhrase assert_ones
              //to tell whether an entry came from wordnet or wiki: assert_ones:wiki
              props.setProperty("regexner.mapping.header", "pattern,ner,normalized");
              //props.setProperty("regexner.mapping.header", "pattern,ner,normalized, overwrite, priority,group");
              props.setProperty("regexner.mapping.field.normalized", "edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation");

              return props;
          }

          /// <summary>
          /// Drops every cached pipeline reference and records the disposal in telemetry and the log.
          /// The pipelines themselves are not explicitly shut down here; only the cache is emptied.
          /// </summary>
          public void Dispose()
          {
              _stanfordCoreNLPConInstances.Clear();

              _telemetryClient.TrackMetric("*** StanfordParser3-Dispose", 0);
              _logger.LogWarning($"*** StanfordCoreParser disposed at {DateTime.UtcNow}.");
          }
      }
  }
Add Comment
Please, Sign In to add comment