Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using edu.stanford.nlp.pipeline;
- using java.util;
- using Services.NLPService.Configuration;
- using Microsoft.ApplicationInsights;
- using Microsoft.Extensions.Options;
- using System;
- using System.Collections.Concurrent;
- using System.IO;
- using Microsoft.Extensions.Logging;
- namespace XApp.Services.NLPService.StanfordNLP
- {
/// <summary>
/// Creates and caches <see cref="StanfordCoreNLP"/> pipelines, one per distinct annotator list.
/// </summary>
/// <remarks>
/// The constructor eagerly builds the basic pipeline (tokenize, cleanxml, ssplit).
/// Building a pipeline temporarily switches the process working directory to the
/// "wwwroot/stanfordnlp-models" folder under the current directory; that switch is
/// serialized and always reverted, even when the load fails.
/// </remarks>
public class AnnotationFactory : IDisposable
{
    public const string AnnotatorPosKey = "pos";
    public const string AnnotatorLemmaKey = "lemma";
    public const string AnnotatorRegexnerKey = "regexner";

    // This key used to be "parse"; the "depparse" annotator is used instead.
    public const string AnnotatorParseKey = "depparse";

    // The working directory is process-wide state, so the gate is static: two loads
    // (for different annotator lists, or from different factory instances) must never
    // interleave their SetCurrentDirectory calls.
    private static readonly object CurrentDirectoryGate = new object();

    private readonly NLPParserConfig _nlpParserConfig;
    private readonly ConcurrentDictionary<string, Lazy<StanfordCoreNLP>> _stanfordCoreNLPConInstances =
        new ConcurrentDictionary<string, Lazy<StanfordCoreNLP>>();
    private readonly TelemetryClient _telemetryClient;
    private readonly ILogger _logger;
    private bool _disposed;

    /// <summary>
    /// Initializes the factory and immediately loads the basic pipeline.
    /// </summary>
    /// <param name="nlpParserConfig">Parser options; only <c>SentenceSplitEndOfLineOnly</c> is read by this class.</param>
    /// <param name="telemetryClient">Receives the model-load duration metric and the dispose metric.</param>
    /// <param name="logger">Receives start/finish/dispose messages (logged at Warning level).</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public AnnotationFactory(
        IOptions<NLPParserConfig> nlpParserConfig,
        TelemetryClient telemetryClient,
        ILogger<AnnotationFactory> logger)
    {
        if (nlpParserConfig == null)
        {
            throw new ArgumentNullException(nameof(nlpParserConfig));
        }

        if (telemetryClient == null)
        {
            throw new ArgumentNullException(nameof(telemetryClient));
        }

        if (logger == null)
        {
            throw new ArgumentNullException(nameof(logger));
        }

        _logger = logger;
        _telemetryClient = telemetryClient;
        _nlpParserConfig = nlpParserConfig.Value;

        _logger.LogWarning(
            "* StanfordNLP Annotation Factory initializing started at {StartedAt}",
            DateTime.Now.ToLongTimeString());

        // Warm up the basic instance so the first caller does not pay the load cost.
        GetAnnotator();
    }

    /// <summary>
    /// Returns the cached pipeline for the basic annotators (tokenize, cleanxml, ssplit)
    /// plus the requested ones, loading it on first use.
    /// </summary>
    /// <param name="annotatorKeywordParams">
    /// Extra annotator names (for example <see cref="AnnotatorPosKey"/>) appended, in the given
    /// order, to the basic list. <c>null</c> or empty yields the basic pipeline only.
    /// </param>
    /// <returns>The pipeline associated with exactly this annotator list.</returns>
    public StanfordCoreNLP GetAnnotator(params string[] annotatorKeywordParams)
    {
        const string basicAnnotatorKeywords = "tokenize, cleanxml, ssplit, ";

        // The joined text is both the cache key and the value of the "annotators" property.
        string annotatorKeywords =
            basicAnnotatorKeywords + string.Join(", ", annotatorKeywordParams ?? Array.Empty<string>());

        // Lazy<T> keeps the expensive load from running more than once per key even when
        // several threads race through GetOrAdd with the same annotator list.
        return _stanfordCoreNLPConInstances
            .GetOrAdd(annotatorKeywords, key => new Lazy<StanfordCoreNLP>(() => LoadPipeline(key)))
            .Value;
    }

    /// <summary>
    /// Builds a new pipeline for <paramref name="annotatorKeywords"/> and reports how long it took.
    /// </summary>
    private StanfordCoreNLP LoadPipeline(string annotatorKeywords)
    {
        lock (CurrentDirectoryGate)
        {
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            _logger.LogWarning("* NLP Model loading started at {StartedAt}", DateTime.Now.ToLongTimeString());

            string originalDir = Environment.CurrentDirectory;

            // Separate segments instead of a hard-coded '\' so the path is not tied to one OS.
            string modelDataDir = Path.GetFullPath(Path.Combine(originalDir, "wwwroot", "stanfordnlp-models"));

            Properties props = GetAnnotatorSettings();
            props.setProperty("annotators", annotatorKeywords);

            StanfordCoreNLP pipeline;

            // NOTE(review): presumably required so relative resources such as "custom-ner/..."
            // resolve against the models folder - confirm.
            Directory.SetCurrentDirectory(modelDataDir);
            try
            {
                pipeline = new StanfordCoreNLP(props);
            }
            finally
            {
                // Never leave the process in the models folder, even if loading throws.
                Directory.SetCurrentDirectory(originalDir);
            }

            stopwatch.Stop();

            // TotalSeconds, not Seconds: Seconds is only the 0-59 component of the elapsed time.
            double elapsedSeconds = stopwatch.Elapsed.TotalSeconds;
            _telemetryClient.TrackMetric($"* StanfordNLP models loaded with: {annotatorKeywords}", elapsedSeconds);
            _logger.LogWarning(
                "* NLP Models loaded with \"{AnnotatorKeywords}\" ! - {ElapsedSeconds:F1} seconds. Ended on: {EndedAt}",
                annotatorKeywords,
                elapsedSeconds,
                DateTime.Now.ToLongTimeString());

            return pipeline;
        }
    }

    /// <summary>
    /// Creates the property set shared by every pipeline; the caller adds the "annotators" entry.
    /// </summary>
    /// <returns>A new <see cref="Properties"/> holding the ssplit, ner and regexner settings.</returns>
    private Properties GetAnnotatorSettings()
    {
        // TODO: Refactor > cleaner way to set properties, e.g. PropertiesUtils.asProperties(...).
        var props = new Properties();

        // ---- ssplit
        // Split sentences at and only at newlines. StanfordCoreNLP will treat the input as already
        // tokenized and one sentence per line, only separating words on whitespace.
        bool eolOnly = _nlpParserConfig.SentenceSplitEndOfLineOnly == true;
        props.setProperty("ssplit.eolonly", eolOnly ? "true" : "false");

        // ---- pos / depparse / parse
        // No overrides are set here. Options that were tried earlier and left disabled:
        //   pos.model      : wsj-0-18-caseless-left3words-distsim, english-bidirectional-distsim,
        //                    english-left3words-distsim (noted as the default)
        //   depparse.model : edu/stanford/nlp/models/parser/nndep/english_UD.gz
        //   parse.model    : lexparser/englishPCFG.ser.gz (noted as the default), or
        //                    srparser/englishSR.ser.gz - the shift-reduce parser is much faster and
        //                    more memory efficient, but takes quite a while to load and is a much
        //                    larger download
        //   parse.debug, parse.nthreads, parse.maxlen, parse.keepPunct, parse.buildgraphs,
        //   ssplit.newLineIsSentenceBreak

        // ---- ner annotator settings
        props.setProperty("ner.useSUTime", "false");
        props.setProperty("ner.applyNumericClassifiers", "false");

        // ---- regexner annotator settings
        // !!!! new StanfordCoreNLP() is slow (3+ min) because of en-phrase-noun.txt.
        props.setProperty("regexner.mapping", "custom-ner/en-phrase-noun.txt;custom-ner/en-phrase-verb.txt;custom-ner/en-phrase-adjective.txt;custom-ner/en-phrase-adverb.txt");

        // Adding a TokensRegexNERAnnotator to the pipeline directly was also tried; it attempts
        // to load the standard NER definitions along with regexner, whereas the mapping property
        // above loads regexner only.
        props.setProperty("regexner.ignorecase", "true");
        props.setProperty("regexner.verbose", "true");

        // A "normalized" column was added to every line of the custom NER files,
        // e.g.: assert (hers|herself|him|himself|hiss...) VerbPhrase assert_ones
        // To tell whether an entry came from wordnet or wiki, the value can be suffixed: assert_ones:wiki
        props.setProperty("regexner.mapping.header", "pattern,ner,normalized");
        props.setProperty("regexner.mapping.field.normalized", "edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation");

        return props;
    }

    /// <summary>
    /// Drops every cached pipeline and records the disposal. Calls after the first are no-ops.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // Only the references are released here; no per-pipeline cleanup is performed.
        _stanfordCoreNLPConInstances.Clear();
        _telemetryClient.TrackMetric("*** StanfordParser3-Dispose", 0);
        _logger.LogWarning("*** StanfordCoreParser disposed at {DisposedAtUtc}.", DateTime.UtcNow);
    }
}
- }
Add Comment
Please, Sign In to add comment