Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// Tags runs of undefined tokens with the entity label predicted by
/// <see cref="ClassifierEngineData"/>. Neighbouring phrases are merged step by step
/// (one "generation" per step) and a merge is kept only when it compares favourably
/// with the phrase it was built from.
/// </summary>
public class NewFieldTaggingManager
{
    /// <summary>Label meaning "does not belong to any known entity".</summary>
    public const string Other = "<Other>";

    /// <summary>A phrase weight must be strictly greater than this value for its label to be applied.</summary>
    private const double threshold = 0.6;

    /// <summary>Minimal weight gain for a merged phrase with an unchanged label to count as better.</summary>
    private const double epsilon = 0;

    /// <summary>Upper bound on the number of merge generations tried per token group.</summary>
    private const int MaxGenerations = 5;

    /// <summary>Words that are always glued to the token that follows them.</summary>
    private readonly HashSet<string> prepositions =
        new HashSet<string>(StringComparer.OrdinalIgnoreCase) { "with", "without", "w/", "w/o" };

    /// <summary>A run of indexes described by its first index and its length.</summary>
    private class IndexRange
    {
        public int Start { get; }

        public int Length { get; set; }

        public IndexRange(int start, int length)
        {
            Start = start;
            Length = length;
        }

        public override string ToString() => $"{Start};{Length}";
    }

    /// <summary>Outcome of comparing a merged phrase with the phrase it was built from.</summary>
    private enum CompareStatus
    {
        None,
        Better,
        Worse,
        Questionable
    }

    /// <summary>
    /// A candidate phrase: the tokens it covers, the text sent to the classifier and the
    /// prediction received for that text.
    /// </summary>
    private class WordsPhrase
    {
        /// <summary>Phrase this one was expanded from; null for an initial phrase.</summary>
        public WordsPhrase Parent { get; set; }

        /// <summary>Text that was passed to the classifier.</summary>
        public string Phrase { get; }

        /// <summary>Weight reported by the classifier for <see cref="Label"/>.</summary>
        public double Weight { get; }

        /// <summary>Label predicted for <see cref="Phrase"/>.</summary>
        public string Label { get; }

        /// <summary>Tokens covered by the phrase (separators are not included).</summary>
        public List<Token> Tokens { get; }

        /// <summary>Highest scoring labels with their scores, best first.</summary>
        public List<Tuple<string, double>> TopLabels { get; }

        /// <summary>Result of the comparison with <see cref="Parent"/>.</summary>
        public CompareStatus Status { get; set; }

        /// <summary>Merge generation in which the phrase was created; 0 for an initial phrase.</summary>
        public int Generation { get; set; }

        public WordsPhrase(string phrase, string label, double weight, List<Token> tokens, List<Tuple<string, double>> topLabels)
        {
            Phrase = phrase;
            Label = label;
            Weight = weight;
            Tokens = tokens;
            TopLabels = topLabels;
        }

        /// <summary>Returns a shallow copy; the token and label lists are shared with the original.</summary>
        public WordsPhrase Clone()
        {
            return (WordsPhrase)MemberwiseClone();
        }

        public override string ToString() => $"{Phrase} => {Label} ({Weight:N3})";
    }

    /// <summary>
    /// Determines which entity the tokens belong to. Tokens that receive a label get
    /// <c>TokenType.Param</c> and the label in <c>MatchWord</c>.
    /// </summary>
    /// <param name="tokens">All tokens of the sentence; matching tokens are modified in place.</param>
    /// <param name="engineData">Classifier used to predict a label for each candidate phrase.</param>
    public void Parse(List<Token> tokens, ClassifierEngineData engineData)
    {
        // NOTE(review): presumably yields maximal runs of Undefined tokens, allowing Ignore
        // tokens in between - confirm against TagTokenHelper.GetMaxGroups.
        var groups = TagTokenHelper.GetMaxGroups(tokens,
            t => t.TokenType == TokenType.Undefined,
            t => t.TokenType == TokenType.Ignore
        );

        foreach (var group in groups)
        {
            // A multi-token group wrapped in parentheses is handled as a single entity.
            if (group.Count > 1 && IsGroupInParentheses(group, tokens))
            {
                AnalizeAndTagSentenceInParentheses(group, tokens, engineData);
            }
            else
            {
                AnalizeAndTagSentence(group, tokens, engineData);
            }
        }
    }

    /// <summary>
    /// Checks whether the group is enclosed in parentheses, skipping ignored tokens
    /// between the group and the brackets.
    /// </summary>
    /// <param name="group">Non-empty group of tokens taken from <paramref name="tokens"/>.</param>
    /// <param name="tokens">All tokens of the sentence.</param>
    /// <returns>True when "(" precedes and ")" follows the group; otherwise false.</returns>
    private bool IsGroupInParentheses(List<Token> group, List<Token> tokens)
    {
        // Walk left over ignored tokens to the position right after a potential "(".
        int firstGroupTokenIdx = tokens.IndexOf(group[0]);
        if (firstGroupTokenIdx < 0)
        {
            // The group does not belong to this token list, so nothing can enclose it.
            return false;
        }

        while (firstGroupTokenIdx > 0 && tokens[firstGroupTokenIdx - 1].TokenType == TokenType.Ignore)
        {
            firstGroupTokenIdx--;
        }

        if (firstGroupTokenIdx == 0)
        {
            return false;
        }

        // Walk right over ignored tokens to the position right before a potential ")".
        int lastGroupTokenIdx = tokens.IndexOf(group[group.Count - 1]);
        if (lastGroupTokenIdx < 0)
        {
            return false;
        }

        while (lastGroupTokenIdx + 1 < tokens.Count && tokens[lastGroupTokenIdx + 1].TokenType == TokenType.Ignore)
        {
            lastGroupTokenIdx++;
        }

        if (lastGroupTokenIdx == tokens.Count - 1)
        {
            return false;
        }

        return tokens[firstGroupTokenIdx - 1].Word == "(" && tokens[lastGroupTokenIdx + 1].Word == ")";
    }

    /// <summary>
    /// Tags a group of tokens that sits inside parentheses and is therefore treated as one
    /// entity: every token receives the best scoring label of the whole group.
    /// </summary>
    /// <param name="group">Tokens found between the parentheses.</param>
    /// <param name="tokens">All tokens of the sentence.</param>
    /// <param name="engineData">Classifier used for the prediction.</param>
    private void AnalizeAndTagSentenceInParentheses(List<Token> group, List<Token> tokens, ClassifierEngineData engineData)
    {
        WordsPhrase phrase = TokensToPhrase(group, tokens, engineData);
        if (phrase.TopLabels.Count == 0)
        {
            // No label scored above zero, so there is nothing to tag the group with.
            return;
        }

        string topLabel = phrase.TopLabels[0].Item1;
        foreach (var token in group)
        {
            token.MatchWord = topLabel;
            token.TokenType = TokenType.Param;
        }
    }

    /// <summary>
    /// Builds a phrase from the given tokens: restores its text including separators,
    /// asks the classifier for a prediction and stores the three best labels.
    /// </summary>
    /// <param name="phraseTokens">Tokens the phrase consists of.</param>
    /// <param name="sourceTokens">All tokens of the sentence.</param>
    /// <param name="engineData">Classifier used for the prediction.</param>
    /// <returns>The phrase together with its prediction.</returns>
    private WordsPhrase TokensToPhrase(List<Token> phraseTokens, List<Token> sourceTokens, ClassifierEngineData engineData)
    {
        List<Token> allPhraseTokens = GetAllTokensWithSeparators(phraseTokens, sourceTokens);
        string sumWord = TagTokenHelper.RecollectTagsWord(allPhraseTokens);
        var res = engineData.Predict(sumWord);
        List<Tuple<string, double>> topCfSum = GetTopFields(engineData.AllLabels, res.Score, 3);
        return new WordsPhrase(sumWord, res.Result, res.Weight, phraseTokens, topCfSum);
    }

    /// <summary>
    /// Analyses a group of tokens: builds the initial phrases, expands them for at most
    /// <see cref="MaxGenerations"/> generations and tags the tokens of every resulting phrase
    /// whose label is not <see cref="Other"/> and whose weight exceeds the threshold.
    /// </summary>
    /// <param name="tokens">Tokens of the group.</param>
    /// <param name="sourceTokens">All tokens of the sentence.</param>
    /// <param name="engineData">Classifier used for the predictions.</param>
    private void AnalizeAndTagSentence(List<Token> tokens, List<Token> sourceTokens, ClassifierEngineData engineData)
    {
        List<WordsPhrase> phrases = GenerateInitialPhrases(tokens, sourceTokens, engineData);

        bool done = false;
        for (int gen = 1; gen <= MaxGenerations && !done; gen++)
        {
            phrases = ProcessExpandPhrases(phrases, sourceTokens, engineData, gen, out done);
        }

        foreach (var phrase in phrases)
        {
            // The decision is per phrase, so it is taken once rather than per token.
            bool accepted = phrase.Label != Other && phrase.Weight > threshold;
            if (!accepted)
            {
                continue;
            }

            foreach (var token in phrase.Tokens)
            {
                token.TokenType = TokenType.Param;
                token.MatchWord = phrase.Label;
            }
        }
    }

    /// <summary>
    /// Runs one merge generation: every phrase of the previous generation is combined with
    /// its left and right neighbour, then competing candidates are resolved.
    /// </summary>
    /// <param name="phrases">Phrases produced by the previous generation.</param>
    /// <param name="sourceTokens">All tokens of the sentence.</param>
    /// <param name="engineData">Classifier used for the predictions.</param>
    /// <param name="generation">Number of the generation being produced, starting at 1.</param>
    /// <param name="done">Set to true when no new phrase could be generated.</param>
    /// <returns>The phrases to feed into the next generation.</returns>
    private List<WordsPhrase> ProcessExpandPhrases(List<WordsPhrase> phrases, List<Token> sourceTokens, ClassifierEngineData engineData, int generation, out bool done)
    {
        List<WordsPhrase> childPhrases = new List<WordsPhrase>();
        bool newPhrasesGenerated = false;

        for (int i = 0; i < phrases.Count; i++)
        {
            var phrase = phrases[i];

            // Phrases that were not produced by the previous generation are carried over unchanged.
            if (phrase.Generation < generation - 1)
            {
                childPhrases.Add(phrase);
                continue;
            }

            bool anyChildrenAdded = false;

            // In the first generation every neighbour is a merge candidate; afterwards only
            // neighbours labelled Other are.
            if (i > 0 && (generation == 1 || phrases[i - 1].Label == Other))
            {
                childPhrases.Add(CombineAsChild(phrases[i - 1], phrase, phrase, generation, sourceTokens, engineData));
                anyChildrenAdded = true;
            }

            if (i < phrases.Count - 1 && (generation == 1 || phrases[i + 1].Label == Other))
            {
                childPhrases.Add(CombineAsChild(phrase, phrases[i + 1], phrase, generation, sourceTokens, engineData));
                anyChildrenAdded = true;
            }

            if (anyChildrenAdded)
            {
                newPhrasesGenerated = true;
            }
            else
            {
                childPhrases.Add(phrase);
            }
        }

        if (!newPhrasesGenerated)
        {
            done = true;
            return childPhrases;
        }

        List<WordsPhrase> res = ResolveSiblings(ResolveSamePhrasePairs(childPhrases));

        // Resolving one overlap can create another one, so repeat until a pass changes nothing.
        bool hasChanges;
        do
        {
            res = ResolveOverlaid(res, out hasChanges);
        } while (hasChanges);

        done = false;
        return res;
    }

    /// <summary>
    /// Concatenates two neighbouring phrases and records how the result compares with
    /// <paramref name="parent"/>.
    /// </summary>
    private WordsPhrase CombineAsChild(WordsPhrase left, WordsPhrase right, WordsPhrase parent, int generation, List<Token> sourceTokens, ClassifierEngineData engineData)
    {
        WordsPhrase combined = ConcatenatePhrases(left, right, sourceTokens, engineData);
        combined.Status = ComparePhrases(combined, parent);
        combined.Parent = parent;
        combined.Generation = generation;
        return combined;
    }

    /// <summary>Resolves neighbouring phrases that have exactly the same text.</summary>
    private List<WordsPhrase> ResolveSamePhrasePairs(List<WordsPhrase> phrases)
    {
        return ResolveAdjacentPairs(phrases, (first, second) => first.Phrase == second.Phrase, out _);
    }

    /// <summary>Resolves neighbouring phrases that were expanded from the same parent.</summary>
    private List<WordsPhrase> ResolveSiblings(List<WordsPhrase> phrases)
    {
        return ResolveAdjacentPairs(phrases, (first, second) => first.Parent != null && first.Parent == second.Parent, out _);
    }

    /// <summary>Resolves neighbouring phrases that share at least one token.</summary>
    /// <param name="phrases">Phrases to inspect.</param>
    /// <param name="hasChanges">Set to true when at least one pair was resolved.</param>
    private List<WordsPhrase> ResolveOverlaid(List<WordsPhrase> phrases, out bool hasChanges)
    {
        return ResolveAdjacentPairs(phrases, (first, second) => IsTokensOverlaids(first.Tokens, second.Tokens), out hasChanges);
    }

    /// <summary>
    /// Walks the list pairwise. A pair accepted by <paramref name="shouldResolve"/> is replaced
    /// by the outcome of <see cref="ResolvePair"/> and both phrases are consumed; otherwise the
    /// first phrase is kept and the walk moves on by one. Identical neighbours in the result
    /// are collapsed afterwards.
    /// </summary>
    private List<WordsPhrase> ResolveAdjacentPairs(List<WordsPhrase> phrases, Func<WordsPhrase, WordsPhrase, bool> shouldResolve, out bool anyResolved)
    {
        anyResolved = false;
        List<WordsPhrase> result = new List<WordsPhrase>();

        int i = 0;
        while (i < phrases.Count)
        {
            bool hasNext = i + 1 < phrases.Count;
            if (hasNext && shouldResolve(phrases[i], phrases[i + 1]))
            {
                ResolvePair(result, phrases[i], phrases[i + 1]);
                anyResolved = true;
                i += 2;
            }
            else
            {
                result.Add(phrases[i]);
                i += 1;
            }
        }

        RemoveFullDuplicates(result);
        return result;
    }

    /// <summary>
    /// Decides which phrases survive out of two competing candidates, based on their
    /// comparison statuses and weights, and appends the survivors to <paramref name="result"/>.
    /// </summary>
    private static void ResolvePair(List<WordsPhrase> result, WordsPhrase phrase1, WordsPhrase phrase2)
    {
        var status1 = phrase1.Status;
        var status2 = phrase2.Status;

        if (status1 == status2)
        {
            switch (status1)
            {
                case CompareStatus.Worse:
                    AddParents(result, phrase1, phrase2);
                    break;

                case CompareStatus.Questionable:
                    // Temporary rule: the weights of the parents should be taken into account as well.
                    double maxWeight = Math.Max(phrase1.Weight, phrase2.Weight);
                    if (maxWeight > threshold)
                    {
                        result.Add(phrase1.Weight >= phrase2.Weight ? phrase1 : phrase2);
                    }
                    else
                    {
                        AddParents(result, phrase1, phrase2);
                    }
                    break;

                case CompareStatus.Better:
                    // Keep the heavier one. NOTE(review): the loser may need a rollback to its
                    // parent - the original author left this open.
                    result.Add(phrase1.Weight >= phrase2.Weight ? phrase1 : phrase2);
                    break;

                default:
                    // Both None: not expected to happen; keep both phrases untouched.
                    result.Add(phrase1);
                    result.Add(phrase2);
                    break;
            }

            return;
        }

        // From here on the statuses differ. A Better phrase wins over any other status.
        if (IsOneEquals(status1, status2, CompareStatus.Better))
        {
            result.Add(GetWhere(phrase1, phrase2, p => p.Status == CompareStatus.Better));
            return;
        }

        // Questionable against None: keep the questionable phrase only if it is confident enough.
        if (IsOneEquals(status1, status2, CompareStatus.Questionable) && !IsOneEquals(status1, status2, CompareStatus.Worse))
        {
            var questionablePhrase = GetWhere(phrase1, phrase2, p => p.Status == CompareStatus.Questionable);
            if (questionablePhrase.Weight > threshold)
            {
                result.Add(questionablePhrase);
                return;
            }
        }

        // Remaining combinations (anything paired with Worse, or a weak Questionable): roll back.
        AddParents(result, phrase1, phrase2);
    }

    /// <summary>
    /// Appends the parent of each phrase, falling back to the phrase itself when it has no
    /// parent so that the result never receives a null entry.
    /// </summary>
    private static void AddParents(List<WordsPhrase> result, WordsPhrase phrase1, WordsPhrase phrase2)
    {
        result.Add(phrase1.Parent ?? phrase1);
        result.Add(phrase2.Parent ?? phrase2);
    }

    /// <summary>
    /// Collapses runs of neighbouring entries that are the same object into a single entry.
    /// </summary>
    private void RemoveFullDuplicates(List<WordsPhrase> phrases)
    {
        int i = 0;
        while (i < phrases.Count - 1)
        {
            if (phrases[i] == phrases[i + 1])
            {
                // Stay on the same index: the next neighbour may be a duplicate as well.
                phrases.RemoveAt(i + 1);
            }
            else
            {
                i++;
            }
        }
    }

    /// <summary>Checks whether the two lists have at least one token in common.</summary>
    private bool IsTokensOverlaids(List<Token> tokens1, List<Token> tokens2)
    {
        return tokens1.Any(first => tokens2.Any(second => first == second));
    }

    /// <summary>
    /// Returns the first of the two objects that satisfies the predicate, or null when
    /// neither does.
    /// </summary>
    private static T GetWhere<T>(T obj1, T obj2, Predicate<T> predicate)
        where T : class
    {
        if (predicate(obj1))
        {
            return obj1;
        }

        return predicate(obj2) ? obj2 : null;
    }

    /// <summary>Checks whether at least one of the two statuses equals <paramref name="toCompare"/>.</summary>
    private static bool IsOneEquals(CompareStatus status1, CompareStatus status2, CompareStatus toCompare)
    {
        return status1 == toCompare || status2 == toCompare;
    }

    /// <summary>
    /// Compares a merged phrase with the phrase it was expanded from.
    /// </summary>
    /// <returns>
    /// Better or Worse when both have the same non-Other label (decided by the weight gain);
    /// Worse when both are Other; Questionable in every other case.
    /// </returns>
    private CompareStatus ComparePhrases(WordsPhrase newPhrase, WordsPhrase oldPhrase)
    {
        bool newIsOther = newPhrase.Label == Other;
        bool oldIsOther = oldPhrase.Label == Other;

        if (newIsOther && oldIsOther)
        {
            // Both are Other: there is no point in keeping the merge.
            return CompareStatus.Worse;
        }

        if (newIsOther || oldIsOther || newPhrase.Label != oldPhrase.Label)
        {
            // The label changed (possibly to or from Other); weights alone cannot settle it.
            return CompareStatus.Questionable;
        }

        // Same real label: the merge is better only when the confidence grew.
        double delta = newPhrase.Weight - oldPhrase.Weight;
        return delta > epsilon ? CompareStatus.Better : CompareStatus.Worse;
    }

    /// <summary>Builds a phrase from the tokens of both phrases, first phrase first.</summary>
    private WordsPhrase ConcatenatePhrases(WordsPhrase phrase1, WordsPhrase phrase2, List<Token> sourceTokens, ClassifierEngineData engineData)
    {
        List<Token> sumTokens = phrase1.Tokens.Concat(phrase2.Tokens).ToList();
        return TokensToPhrase(sumTokens, sourceTokens, engineData);
    }

    /// <summary>
    /// Splits the group into initial phrases: tokens detected as continuous form one phrase,
    /// every other token forms a phrase of its own.
    /// </summary>
    private List<WordsPhrase> GenerateInitialPhrases(List<Token> tokens, List<Token> sourceTokens, ClassifierEngineData engineData)
    {
        List<WordsPhrase> wordsPhrases = new List<WordsPhrase>();
        List<int> continiousTokensIndexes = DetectContiniousTokens(tokens, sourceTokens);
        var continiousTokensRanges = CollectContiniousRanges(continiousTokensIndexes).ToDictionary(r => r.Start);

        int start = 0;
        while (start < tokens.Count)
        {
            int len = continiousTokensRanges.TryGetValue(start, out var range) ? range.Length : 1;
            List<Token> phraseTokens = tokens.Skip(start).Take(len).ToList();
            wordsPhrases.Add(TokensToPhrase(phraseTokens, sourceTokens, engineData));
            start += len;
        }

        return wordsPhrases;
    }

    /// <summary>
    /// Collects the tokens of <paramref name="phraseTokens"/> together with everything that
    /// lies between the first and the last of them in <paramref name="sourceTokens"/>.
    /// A single-token phrase is returned as is (the same list instance).
    /// </summary>
    /// <param name="phraseTokens">Non-empty list of phrase tokens.</param>
    /// <param name="sourceTokens">All tokens of the sentence.</param>
    /// <returns>The slice of <paramref name="sourceTokens"/> that spans the phrase.</returns>
    private static List<Token> GetAllTokensWithSeparators(List<Token> phraseTokens, List<Token> sourceTokens)
    {
        if (phraseTokens.Count == 1)
        {
            return phraseTokens;
        }

        int firstTokenIdx = sourceTokens.IndexOf(phraseTokens[0]);
        int lastTokenIdx = sourceTokens.LastIndexOf(phraseTokens[phraseTokens.Count - 1]);
        int spanLength = lastTokenIdx - firstTokenIdx + 1;
        return sourceTokens.Skip(firstTokenIdx).Take(spanLength).ToList();
    }

    /// <summary>
    /// Returns the labels with the highest positive scores, best first.
    /// </summary>
    /// <param name="allLabels">Label names; entry i names the score at index i.</param>
    /// <param name="wordVec">Score per label.</param>
    /// <param name="topN">Maximum number of labels to return.</param>
    /// <returns>At most <paramref name="topN"/> (label, score) pairs; scores of zero or below are dropped.</returns>
    private List<Tuple<string, double>> GetTopFields(string[] allLabels, float[] wordVec, int topN)
    {
        List<Tuple<string, double>> positive = new List<Tuple<string, double>>();
        for (int i = 0; i < wordVec.Length; i++)
        {
            if (wordVec[i] > 0.0)
            {
                positive.Add(new Tuple<string, double>(allLabels[i], wordVec[i]));
            }
        }

        return positive.OrderByDescending(w => w.Item2).Take(topN).ToList();
    }

    /// <summary>
    /// Estimates how strongly the largest value stands out among the values that exceed
    /// one tenth of the standard deviation.
    /// </summary>
    /// <param name="vals">Non-empty array of scores.</param>
    /// <returns>
    /// 1 when exactly one value passes the cut-off, 0 when none does, the maximum when all
    /// passing values equal the maximum; otherwise a value derived from the root mean square
    /// of the passing values normalised by the maximum.
    /// </returns>
    public static double GetDegreeOfConfidence(float[] vals)
    {
        double max = vals.Max();
        MathHelper.AvgStdDev(vals, out double stdDev);
        double thr = stdDev * 0.1;

        double sum = 0;
        int countNonZero = 0;
        for (int i = 0; i < vals.Length; i++)
        {
            if (vals[i] > thr)
            {
                countNonZero++;
                double normVal = vals[i] / max;
                sum += normVal * normVal;
            }
        }

        if (countNonZero == 1)
        {
            return 1;
        }

        if (countNonZero == 0)
        {
            return 0;
        }

        // Every counted value equals the maximum, so each normalised square is exactly 1.
        if (sum == countNonZero)
        {
            return max;
        }

        double res = Math.Sqrt(sum / countNonZero);
        return (1 - res) / (1 - Math.Sqrt(1.0 / countNonZero));
    }

    /// <summary>Weighted combination of the phrase weight (both terms currently use the same weight).</summary>
    private double CalcResultCoef(WordsPhrase t)
    {
        return 0.2 * t.Weight + 0.8 * t.Weight;
    }

    /// <summary>
    /// Groups consecutive indexes into ranges. Each index i stands for the token pair
    /// (i, i + 1), so a run of k indexes yields a range of k + 1 tokens.
    /// </summary>
    /// <param name="indexes">Indexes in ascending order.</param>
    private List<IndexRange> CollectContiniousRanges(List<int> indexes)
    {
        List<IndexRange> result = new List<IndexRange>();
        if (indexes.Count == 0)
        {
            return result;
        }

        int startIdx = indexes[0];
        int len = 1;
        for (int i = 1; i < indexes.Count; i++)
        {
            if (indexes[i] - len == startIdx)
            {
                len++;
                continue;
            }

            result.Add(new IndexRange(startIdx, len + 1));
            startIdx = indexes[i];
            len = 1;
        }

        result.Add(new IndexRange(startIdx, len + 1));
        return result;
    }

    /// <summary>
    /// Finds the indexes i for which token i must stay together with token i + 1: either
    /// token i is one of the known prepositions, or the text spanning both tokens contains
    /// no whitespace.
    /// </summary>
    /// <param name="tokens">Tokens of the group.</param>
    /// <param name="sourceTokens">All tokens of the sentence.</param>
    private List<int> DetectContiniousTokens(List<Token> tokens, List<Token> sourceTokens)
    {
        List<int> result = new List<int>();

        for (int i = 0; i < tokens.Count - 1; i++)
        {
            if (prepositions.Contains(tokens[i].Word))
            {
                result.Add(i);
                continue;
            }

            List<Token> pair = new List<Token> { tokens[i], tokens[i + 1] };
            List<Token> allPhraseTokens = GetAllTokensWithSeparators(pair, sourceTokens);
            string sumWord = TagTokenHelper.RecollectTagsWord(allPhraseTokens);
            if (!IsContainsWhitespace(sumWord))
            {
                result.Add(i);
            }
        }

        return result;
    }

    /// <summary>Checks whether the text contains at least one whitespace character.</summary>
    private bool IsContainsWhitespace(string sumWord)
    {
        return sumWord.Any(ch => char.IsWhiteSpace(ch));
    }

    /// <summary>
    /// Returns false when the range partially overlaps any continuous range; otherwise true
    /// when it lies inside none of them or fully contains at least one of them.
    /// </summary>
    private bool CheckContiniousTokens(List<IndexRange> continiousTokensRanges, IndexRange range)
    {
        if (continiousTokensRanges.Any(t => IsOverlapse(t, range)))
        {
            return false;
        }

        bool insideNone = !continiousTokensRanges.Any(t => IsFirstInsideOfSecond(range, t));
        bool containsSome = continiousTokensRanges.Any(t => IsFirstInsideOfSecond(t, range));
        return insideNone || containsSome;
    }

    /// <summary>True when exactly one of the two boundaries of <paramref name="r2"/> lies strictly inside <paramref name="r1"/>.</summary>
    private bool IsOverlapse(IndexRange r1, IndexRange r2)
    {
        return IsIndexInside(r1, r2.Start) ^ IsIndexInside(r1, r2.Start + r2.Length);
    }

    /// <summary>True when <paramref name="first"/> does not extend beyond <paramref name="second"/> on either side.</summary>
    private bool IsFirstInsideOfSecond(IndexRange first, IndexRange second)
    {
        return first.Start >= second.Start && first.Start + first.Length <= second.Start + second.Length;
    }

    /// <summary>True when the index lies strictly between the start and the end of the range.</summary>
    private bool IsIndexInside(IndexRange range, int idx)
    {
        return idx > range.Start && idx < range.Start + range.Length;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement