Guest User

LLM written English stemmer

a guest
Jun 18th, 2024
176
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 14.10 KB | None | 0 0
  1. #ifndef clStemmerEnglish_h__
  2. #define clStemmerEnglish_h__
  3.  
  4. #include <string>
  5. #include <algorithm>
  6.  
  7. class clStemmerState
  8. {
  9. public:
  10.   i64 R1 = -1, R2 = -1, RV = -1, FirstVowel = -1;
  11. };
  12.  
  13. class clStemmerEnglish
  14. {
  15. public:
  16.   static std::wstring Stem(const std::wstring &word)
  17.   {
  18.     std::wstring data = word;
  19.  
  20.     clStemmerState state;
  21.     InitializeStemming(data, state);
  22.  
  23.     if (data.length() < 3) return data;
  24.     if (IsException(data)) return data;
  25.  
  26.     HashY(data, L"aeiouy");
  27.     state.FirstVowel = data.find_first_of(L"aeiouy");
  28.     if (state.FirstVowel == -1) return data;
  29.  
  30.     SetInitialR1(data, state);
  31.     FindR2(data, state, L"aeiouy");
  32.  
  33.     Step1A(data, state);
  34.     if (IsExceptionPostStep1A(data)) return data;
  35.     Step1B(data, state);
  36.     Step1C(data);
  37.     Step2(data, state);
  38.     Step3(data, state);
  39.     Step4(data, state);
  40.     Step5(data, state);
  41.  
  42.     UnhashY(data);
  43.  
  44.     return data;
  45.   }
  46.  
  47. private:
  48.   static bool IsException(std::wstring &text)
  49.   {
  50.     if (text == L"skis") { text = L"ski"; return true; }
  51.     if (text == L"skies") { text = L"sky"; return true; }
  52.     if (text == L"dying") { text = L"die"; return true; }
  53.     if (text == L"lying") { text = L"lie"; return true; }
  54.     if (text == L"tying") { text = L"tie"; return true; }
  55.     if (text == L"idly") { text = L"idl"; return true; }
  56.     if (text == L"gently") { text = L"gentl"; return true; }
  57.     if (text == L"ugly") { text = L"ugli"; return true; }
  58.     if (text == L"early") { text = L"earli"; return true; }
  59.     if (text == L"only") { text = L"onli"; return true; }
  60.     if (text == L"singly") { text = L"singl"; return true; }
  61.     if (text == L"sky" || text == L"news" || text == L"howe" || text == L"atlas" || text == L"cosmos" || text == L"bias" || text == L"andes") return true;
  62.     return false;
  63.   }
  64.  
  65.   static void RemovePossessiveSuffix(std::wstring &text)
  66.   {
  67.     if (text.length() >= 2 && IsApostrophe(text[text.length() - 2]) && IsEither(text.back(), s_LOWER_S, s_UPPER_S)) text.erase(text.length() - 2);
  68.     while (text.length() >= 1 && IsApostrophe(text.back())) text.pop_back();
  69.   }
  70.  
  71.   static bool IsExceptionPostStep1A(const std::wstring &text)
  72.   {
  73.     return text == L"inning" || text == L"outing" || text == L"canning" || text == L"herring" || text == L"earring" || text == L"proceed" || text == L"exceed" || text == L"succeed";
  74.   }
  75.  
  76.   static void SetR1(clStemmerState &state, i64 pos) { state.R1 = pos; }
  77.  
  78.   static void HashY(std::wstring &text, const wchar_t *vowels)
  79.   {
  80.     if (text.empty()) return;
  81.     if (text.front() == s_LOWER_Y) text.front() = s_LOWER_Y_HASH;
  82.     else if (text.front() == s_UPPER_Y) text.front() = s_UPPER_Y_HASH;
  83.     for (i64 i = 1; i < static_cast<i64>(text.size()); ++i)
  84.       if ((text[i] == s_LOWER_Y || text[i] == s_UPPER_Y) && !IsOneOf(text[i - 1], vowels))
  85.         text[i] = (text[i] == s_LOWER_Y) ? s_LOWER_Y_HASH : s_UPPER_Y_HASH;
  86.   }
  87.  
  88.   static void Step2(std::wstring &text, clStemmerState &state)
  89.   {
  90.     if (EraseSuffix(text, state, L"ization", 4, 0, L"r1", true)) return;
  91.     if (EraseSuffix(text, state, L"ational", 4, 0, L"r1", true)) return;
  92.     if (EraseSuffix(text, state, L"fulness", 4, 0, L"r1")) return;
  93.     if (EraseSuffix(text, state, L"ousness", 4, 0, L"r1")) return;
  94.     if (EraseSuffix(text, state, L"iveness", 4, 0, L"r1")) return;
  95.     if (EraseSuffix(text, state, L"tional", 2, 0, L"r1")) return;
  96.     if (EraseSuffix(text, state, L"lessli", 2, 0, L"r1")) return;
  97.     if (EraseSuffix(text, state, L"biliti", 3, 0, L"r1", true)) return;
  98.     if (EraseSuffix(text, state, L"iviti", 2, 0, L"r1", true)) return;
  99.     if (EraseSuffix(text, state, L"ation", 2, 0, L"r1", true)) return;
  100.     if (EraseSuffix(text, state, L"alism", 3, 0, L"r1")) return;
  101.     if (EraseSuffix(text, state, L"aliti", 3, 0, L"r1")) return;
  102.     if (EraseSuffix(text, state, L"ousli", 2, 0, L"r1")) return;
  103.     if (EraseSuffix(text, state, L"entli", 2, 0, L"r1")) return;
  104.     if (EraseSuffix(text, state, L"fulli", 2, 0, L"r1")) return;
  105.     if (EraseSuffix(text, state, L"alli", 2, 0, L"r1")) return;
  106.     if (ReplaceSuffix(text, state, L"enci", L'e', L"r1")) return;
  107.     if (ReplaceSuffix(text, state, L"anci", L'e', L"r1")) return;
  108.     if (ReplaceSuffix(text, state, L"abli", L'e', L"r1")) return;
  109.     if (EraseSuffix(text, state, L"izer", 1, 0, L"r1")) return;
  110.     if (EraseSuffix(text, state, L"ator", 1, 0, L"r1", true)) return;
  111.     if (ReplaceSuffix(text, state, L"bli", L'e', L"r1")) return;
  112.     if (ReplaceSuffix(text, state, L"ogi", L'\0', L"r1", L'l')) return;
  113.     if (EraseSuffix(text, state, L"li", 2, 0, L"r1", false, L"cdeghkmnrt")) return;
  114.   }
  115.  
  116.   static void Step3(std::wstring &text, clStemmerState &state)
  117.   {
  118.     if (EraseSuffix(text, state, L"ational", 4, 0, L"r1", true)) return;
  119.     if (EraseSuffix(text, state, L"tional", 2, 0, L"r1")) return;
  120.     if (EraseSuffix(text, state, L"icate", 3, 0, L"r1")) return;
  121.     if (EraseSuffix(text, state, L"iciti", 3, 0, L"r1")) return;
  122.     if (EraseSuffix(text, state, L"alize", 3, 0, L"r1")) return;
  123.     if (EraseSuffix(text, state, L"ative", 5, 0, L"r2")) return;
  124.     if (EraseSuffix(text, state, L"ical", 2, 0, L"r1")) return;
  125.     if (EraseSuffix(text, state, L"ness", 4, 0, L"r1")) return;
  126.     if (EraseSuffix(text, state, L"ful", 3, 0, L"r1")) return;
  127.   }
  128.  
  129.   static void Step4(std::wstring &text, clStemmerState &state)
  130.   {
  131.     if (EraseSuffix(text, state, L"ement", 5, 0, L"r2")) return;
  132.     if (EraseSuffix(text, state, L"able", 4, 0, L"r2")) return;
  133.     if (EraseSuffix(text, state, L"ible", 4, 0, L"r2")) return;
  134.     if (EraseSuffix(text, state, L"ment", 4, 0, L"r2")) return;
  135.     if (EraseSuffix(text, state, L"ence", 4, 0, L"r2")) return;
  136.     if (EraseSuffix(text, state, L"ance", 4, 0, L"r2")) return;
  137.     if (EraseSuffix(text, state, L"sion", 3, 0, L"r2")) return;
  138.     if (EraseSuffix(text, state, L"tion", 3, 0, L"r2")) return;
  139.     if (EraseSuffix(text, state, L"ant", 3, 0, L"r2")) return;
  140.     if (EraseSuffix(text, state, L"ent", 3, 0, L"r2")) return;
  141.     if (EraseSuffix(text, state, L"ism", 3, 0, L"r2")) return;
  142.     if (EraseSuffix(text, state, L"ate", 3, 0, L"r2")) return;
  143.     if (EraseSuffix(text, state, L"iti", 3, 0, L"r2")) return;
  144.     if (EraseSuffix(text, state, L"ous", 3, 0, L"r2")) return;
  145.     if (EraseSuffix(text, state, L"ive", 3, 0, L"r2")) return;
  146.     if (EraseSuffix(text, state, L"ize", 3, 0, L"r2")) return;
  147.     if (EraseSuffix(text, state, L"al", 2, 0, L"r2")) return;
  148.     if (EraseSuffix(text, state, L"er", 2, 0, L"r2")) return;
  149.     if (EraseSuffix(text, state, L"ic", 2, 0, L"r2")) return;
  150.   }
  151.  
  152.   static void Step5(std::wstring &text, clStemmerState &state)
  153.   {
  154.     if (text.length() >= 1 && text.back() == L'e')
  155.     {
  156.       if (state.R2 != text.length())
  157.       {
  158.         text.erase(text.length() - 1);
  159.         UpdateRSections(text, state);
  160.       }
  161.       else if (state.R1 != text.length() && text.length() >= 2 && !EndsWithShortSyllable(text, state, text.length() - 1))
  162.       {
  163.         text.erase(text.length() - 1);
  164.         UpdateRSections(text, state);
  165.       }
  166.     }
  167.     else if (state.R2 != text.length() && text.length() >= 2 && text.substr(text.length() - 2) == L"ll") { text.erase(text.length() - 1); UpdateRSections(text, state); }
  168.   }
  169.  
  170.   static void Step1A(std::wstring &text, clStemmerState &state)
  171.   {
  172.     if (EraseSuffix(text, state, L"sses", 2)) return;
  173.     if (EraseSuffix(text, state, L"ied", 2, 1)) return;
  174.     if (EraseSuffix(text, state, L"ies", 2, 1)) return;
  175.     if (EraseSuffix(text, state, L"s", 1, -1, L"aeiouy")) return;
  176.   }
  177.  
  178.   static void Step1B(std::wstring &text, clStemmerState &state)
  179.   {
  180.     if (EraseSuffix(text, state, L"eed", 1, 0, L"r1")) return;
  181.     if (EraseSuffix(text, state, L"eedly", 3, 0, L"r1")) return;
  182.  
  183.     if (!(EraseSuffix(text, state, L"ed", 2, -1, L"aeiouy") || EraseSuffix(text, state, L"edly", 4, -1, L"aeiouy") || EraseSuffix(text, state, L"ing", 3, -1, L"aeiouy") || EraseSuffix(text, state, L"ingly", 5, -1, L"aeiouy"))) return;
  184.    
  185.     if (AddSuffix(text, state, L"at", L'e')) return;
  186.     if (AddSuffix(text, state, L"bl", L'e')) return;
  187.     if (AddSuffix(text, state, L"iz", L'e')) return;
  188.     if (EraseSuffix(text, state, L"bb") || EraseSuffix(text, state, L"dd") || EraseSuffix(text, state, L"ff") || EraseSuffix(text, state, L"gg") || EraseSuffix(text, state, L"mm") || EraseSuffix(text, state, L"nn") || EraseSuffix(text, state, L"pp") || EraseSuffix(text, state, L"rr") || EraseSuffix(text, state, L"tt")) return;
  189.     if (IsShortWord(text, state)) AddSuffix(text, state, L"", L'e');
  190.   }
  191.  
  192.   static void SetInitialR1(const std::wstring &text, clStemmerState &state)
  193.   {
  194.     if (text.length() >= 5 && text.substr(0, 5) == L"gener") SetR1(state, 5);
  195.     else if (text.length() >= 6 && text.substr(0, 6) == L"commun") SetR1(state, 6);
  196.     else if (text.length() >= 5 && text.substr(0, 5) == L"arsen") SetR1(state, 5);
  197.     else FindR1(text, state, L"aeiouy");
  198.   }
  199.  
  200.   static void InitializeStemming(std::wstring &text, clStemmerState &state)
  201.   {
  202.     state.FirstVowel = -1;
  203.     ResetRValues(state);
  204.     RemovePossessiveSuffix(text);
  205.   }
  206.  
  207.   static void UpdateRSections(const std::wstring &text, clStemmerState &state)
  208.   {
  209.     i64 len = text.length();
  210.     state.R1 = (state.R1 > len ? len : state.R1);
  211.     state.R2 = (state.R2 > len ? len : state.R2);
  212.     state.RV = (state.RV > len ? len : state.RV);
  213.   }
  214.  
  215.   static bool IsVowel(const wchar_t character) { return IsOneOf(character, L"aeiouy"); }
  216.  
  217.   static void ResetRValues(clStemmerState &state) { state.R1 = state.R2 = state.RV = -1; }
  218.  
  219.   static bool IsEither(wchar_t character, wchar_t ch1, wchar_t ch2) { return character == ch1 || ch2; }
  220.  
  221.   static void FindR1(const std::wstring &text, clStemmerState &state, const wchar_t *vowelList)
  222.   {
  223.     i64 start = text.find_first_of(vowelList, 0);
  224.     state.R1 = (start == -1) ? text.length() : text.find_first_not_of(vowelList, ++start);
  225.     if (state.R1 != -1) ++state.R1;
  226.     else state.R1 = text.length();
  227.   }
  228.  
  229.   static void FindR2(const std::wstring &text, clStemmerState &state, const wchar_t *vowelList)
  230.   {
  231.     i64 start = (state.R1 != text.length()) ? text.find_first_of(vowelList, state.R1) : -1;
  232.     state.R2 = (start != -1 && start != text.length() - 1) ? text.find_first_not_of(vowelList, ++start) : text.length();
  233.     state.R2 = (state.R2 == -1) ? text.length() : ++state.R2;
  234.   }
  235.  
  236.   static bool EraseSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix)
  237.   {
  238.     if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
  239.     {
  240.       text.erase(text.length() - suffix.length());
  241.       UpdateRSections(text, state);
  242.       return true;
  243.     }
  244.     return false;
  245.   }
  246.  
  247.   static bool IsApostrophe(const wchar_t &ch) { return (ch == 39 || ch == 146 || ch == 180 || ch == 0x2019); }
  248.  
  249.   static bool EndsWithShortSyllable(const std::wstring &text, const clStemmerState &state, const i64 length)
  250.   {
  251.     if (length == 2) return IsVowel(text[0]) && !IsVowel(text[1]);
  252.     else if (length > 2)
  253.     {
  254.       i64 start = text.find_last_of(L"aeiouy", length - 1); if (start == -1) return false;
  255.       return start > 0 && start == (length - 2) && !IsVowel(text[start + 1]) && !IsOneOf(text[start + 1], L"wx") && !IsEither(text[start + 1], s_LOWER_Y_HASH, s_UPPER_Y_HASH) && !IsVowel(text[start - 1]);
  256.     }
  257.     return false;
  258.   }
  259.  
  260.   static bool AddSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, const wchar_t add)
  261.   {
  262.     if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
  263.     {
  264.       text += add;
  265.       FindR2(text, state, L"aeiouy");
  266.       return true;
  267.     }
  268.     return false;
  269.   }
  270.  
  271.   static void Step1C(std::wstring &text) { if (text.length() > 2 && !IsVowel(text[text.length() - 2]) && text.back() == L'y') text.back() = L'i'; }
  272.  
  273.   static bool IsOneOf(wchar_t character, const wchar_t *characters) { while (*characters) if (*characters++ == character) return true; return false; }
  274.  
  275.   static bool IsShortWord(const std::wstring &text, const clStemmerState &state) { return EndsWithShortSyllable(text, state, text.length()) && state.R1 == text.length(); }
  276.  
  277.   static void UnhashY(std::wstring &text) { std::replace(text.begin(), text.end(), s_LOWER_Y_HASH, s_LOWER_Y); std::replace(text.begin(), text.end(), s_UPPER_Y_HASH, s_UPPER_Y); }
  278.  
  279.   static bool ReplaceSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, const wchar_t replace, const wchar_t *condition = nullptr, const wchar_t preceding = L'\0')
  280.   {
  281.     if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
  282.     {
  283.       if (condition && wcscmp(condition, L"r1") == 0 && state.R1 > static_cast<i64>(text.length() - suffix.length())) return false;
  284.       if (condition && wcscmp(condition, L"r2") == 0 && state.R2 > static_cast<i64>(text.length() - suffix.length())) return false;
  285.       if (preceding != L'\0' && text[text.length() - suffix.length() - 1] != preceding) return false;
  286.       text.back() = replace;
  287.       return true;
  288.     }
  289.     return false;
  290.   }
  291.  
  292.   static bool EraseSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, int eraseCount, int minLength = 0, const wchar_t *condition = nullptr, bool addE = false, const wchar_t *additionalVowels = nullptr)
  293.   {
  294.     if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
  295.     {
  296.       if (condition && wcscmp(condition, L"r1") == 0 && state.R1 > static_cast<i64>(text.length() - suffix.length())) return false;
  297.       if (condition && wcscmp(condition, L"r2") == 0 && state.R2 > static_cast<i64>(text.length() - suffix.length())) return false;
  298.       if (minLength > 0 && text.length() - suffix.length() < minLength) return false;
  299.       if (additionalVowels && !IsOneOf(text[text.length() - suffix.length() - 1], additionalVowels)) return false;
  300.  
  301.       text.erase(text.length() - eraseCount);
  302.       UpdateRSections(text, state);
  303.  
  304.       if (addE) text.back() = L'e';
  305.       return true;
  306.     }
  307.     return false;
  308.   }
  309.  
  310.   static const wchar_t s_LOWER_Y = 0x79, s_UPPER_Y = 0x59, s_LOWER_S = 0x73, s_UPPER_S = 0x53, s_LOWER_Y_HASH = 9, s_UPPER_Y_HASH = 7;
  311. };
  312.  
  313. #endif // clStemmerEnglish_h__
  314.  
Advertisement
Add Comment
Please, Sign In to add comment