Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #ifndef clStemmerEnglish_h__
- #define clStemmerEnglish_h__
#include <algorithm>
#include <cwchar>
#include <string>
- class clStemmerState
- {
- public:
- i64 R1 = -1, R2 = -1, RV = -1, FirstVowel = -1;
- };
- class clStemmerEnglish
- {
- public:
- static std::wstring Stem(const std::wstring &word)
- {
- std::wstring data = word;
- clStemmerState state;
- InitializeStemming(data, state);
- if (data.length() < 3) return data;
- if (IsException(data)) return data;
- HashY(data, L"aeiouy");
- state.FirstVowel = data.find_first_of(L"aeiouy");
- if (state.FirstVowel == -1) return data;
- SetInitialR1(data, state);
- FindR2(data, state, L"aeiouy");
- Step1A(data, state);
- if (IsExceptionPostStep1A(data)) return data;
- Step1B(data, state);
- Step1C(data);
- Step2(data, state);
- Step3(data, state);
- Step4(data, state);
- Step5(data, state);
- UnhashY(data);
- return data;
- }
- private:
// Handles the words that bypass the regular stemming steps.
// Returns true when `text` is such a word; for the irregular forms `text` is
// overwritten with its fixed stem, for the invariant forms it is left unchanged.
static bool IsException(std::wstring &text)
{
    struct FixedStem
    {
        const wchar_t *word;
        const wchar_t *stem;
    };

    // Irregular forms and the stem each one maps to.
    static const FixedStem irregular[] = {
        { L"skis",   L"ski"   },
        { L"skies",  L"sky"   },
        { L"dying",  L"die"   },
        { L"lying",  L"lie"   },
        { L"tying",  L"tie"   },
        { L"idly",   L"idl"   },
        { L"gently", L"gentl" },
        { L"ugly",   L"ugli"  },
        { L"early",  L"earli" },
        { L"only",   L"onli"  },
        { L"singly", L"singl" },
    };
    for (const FixedStem &entry : irregular)
    {
        if (text == entry.word)
        {
            text = entry.stem;
            return true;
        }
    }

    // Words that are already their own stem.
    static const wchar_t *const invariant[] = {
        L"sky", L"news", L"howe", L"atlas", L"cosmos", L"bias", L"andes",
    };
    for (const wchar_t *word : invariant)
    {
        if (text == word) return true;
    }

    return false;
}
- static void RemovePossessiveSuffix(std::wstring &text)
- {
- if (text.length() >= 2 && IsApostrophe(text[text.length() - 2]) && IsEither(text.back(), s_LOWER_S, s_UPPER_S)) text.erase(text.length() - 2);
- while (text.length() >= 1 && IsApostrophe(text.back())) text.pop_back();
- }
// Returns true when `text` is one of the words that Stem() returns immediately
// after Step1A, without running the remaining steps.
static bool IsExceptionPostStep1A(const std::wstring &text)
{
    static const wchar_t *const finished[] = {
        L"inning", L"outing", L"canning", L"herring",
        L"earring", L"proceed", L"exceed", L"succeed",
    };
    for (const wchar_t *word : finished)
    {
        if (text == word) return true;
    }
    return false;
}
- static void SetR1(clStemmerState &state, i64 pos) { state.R1 = pos; }
- static void HashY(std::wstring &text, const wchar_t *vowels)
- {
- if (text.empty()) return;
- if (text.front() == s_LOWER_Y) text.front() = s_LOWER_Y_HASH;
- else if (text.front() == s_UPPER_Y) text.front() = s_UPPER_Y_HASH;
- for (i64 i = 1; i < static_cast<i64>(text.size()); ++i)
- if ((text[i] == s_LOWER_Y || text[i] == s_UPPER_Y) && !IsOneOf(text[i - 1], vowels))
- text[i] = (text[i] == s_LOWER_Y) ? s_LOWER_Y_HASH : s_UPPER_Y_HASH;
- }
- static void Step2(std::wstring &text, clStemmerState &state)
- {
- if (EraseSuffix(text, state, L"ization", 4, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"ational", 4, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"fulness", 4, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ousness", 4, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"iveness", 4, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"tional", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"lessli", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"biliti", 3, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"iviti", 2, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"ation", 2, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"alism", 3, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"aliti", 3, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ousli", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"entli", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"fulli", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"alli", 2, 0, L"r1")) return;
- if (ReplaceSuffix(text, state, L"enci", L'e', L"r1")) return;
- if (ReplaceSuffix(text, state, L"anci", L'e', L"r1")) return;
- if (ReplaceSuffix(text, state, L"abli", L'e', L"r1")) return;
- if (EraseSuffix(text, state, L"izer", 1, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ator", 1, 0, L"r1", true)) return;
- if (ReplaceSuffix(text, state, L"bli", L'e', L"r1")) return;
- if (ReplaceSuffix(text, state, L"ogi", L'\0', L"r1", L'l')) return;
- if (EraseSuffix(text, state, L"li", 2, 0, L"r1", false, L"cdeghkmnrt")) return;
- }
- static void Step3(std::wstring &text, clStemmerState &state)
- {
- if (EraseSuffix(text, state, L"ational", 4, 0, L"r1", true)) return;
- if (EraseSuffix(text, state, L"tional", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"icate", 3, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"iciti", 3, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"alize", 3, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ative", 5, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ical", 2, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ness", 4, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"ful", 3, 0, L"r1")) return;
- }
- static void Step4(std::wstring &text, clStemmerState &state)
- {
- if (EraseSuffix(text, state, L"ement", 5, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"able", 4, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ible", 4, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ment", 4, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ence", 4, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ance", 4, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"sion", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"tion", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ant", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ent", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ism", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ate", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"iti", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ous", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ive", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ize", 3, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"al", 2, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"er", 2, 0, L"r2")) return;
- if (EraseSuffix(text, state, L"ic", 2, 0, L"r2")) return;
- }
- static void Step5(std::wstring &text, clStemmerState &state)
- {
- if (text.length() >= 1 && text.back() == L'e')
- {
- if (state.R2 != text.length())
- {
- text.erase(text.length() - 1);
- UpdateRSections(text, state);
- }
- else if (state.R1 != text.length() && text.length() >= 2 && !EndsWithShortSyllable(text, state, text.length() - 1))
- {
- text.erase(text.length() - 1);
- UpdateRSections(text, state);
- }
- }
- else if (state.R2 != text.length() && text.length() >= 2 && text.substr(text.length() - 2) == L"ll") { text.erase(text.length() - 1); UpdateRSections(text, state); }
- }
- static void Step1A(std::wstring &text, clStemmerState &state)
- {
- if (EraseSuffix(text, state, L"sses", 2)) return;
- if (EraseSuffix(text, state, L"ied", 2, 1)) return;
- if (EraseSuffix(text, state, L"ies", 2, 1)) return;
- if (EraseSuffix(text, state, L"s", 1, -1, L"aeiouy")) return;
- }
- static void Step1B(std::wstring &text, clStemmerState &state)
- {
- if (EraseSuffix(text, state, L"eed", 1, 0, L"r1")) return;
- if (EraseSuffix(text, state, L"eedly", 3, 0, L"r1")) return;
- if (!(EraseSuffix(text, state, L"ed", 2, -1, L"aeiouy") || EraseSuffix(text, state, L"edly", 4, -1, L"aeiouy") || EraseSuffix(text, state, L"ing", 3, -1, L"aeiouy") || EraseSuffix(text, state, L"ingly", 5, -1, L"aeiouy"))) return;
- if (AddSuffix(text, state, L"at", L'e')) return;
- if (AddSuffix(text, state, L"bl", L'e')) return;
- if (AddSuffix(text, state, L"iz", L'e')) return;
- if (EraseSuffix(text, state, L"bb") || EraseSuffix(text, state, L"dd") || EraseSuffix(text, state, L"ff") || EraseSuffix(text, state, L"gg") || EraseSuffix(text, state, L"mm") || EraseSuffix(text, state, L"nn") || EraseSuffix(text, state, L"pp") || EraseSuffix(text, state, L"rr") || EraseSuffix(text, state, L"tt")) return;
- if (IsShortWord(text, state)) AddSuffix(text, state, L"", L'e');
- }
- static void SetInitialR1(const std::wstring &text, clStemmerState &state)
- {
- if (text.length() >= 5 && text.substr(0, 5) == L"gener") SetR1(state, 5);
- else if (text.length() >= 6 && text.substr(0, 6) == L"commun") SetR1(state, 6);
- else if (text.length() >= 5 && text.substr(0, 5) == L"arsen") SetR1(state, 5);
- else FindR1(text, state, L"aeiouy");
- }
- static void InitializeStemming(std::wstring &text, clStemmerState &state)
- {
- state.FirstVowel = -1;
- ResetRValues(state);
- RemovePossessiveSuffix(text);
- }
- static void UpdateRSections(const std::wstring &text, clStemmerState &state)
- {
- i64 len = text.length();
- state.R1 = (state.R1 > len ? len : state.R1);
- state.R2 = (state.R2 > len ? len : state.R2);
- state.RV = (state.RV > len ? len : state.RV);
- }
- static bool IsVowel(const wchar_t character) { return IsOneOf(character, L"aeiouy"); }
- static void ResetRValues(clStemmerState &state) { state.R1 = state.R2 = state.RV = -1; }
// Returns true when `character` equals `ch1` or `ch2`.
// Both comparisons are written out in full: `character == ch1 || ch2` would
// test `ch2` for being non-zero and therefore be true for every input.
static bool IsEither(wchar_t character, wchar_t ch1, wchar_t ch2)
{
    return character == ch1 || character == ch2;
}
- static void FindR1(const std::wstring &text, clStemmerState &state, const wchar_t *vowelList)
- {
- i64 start = text.find_first_of(vowelList, 0);
- state.R1 = (start == -1) ? text.length() : text.find_first_not_of(vowelList, ++start);
- if (state.R1 != -1) ++state.R1;
- else state.R1 = text.length();
- }
- static void FindR2(const std::wstring &text, clStemmerState &state, const wchar_t *vowelList)
- {
- i64 start = (state.R1 != text.length()) ? text.find_first_of(vowelList, state.R1) : -1;
- state.R2 = (start != -1 && start != text.length() - 1) ? text.find_first_not_of(vowelList, ++start) : text.length();
- state.R2 = (state.R2 == -1) ? text.length() : ++state.R2;
- }
- static bool EraseSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix)
- {
- if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
- {
- text.erase(text.length() - suffix.length());
- UpdateRSections(text, state);
- return true;
- }
- return false;
- }
// Returns true when `ch` has one of the code values this stemmer accepts as an
// apostrophe: 39, 146, 180 or 0x2019.
static bool IsApostrophe(const wchar_t &ch)
{
    switch (ch)
    {
    case 39:
    case 146:
    case 180:
    case 0x2019:
        return true;
    default:
        return false;
    }
}
- static bool EndsWithShortSyllable(const std::wstring &text, const clStemmerState &state, const i64 length)
- {
- if (length == 2) return IsVowel(text[0]) && !IsVowel(text[1]);
- else if (length > 2)
- {
- i64 start = text.find_last_of(L"aeiouy", length - 1); if (start == -1) return false;
- return start > 0 && start == (length - 2) && !IsVowel(text[start + 1]) && !IsOneOf(text[start + 1], L"wx") && !IsEither(text[start + 1], s_LOWER_Y_HASH, s_UPPER_Y_HASH) && !IsVowel(text[start - 1]);
- }
- return false;
- }
- static bool AddSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, const wchar_t add)
- {
- if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
- {
- text += add;
- FindR2(text, state, L"aeiouy");
- return true;
- }
- return false;
- }
- static void Step1C(std::wstring &text) { if (text.length() > 2 && !IsVowel(text[text.length() - 2]) && text.back() == L'y') text.back() = L'i'; }
// Returns true when `character` occurs in the NUL-terminated list `characters`.
// The terminator itself never matches.
static bool IsOneOf(wchar_t character, const wchar_t *characters)
{
    for (const wchar_t *cursor = characters; *cursor != L'\0'; ++cursor)
    {
        if (*cursor == character) return true;
    }
    return false;
}
- static bool IsShortWord(const std::wstring &text, const clStemmerState &state) { return EndsWithShortSyllable(text, state, text.length()) && state.R1 == text.length(); }
- static void UnhashY(std::wstring &text) { std::replace(text.begin(), text.end(), s_LOWER_Y_HASH, s_LOWER_Y); std::replace(text.begin(), text.end(), s_UPPER_Y_HASH, s_UPPER_Y); }
- static bool ReplaceSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, const wchar_t replace, const wchar_t *condition = nullptr, const wchar_t preceding = L'\0')
- {
- if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
- {
- if (condition && wcscmp(condition, L"r1") == 0 && state.R1 > static_cast<i64>(text.length() - suffix.length())) return false;
- if (condition && wcscmp(condition, L"r2") == 0 && state.R2 > static_cast<i64>(text.length() - suffix.length())) return false;
- if (preceding != L'\0' && text[text.length() - suffix.length() - 1] != preceding) return false;
- text.back() = replace;
- return true;
- }
- return false;
- }
- static bool EraseSuffix(std::wstring &text, clStemmerState &state, const std::wstring &suffix, int eraseCount, int minLength = 0, const wchar_t *condition = nullptr, bool addE = false, const wchar_t *additionalVowels = nullptr)
- {
- if (text.length() >= suffix.length() && text.substr(text.length() - suffix.length()) == suffix)
- {
- if (condition && wcscmp(condition, L"r1") == 0 && state.R1 > static_cast<i64>(text.length() - suffix.length())) return false;
- if (condition && wcscmp(condition, L"r2") == 0 && state.R2 > static_cast<i64>(text.length() - suffix.length())) return false;
- if (minLength > 0 && text.length() - suffix.length() < minLength) return false;
- if (additionalVowels && !IsOneOf(text[text.length() - suffix.length() - 1], additionalVowels)) return false;
- text.erase(text.length() - eraseCount);
- UpdateRSections(text, state);
- if (addE) text.back() = L'e';
- return true;
- }
- return false;
- }
// Letters the stemmer refers to by name.
static const wchar_t s_LOWER_Y = 0x79;  // 'y'
static const wchar_t s_UPPER_Y = 0x59;  // 'Y'
static const wchar_t s_LOWER_S = 0x73;  // 's'
static const wchar_t s_UPPER_S = 0x53;  // 'S'
// Placeholder values written in place of 'y' / 'Y' by HashY() and turned back
// into letters by UnhashY().
static const wchar_t s_LOWER_Y_HASH = 9;
static const wchar_t s_UPPER_Y_HASH = 7;
- };
- #endif // clStemmerEnglish_h__
Advertisement
Add Comment
Please, Sign In to add comment