Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*#########################################################################
- sp_check.h
- This header takes care about spell-checking from Levenshtein distance to
- diacritics. This is one of the Dr.Text project headers.
- Written by Andrei Zene, 2010.
- #########################################################################*/
- #define COPY_LETTER 0
- #define INF 32000
- int DIACRITIC = 1;
- int DELETE_LETTER = 3;
- int INSERT_LETTER = 3;
- int REPLACE_LETTER = 4;
- int TWIDDLE_LETTERS = 3;
- const TCHAR KEYBOARD_MAP[32][15] = {
- _T("aqwszăâ"), // a 0
- _T("bnhgvp"), // b 1
- _T("cxdfv"), // c 2
- _T("dserfcxt"), // d ..
- _T("ewsdr"), // e
- _T("fdrtgvc"), // f
- _T("gcftyzhbv"), // g
- _T("hgyzujnb"), // h
- _T("iuokjî"), //i
- _T("juikmnh"), // j
- _T("kciolmj"), // k
- _T("lopkş"), // l
- _T("mnjk"), // m
- _T("nbhjm"), // n
- _T("oiklp"), // o
- _T("polăşb"), // p
- _T("qwsa"), // q
- _T("redft"), //r
- _T("sşawedxzy"), // s
- _T("tţrfgzyd"), // t
- _T("uikjhzy"), // u
- _T("vbgfcw"), // v
- _T("wsaeqv"), //w
- _T("xyzsdc"), //x
- _T("yztujhgxsa"), //y
- _T("yztujhgxsa"), //z
- _T("ăaâîţşp"), // ă
- _T("âî"), // â
- _T("îiâăţş"), // î
- _T("şţăpol"),// ş
- _T("ţşpăî"),// ţ
- _T("/")
- };
- const TCHAR caps = _T('A'-'a');
- int is_letter(TCHAR letter)
- {
- if(letter >= _T('a') && letter <= _T('z'))
- return 1;
- return 0;
- }
- int is_diacritic(TCHAR letter, TCHAR special_char)
- {
- if(
- (letter == _T('ă')) || (letter == _T('ţ')) ||
- (letter == _T('ş')) || (letter == _T('î')) ||
- (letter == _T('â')) || (special_char != 0 && letter == special_char)
- )
- return 1;
- return 0;
- }
- int is_word(CString cuvant)
- {
- int i;
- for(i = 0; i < cuvant.GetLength(); i++)
- if(i > 0) {
- if(!::IsCharAlphaW(cuvant[i]) && !is_diacritic(cuvant[i],_T('-')))
- return 0;
- }
- else
- if(!::IsCharAlphaW(cuvant[i]) && !is_diacritic(cuvant[i],0))
- return 0;
- if(i != 0)
- return 1;
- else
- return 0;
- }
- int minimul(int a, int b, int c, int d, int e, int f, int &which)
- {
- int minim = a; which = -1;
- if(b < minim) {minim = b;which = 0;}
- if(c < minim) {minim = c;which = 1;}
- if(d < minim) {minim = d;which = 2;}
- if(e < minim) {minim = e;which = 3;}
- if(f < minim) {minim = f;which = 4;}
- return minim;
- }
- int minimul(int a, int b, int c, int d, int e, int f)
- {
- int minim = a;
- if(b < minim) minim = b;
- if(c < minim) minim = c;
- if(d < minim) minim = d;
- if(e < minim) minim = e;
- if(f < minim) minim = f;
- return minim;
- }
- int needs_diacritic(TCHAR a, TCHAR b)
- {
- if((a == _T('a') && b == _T('ă')) || (a == _T('ă') && b == _T('a')))
- return 1;
- if((a == _T('a') && b == _T('â')) || (a == _T('â') && b == _T('a')))
- return 1;
- if((a == _T('i') && b == _T('î')) || (a == _T('î') && b == _T('i')))
- return 1;
- if((a == _T('i') && b == _T('â')) || (a == _T('î') && b == _T('â')))
- return 1;
- if((a == _T('s') && b == _T('ş')) || (a == _T('ş') && b == _T('s')))
- return 1;
- if((a == _T('t') && b == _T('ţ')) || (a == _T('ţ') && b == _T('t')))
- return 1;
- return 0;
- }
- int Levenshtein(CString a, CString b, int MINIMUM_DISTANCE)
- {
- int c[30][30], n = a.GetLength(), m = b.GetLength();
- // initializarea matricei de costuri
- c[0][0] = 0;
- for(int i = 1; i <= m; i++)
- c[0][i] = i*INSERT_LETTER;
- for(int i = 1; i <= n; i++)
- c[i][0] = i*DELETE_LETTER;
- //ne construim matricea de costuri
- int c_copy, c_delete, c_insert, c_replace, c_twiddle, c_diacritic;
- for(int i = 1; i <= n; i++)
- for(int j = 1; j <= m; j++) {
- if(minimul(c[i][j-1], c[i-1][j], c[i-1][j-1], INF, INF, INF) <= MINIMUM_DISTANCE) {
- c_copy = c_twiddle = c_diacritic = INF;
- if(a[i-1] == b[j-1])
- c_copy = COPY_LETTER + c[i-1][j-1];
- else if(needs_diacritic(a[i-1], b[j-1]))
- c_diacritic = DIACRITIC + c[i-1][j-1];
- c_delete = DELETE_LETTER + c[i-1][j];
- c_insert = INSERT_LETTER + c[i][j-1];
- c_replace = REPLACE_LETTER + c[i-1][j-1];
- //the replace cost can be reduced if the letter with which we replace is close to the wrong letter on the keyboard
- if(c_copy == INF) {
- if(is_letter(a[i-1]))
- if(wcschr(KEYBOARD_MAP[a[i-1] - 97], b[j-1]))
- c_replace -= 1;
- if(is_diacritic(a[i-1],'-')) {
- int special_asci;
- if(a[i-1] == _T('ă')) special_asci = 26;
- if(a[i-1] == _T('â')) special_asci = 27;
- if(a[i-1] == _T('î')) special_asci = 28;
- if(a[i-1] == _T('ş')) special_asci = 29;
- if(a[i-1] == _T('ţ')) special_asci = 30;
- if(a[i-1] == _T('-')) special_asci = 31;
- if(wcschr(KEYBOARD_MAP[special_asci], b[j-1]))
- c_replace -= 1;
- }
- }
- if(i >= 2 && j >= 2 && a[i-2] == b[j-1] && a[i-1] == b[j-2])
- c_twiddle = TWIDDLE_LETTERS + c[i-2][j-2];
- c[i][j] = minimul(c_copy, c_diacritic, c_delete, c_insert, c_replace, c_twiddle);
- }
- else
- c[i][j] = INF;
- }
- return c[n][m];
- }
- void Levenshtein2(CString a, CString b, int UserStats[2][7])
- {
- int Changes[30][30];
- int c[30][30], n = a.GetLength(), m = b.GetLength();
- // initializarea matricei de costuri
- c[0][0] = 0;
- for(int i = 1; i <= m; i++) {
- c[0][i] = i*INSERT_LETTER;
- Changes[0][i] = 2;
- }
- for(int i = 1; i <= n; i++) {
- c[i][0] = i*DELETE_LETTER;
- Changes[0][i] = 1;
- }
- //ne construim matricea de costuri
- int c_copy, c_delete, c_insert, c_replace, c_twiddle, c_diacritic;
- for(int i = 1; i <= n; i++)
- for(int j = 1; j <= m; j++) {
- c_copy = c_twiddle = c_diacritic = INF;
- if(a[i-1] == b[j-1])
- c_copy = COPY_LETTER + c[i-1][j-1];
- else if(needs_diacritic(a[i-1], b[j-1]))
- c_diacritic = DIACRITIC + c[i-1][j-1];
- c_delete = DELETE_LETTER + c[i-1][j];
- c_insert = INSERT_LETTER + c[i][j-1];
- c_replace = REPLACE_LETTER + c[i-1][j-1];
- //costul de inlocuire poate fi redus daca litera cu care inlocuim e apropiata pe tastatura de litera gresita
- if(c_copy == INF) {
- if(is_letter(a[i-1]))
- if(wcschr(KEYBOARD_MAP[a[i-1] - 97], b[j-1]))
- c_replace -= 1;
- if(is_diacritic(a[i-1],'-')) {
- int special_asci;
- if(a[i-1] == _T('ă')) special_asci = 26;
- if(a[i-1] == _T('â')) special_asci = 27;
- if(a[i-1] == _T('î')) special_asci = 28;
- if(a[i-1] == _T('ş')) special_asci = 29;
- if(a[i-1] == _T('ţ')) special_asci = 30;
- if(a[i-1] == _T('-')) special_asci = 31;
- if(wcschr(KEYBOARD_MAP[special_asci], b[j-1]))
- c_replace -= 1;
- }
- }
- if(i >= 2 && j >= 2 && a[i-2] == b[j-1] && a[i-1] == b[j-2])
- c_twiddle = TWIDDLE_LETTERS + c[i-2][j-2];
- c[i][j] = minimul(c_copy, c_diacritic, c_delete, c_insert, c_replace, c_twiddle, Changes[i][j]);
- }
- //we determin the changes that were made and save them in UserStats
- int Mistakes = 0;
- int j, k;
- j = n; k = m;
- while(j >0 && k >0) {
- if(Changes[j][k] != -1) Mistakes++;
- if(Changes[j][k] == -1) {j--;k--;} //copy_letter
- else if(Changes[j][k] == 0) {j--;k--;UserStats[1][0]++;} //diacritic
- else if(Changes[j][k] == 1) {j--;UserStats[1][1]++;} //delete
- else if(Changes[j][k] == 2) {k--;UserStats[1][2]++;} //insert
- else if(Changes[j][k] == 3) {j--;k--;UserStats[1][3]++;} //replace
- else if(Changes[j][k] == 4) {j -= 2;k -= 2;UserStats[1][4]++;} //twiddle
- }
- UserStats[1][5] += Mistakes;
- }
- void RefreshCosts(int UserStats[2][7])
- {
- DIACRITIC = UserStats[0][0];
- DELETE_LETTER = UserStats[0][1];
- INSERT_LETTER = UserStats[0][2];
- REPLACE_LETTER = UserStats[0][3];
- TWIDDLE_LETTERS = UserStats[0][4];
- }
- //changes Levenshtein costs for calculating distance based on the mistakes he founds in the document
- void ChangeCosts(int UserStats[2][7])
- {
- float percentage_scores[5];
- for(int i = 0; i <= 4; i++) {
- percentage_scores[i] = float(UserStats[1][i]) / float(UserStats[1][5]);
- // if(i == 3) {//replace e caz special pt ca in levenshtein scadem un pct daca litera e apropiata
- if(percentage_scores[i] -(1 - float(float(UserStats[0][i])/4)) >= 0.25)
- UserStats[0][i]--;
- else
- if(percentage_scores[i] != 1 && UserStats[0][i] == 0) UserStats[0][i] = 1;
- else
- if(percentage_scores[i] - (1 - float(float(UserStats[0][i])/4)) < -0.25)
- UserStats[0][i]++;
- }
- /* }
- else {
- if(percentage_scores[i] = 0)
- UserStats[0][i] = 4;
- else
- if(percentage_scores[i] < 0.25)
- UserStats[0][i] = 3;
- else
- if(percentage_scores[i] < 0.5)
- UserStats[0][i] = 2;
- else
- if(percentage_scores[i] < 1)
- UserStats[0][i] = 1;
- else
- UserStats[0][i] = 0;
- }
- }*/
- RefreshCosts(UserStats);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement