Advertisement
Ladies_Man

Spellchecker (Проверка орфографии на основе биграмм)

Dec 6th, 2014
249
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.44 KB | None | 0 0
  1. #include <iostream>
  2. #include <vector>
  3. #include <iterator>
  4. #include <string>
  5. #include <string.h>
  6. #include <fstream>
  7. #include <set>
  8. #include <map>
  9.  
  10. using namespace std;
  11.  
  12. int get_tokens(const string s, vector<string> &tokens)
  13. {
  14.     string newstr;
  15.     newstr.clear();
  16.     for (int i = 0, mark = 0; i < s.size(); i++, mark = 0) {
  17.         if (s[i] >= 'a' && s[i] <= 'z') newstr += s[i];
  18.         if (s[i] == '\n') mark = 1;
  19.         if (!newstr.empty() && mark == 1) {
  20.             tokens.push_back(newstr);
  21.             newstr.clear();
  22.         }
  23.     }
  24.     return tokens.size();
  25. }
  26.  
  27. float dice_coefficient(set<string> a, set<string> b)
  28. {
  29.     int intersection = 0, join;
  30.     for(set<string>::iterator IT = b.begin(); IT != b.end(); IT++) intersection += a.count((*IT));
  31.     join = a.size() + b.size() - intersection;
  32.     float dice = (float)intersection / (float)join;
  33.     return dice;
  34. }
  35.  
  36. set<string> make_bi(string a)   //Назовём биграммой слова его подстроку длиной в две буквы.
  37. {
  38.     set<string> retset;
  39.     for (int k = 0; k < a.length() - 1; k++) retset.insert(a.substr(k, 2));
  40.     return retset;
  41. }
  42.  
  43. int main()
  44. {
  45. //Tests:
  46.     //string intext = "prepearing\ngoverment\ncomming\nquickle\njouvenile\n";               //1
  47.     //string intext = "beatiful\ntogegether\nenvolving\nilness\nepidemia\n";              //2
  48.     //string intext = "anual\nsincerly\ncluching\nmentaly\nballons\ngirle\nbrethless\n";    //4
  49.     //string intext = "finaly\nexausted\ngrabed\npubliclly\nexcelent\ncontageous\nbegining\nnobady\nhappenin\ninnecessary\n"; //5
  50.     //string intext = "epidemy\nbycicle\ndamadged\nstollen\ndeliceaus\npreventions\nhollidays\nfamilly\nradios\nsympthoms\n"; //6
  51.     //string intext = "affraid\nmeasurment\nappologized\nsimptoms\natribute\npannic\nsincerly\nhuredly\nstoped\nacused\n";      //7
  52.  
  53.  
  54.     string intext( (istreambuf_iterator<char>(cin)),(istreambuf_iterator<char>()) );
  55.     intext += "\n";
  56.     map    < string, string > in_out;       // map  < incorrect_word, correct_word >
  57.     vector < set   < string > > check_word_bi;  // vector of sets of incorrects_word's bigramms
  58.     vector < string > check_word;       // incorrect_words
  59.     vector < int > freq_vect;           // correct_word's frequencies
  60.     vector < float > dice_vect;         // dice_coefficients
  61.     string word;
  62.     int i, word_num = get_tokens(intext, check_word), freq;
  63.     float newdice;
  64.  
  65.     for (i = 0; i < word_num; i++) {
  66.         check_word_bi.push_back(make_bi(check_word[i]));   //makes a bigramm(set) for check_word and pushes it into vector
  67.         dice_vect.push_back(0);
  68.         freq_vect.push_back(0);
  69.     }
  70.  
  71.     ifstream infile;
  72.     infile.open("count_big.txt");       //text format: correct_word(word) word_frequency(number) \n
  73.  
  74.     while (infile >> word >> freq) {
  75.         set <string> word_bi = make_bi(word);
  76.         for (i = 0; i < word_num; i++) {
  77.             newdice = dice_coefficient(word_bi, check_word_bi[i]);
  78.             if (newdice > dice_vect[i]) {
  79.                 dice_vect[i] = newdice;
  80.                 in_out[check_word[i]] = word;
  81.                 freq_vect[i] = freq;
  82.             } else {
  83.                 if (newdice == dice_vect[i] && freq > freq_vect[i]) {
  84.                     in_out[check_word[i]] = word;
  85.                     freq_vect[i] = freq;
  86.                 }
  87.             }
  88.         }
  89.     }
  90.  
  91.     for (i = 0; i < word_num; i++) cout << in_out[check_word[i]] << endl;
  92.     infile.close();
  93.     return 0;
  94. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement