Advertisement
rockdrilla

transliterate.cc: cyrillic to ascii transform

Nov 26th, 2014
514
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 7.14 KB | None | 0 0
  1. /*
  2. License: MIT. Id est: Use it. Reuse it. Love it.
  3.  
  4. # build options for overall usage
  5. CXXFLAGS='-std=c++11'
  6. LDFLAGS='-lboost_{system,locale}'
  7.  
  8. # build options for Gentoo fellows
  9. CXXFLAGS='-std=c++11 -g0 -Ofast -flto=2 -flto-compression-level=0 -ffat-lto-objects -march=native -mtune=native'
  10. LDFLAGS='-fuse-linker-plugin -Wl,{-z\,{relro,now},-l{dl,pthread,icu{data,uc,i18n},boost_{system,thread,chrono,locale}}}'
  11.  
  12. NAME=transliterate
  13. build() {
  14.     eval rm $1 ${NAME}{,.o} 2>/dev/null
  15.     eval g++ -o ${NAME}{.o,.cc} ${CXXFLAGS} -c
  16.     eval g++ -o ${NAME}{,.o}    ${CXXFLAGS} ${LDFLAGS}
  17.     eval rm ${NAME}.o 2>/dev/null
  18.     eval objcopy --strip-debug --strip-unneeded ${NAME} 2>/dev/null
  19.     eval LC_ALL=C ls -lgG ${NAME} 2>/dev/null
  20. }
  21. */
  22.  
// Benchmark configuration.
// Random test words are generated for every length from TEST_WORD_MIN_LEN to
// TEST_WORD_MAX_LEN inclusive (equal bounds give a single length). Length is
// counted in generated letters (inner-loop iterations), not in bytes.
#define TEST_WORD_MIN_LEN 1000
#define TEST_WORD_MAX_LEN 1000
// Number of random words generated per word length.
#define TEST_WORD_SEQ_LEN 10000
// When defined, the test loop discards transliteration results instead of
// printing "word -> result" lines to std::cout.
#define TEST_SILENT
  27.  
  28. #include <functional>
  29. #include <iostream>
  30.  
  31. #include <string>
  32. #include <map>
  33. #include <unordered_map>
  34. #include <vector>
  35.  
  36. #include <chrono>
  37. #include <thread>
  38. #include <random>
  39.  
  40. //package "libboost-locale-dev"
  41. #include <boost/locale/boundary.hpp>
  42. #include <boost/locale/conversion.hpp>
  43. #include <boost/locale/generator.hpp>
  44.  
// Base transliteration table: each key is a single lowercase Russian letter,
// each value is its ASCII replacement (one to three characters; the hard and
// soft signs map to "`" and "'").
// initialize_engine() derives the lookup table actually used at run time
// (lower- and upper-case variants) from these entries.
static std::map<std::string, std::string> rules_base {
{"а", "a"},
{"б", "b"},
{"в", "v"},
{"г", "g"},
{"д", "d"},
{"е", "e"},
{"ё", "yo"},
{"ж", "zh"},
{"з", "z"},
{"и", "i"},
{"й", "y"},
{"к", "k"},
{"л", "l"},
{"м", "m"},
{"н", "n"},
{"о", "o"},
{"п", "p"},
{"р", "r"},
{"с", "s"},
{"т", "t"},
{"у", "u"},
{"ф", "f"},
{"х", "h"},
{"ц", "ts"},
{"ч", "ch"},
{"ш", "sh"},
{"щ", "sch"},
{"ъ", "`"},
{"ы", "y"},
{"ь", "'"},
{"э", "e"},
{"ю", "yu"},
{"я", "ya"},
};
  80.  
  81. // -------------------
  82.  
// Forward declaration of the empty function defined at the end of the file.
// It is passed to measure<>::execution() before the real timed calls.
void nop(void);
  84.  
  85. // -------------------
  86.  
  87. std::string lowercase(const std::string & target) {
  88.     return boost::locale::to_lower(
  89.         boost::locale::normalize(target, boost::locale::norm_nfc)
  90.     );
  91. }
  92.  
  93. std::string uppercase(const std::string & target) {
  94.     return boost::locale::to_upper(
  95.         boost::locale::normalize(target, boost::locale::norm_nfc)
  96.     );
  97. }
  98.  
  99. std::string titlecase(const std::string & target) {
  100.     return boost::locale::to_title(
  101.         boost::locale::normalize(target, boost::locale::norm_nfc)
  102.     );
  103. }
  104.  
  105. // -------------------
  106.  
  107. void initialize_locale(void) {
  108.     boost::locale::generator gen;
  109.     std::locale loc = gen("");
  110.     std::locale::global(loc);
  111.  
  112.     std::cin.imbue(loc);
  113.     std::cout.imbue(loc);
  114.     std::cerr.imbue(loc);
  115. }
  116.  
  117. static std::unordered_map<std::string, std::string> rules;
  118. static std::vector<std::string> letters;
  119. void initialize_engine(void) {
  120.     rules.clear();
  121.     letters.clear();
  122.  
  123.     for (const auto & rule : rules_base) {
  124.         rules[uppercase(rule.first)] = titlecase(rule.second);
  125.         rules[lowercase(rule.first)] = lowercase(rule.second);
  126.         letters.push_back(rule.first);
  127.     }
  128.  
  129.     letters.shrink_to_fit();
  130. }
  131.  
  132. // -------------------
  133.  
  134. std::string transliterate_1(const std::string & text) {
  135.     std::string result;
  136.  
  137.     boost::locale::boundary::ssegment_index
  138.         index( boost::locale::boundary::character, text.begin(), text.end() );
  139.  
  140.     boost::locale::boundary::ssegment_index::iterator
  141.         current, end;
  142.  
  143.     for (current = index.begin(), end = index.end(); current != end; ++current) {
  144.         auto rule = rules.find(*current);
  145.         if (rule != rules.end()) {
  146.             result.append(rule->second);
  147.         } else {
  148.             result.append(*current);
  149.         }
  150.     }
  151.  
  152.     return result;
  153. }
  154.  
  155. /*
  156. //TODO: rewrite code
  157. std::string transliterate_2(const std::string & text) {
  158.     std::string result;
  159.  
  160.     boost::locale::boundary::ssegment_index
  161.         index( boost::locale::boundary::character, text.begin(), text.end() );
  162.  
  163.     boost::locale::boundary::ssegment_index::iterator
  164.         current, end;
  165.  
  166.     for (current = index.begin(), end = index.end(); current != end; ++current) {
  167.         auto rule = rules.find(*current);
  168.         if (rule != rules.end()) {
  169.             result.append(rule->second);
  170.         } else {
  171.             result.append(*current);
  172.         }
  173.     }
  174.  
  175.     return result;
  176. }
  177. //*/
  178.  
  179. // -------------------
  180.  
  181. /*
  182. source taken from:
  183. http://codereview.stackexchange.com/questions/48872/measuring-execution-time-in-c
  184. */
  185. template<typename TimeT = std::chrono::milliseconds>
  186. struct measure {
  187.     template<typename F, typename ...Args>
  188.     static typename TimeT::rep execution(F func, Args&&... args) {
  189.         auto start = std::chrono::system_clock::now();
  190.         func(std::forward<Args>(args)...);
  191.         auto duration = std::chrono::duration_cast<TimeT>(std::chrono::system_clock::now() - start);
  192.         return duration.count();
  193.     }
  194. };
  195.  
  196. // -------------------
  197.  
  198. static std::vector<std::string> words;
  199. void generate_test_data(void) {
  200.     std::string tmp;
  201.  
  202.     words.clear();
  203.  
  204.     //lowercase letters
  205.     tmp.clear();
  206.     for (const auto & rule : rules_base) {
  207.         tmp.append(lowercase(rule.first));
  208.     }
  209.     words.push_back(tmp);
  210.  
  211.     //uppercase letters
  212.     tmp.clear();
  213.     for (const auto & rule : rules_base) {
  214.         tmp.append(uppercase(rule.first));
  215.     }
  216.     words.push_back(tmp);
  217.  
  218.     //generate some letter sequences
  219.     std::minstd_rand gen;
  220.     std::uniform_int_distribution<int> dist(0, letters.size() - 1);
  221.  
  222.     uint i, k, n;
  223.     for (i = TEST_WORD_MIN_LEN; i < TEST_WORD_MAX_LEN + 1; ++i) {
  224.         for (k = 0; k < TEST_WORD_SEQ_LEN; ++k) {
  225.             tmp.clear();
  226.             for (n = 0; n < i; ++n) {
  227.                 tmp.append(rules[letters[dist(gen)]]);
  228.             }
  229.             words.push_back(tmp);
  230.         }
  231.     }
  232.  
  233. //  std::cerr << "words.size     = " << words.size() << std::endl;
  234. //  std::cerr << "words.capacity = " << words.capacity() << std::endl;
  235. //  std::cerr << "words.shrink_to_fit()" << std::endl;
  236.     words.shrink_to_fit();
  237. //  std::cerr << "words.capacity = " << words.capacity() << std::endl;
  238. }
  239.  
  240. void run_test_1(void) {
  241.     for (const auto & word : words) {
  242. #ifdef TEST_SILENT
  243.         transliterate_1(word);
  244. #else
  245.         std::cout << word << " -> " << transliterate_1(word) << std::endl;
  246. #endif
  247.     }
  248. }
  249.  
  250. /*
  251. void run_test_2(void) {
  252.     for (const auto & word : words) {
  253. #ifdef TEST_SILENT
  254.         transliterate_2(word);
  255. #else
  256.         std::cout << word << " -> " << transliterate_2(word) << std::endl;
  257. #endif
  258.     }
  259. }
  260. //*/
  261.  
  262. void run_tests(void) {
  263.     auto duration = measure<>::execution(nop);
  264.  
  265. //  std::cerr << "sleep 3 seconds before test #1" << std::endl;
  266.     std::this_thread::sleep_for(std::chrono::seconds(3));
  267.     std::cerr << "run test #1 ... ";
  268.     duration = measure<>::execution(run_test_1);
  269.     std::cerr << duration << " milliseconds" << std::endl;
  270.  
  271. /*
  272. //  std::cerr << "sleep 3 seconds before test #2" << std::endl;
  273.     std::this_thread::sleep_for(std::chrono::seconds(3));
  274.     std::cerr << "run test #2 ... ";
  275.     duration = measure<>::execution(run_test_2);
  276.     std::cerr << duration << " milliseconds" << std::endl;
  277. //*/
  278. }
  279.  
  280. // -------------------
  281.  
  282. int main(int argc, char* argv[]) {
  283.     std::cerr
  284.     << "test configuration:"
  285.     << std::endl
  286.     << "  word length:  [" << TEST_WORD_MIN_LEN << ".." << TEST_WORD_MAX_LEN << "]"
  287.     << std::endl
  288.     << "  sample count: " << TEST_WORD_SEQ_LEN
  289.     << std::endl;
  290.  
  291.     initialize_locale();
  292.     initialize_engine();
  293.  
  294.     auto duration = measure<>::execution(nop);
  295.  
  296.     std::cerr << "generate data for tests ... ";
  297.     duration = measure<>::execution(generate_test_data);
  298.     std::cerr << duration << " milliseconds" << std::endl;
  299.  
  300. //  std::cerr << "run tests..." << std::endl;
  301.     run_tests();
  302.  
  303.     return 0;
  304. }
  305.  
  306. void nop(void) { ; }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement