Advertisement
Guest User

Untitled

a guest
Feb 28th, 2017
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.77 KB | None | 0 0
  1. //
  2. // main.cpp
  3. // freqs
  4. //
  5. // Created by Alexandr Shchukin on 27/02/2017.
  6. // Copyright © 2017 Alexandr Shchukin. All rights reserved.
  7. //
  8.  
  9. #include <iostream>
  10. #include <map>
  11. #include <vector>
  12. #include <sstream>
  13. #include <fstream>
  14. #include <algorithm>
  15. #include <initializer_list>
  16.  
  17. using namespace std;
  18.  
  19. typedef vector<uint8_t> String;
  20.  
  21. /*
  22. There is no need for this specialization with map<String, ...>: the default less<> already compares vectors lexicographically.
  23.  
  24. template<>
  25. struct less<String> {
  26. bool operator()(String const& left, String const& right) const
  27. {
  28. auto lIt = left.begin(), lEnd = left.end();
  29. auto rIt = right.begin(), rEnd = right.end();
  30. for(; lIt != lEnd && rIt != rEnd; ++lIt, ++rIt)
  31. {
  32. if (*lIt == *rIt)
  33. {
  34. continue;
  35. }
  36. return *lIt < *rIt;
  37. }
  38. return left.size() < right.size();
  39. }
  40. };
  41. */
  42.  
  43. typedef map<String, size_t> FreqsTable;
  44.  
  45. ostream& operator<<(ostream& stream, String const& s)
  46. {
  47. stream.write(reinterpret_cast<const char*>(s.data()), s.size());
  48. return stream;
  49. }
  50.  
  51. // Russian rune encoded with 2 byte
  52. bool getRune(ifstream& f, uint16_t& r) {
  53. int const L2 = 0xc0;
  54. int const L3 = 0xe0;
  55. int const L4 = 0xf0;
  56. int const L5 = 0xf8;
  57. int const L6 = 0xfc;
  58.  
  59. int c = f.get();
  60. if (f.eof())
  61. return false;
  62.  
  63. // TODO check input error for next reading
  64.  
  65. // skip long runes
  66. if ((c & L6) == L6)
  67. {
  68. f.ignore(5);
  69. }
  70. else if ((c & L5) == L5)
  71. {
  72. f.ignore(4);
  73. }
  74. else if ((c & L4) == L4)
  75. {
  76. f.ignore(3);
  77. }
  78. else if ((c & L3) == L3)
  79. {
  80. f.ignore(2);
  81. }
  82. else if ((c & L2) == L2)
  83. {
  84. r = static_cast<uint16_t>((c << 8) | f.get());
  85. return true;
  86. }
  87. else
  88. {
  89. r = static_cast<uint16_t>(c);
  90. return true;
  91. }
  92. return false;
  93. }
  94.  
  95. // check letter and to lower
  96. // return true if letter is valid
  97. bool processRune(uint16_t& r)
  98. {
  99. uint16_t const Utf8Rus_a = 0xd0b0;
  100. uint16_t const Utf8Rus_ya = 0xd18f;
  101. uint16_t const Utf8Rus_yo = 0xd191;
  102. uint16_t const Utf8Rus_A = 0xd090;
  103. uint16_t const Utf8Rus_Ya = 0xd0af;
  104. uint16_t const Utf8Rus_Yo = 0xd081;
  105.  
  106.  
  107. if ((r >= 'a' && r <= 'z')) // ascii English lower
  108. {
  109. return true;
  110. }
  111. else if (r >= 'A' && r <= 'Z') // ascii English upper
  112. {
  113. r = 'a' + (r - 'A');
  114. return true;
  115. }
  116.  
  117. /*
  118. auto toUtf8Code = [](unsigned short u)->unsigned short
  119. {
  120. unsigned short hi = u & 0x1f00;
  121. unsigned short lo = u & 0x003f;
  122. return ((lo << 2) | hi) >> 2;
  123. };
  124. */
  125.  
  126. auto fromUtf8Code = [](uint16_t u)->uint16_t
  127. {
  128. uint16_t hi = ((u << 2) & 0x1f00);
  129. uint16_t lo = (u & 0x003f);
  130. return (hi | 0xc000) | (lo | 0x80);
  131. };
  132.  
  133.  
  134. //unsigned short u = toUtf8Code(r);
  135. //if((u >= 0x0430 && u <= 0x044f) || u == 0x0451) // utf-8 Russian lower
  136. if((r >= Utf8Rus_a && r <= Utf8Rus_ya) || r == Utf8Rus_yo) // utf-8 Russian lower
  137. {
  138. return true;
  139. }
  140.  
  141. //if(u >= 0x0410 && u <= 0x042f) // utf-8 Russian upper
  142. if(r >= Utf8Rus_A && r <= Utf8Rus_Ya) // utf-8 Russian upper
  143. {
  144. uint16_t const UTF8_num_a = 0x0430;
  145. uint16_t d = r - Utf8Rus_A;
  146. uint16_t t = fromUtf8Code(UTF8_num_a + d);
  147. r = t;
  148. return true;
  149. }
  150.  
  151. if(r == Utf8Rus_Yo) // Ё (Russian upper yo :-) )
  152. {
  153. r = Utf8Rus_yo; // to lower
  154. return true;
  155. }
  156.  
  157. return false;
  158. }
  159.  
  160. void parseFile(FreqsTable* ft, const char* filename)
  161. {
  162. std::ifstream f(filename);
  163. String s;
  164.  
  165. auto tryAdd = [&s, &ft]()
  166. {
  167. if (s.size() > 0)
  168. {
  169. auto it = ft->find(s);
  170. if (it != ft->end())
  171. {
  172. ++(it->second);
  173. } else {
  174. ft->insert(make_pair(s, 1));
  175. }
  176. s.clear();
  177. }
  178. };
  179.  
  180. uint16_t r;
  181. while (f.good()) {
  182. if (getRune(f, r) && processRune(r))
  183. {
  184. uint16_t const UTF8_PREFIX = 0xc000;
  185. if ((r & UTF8_PREFIX) == UTF8_PREFIX)
  186. {
  187. // utf-8 letter
  188. s.push_back(static_cast<uint8_t>(r >> 8)); // hi part
  189. s.push_back(static_cast<uint8_t>(r & 0xff)); // lo part
  190. } else {
  191. // ascii letter
  192. s.push_back(static_cast<uint8_t>(r));
  193. }
  194. continue;
  195. }
  196. tryAdd();
  197. }
  198. tryAdd(); // may be last word
  199.  
  200. f.close();
  201. }
  202.  
  203. int main(int argc, const char * argv[])
  204. {
  205. FreqsTable ft;
  206.  
  207. if (argc < 2)
  208. {
  209. cout << "freqs <input file> [<input file> ...] <output file>" << endl;
  210. return 1;
  211. }
  212.  
  213. for (int i = 1, end = argc-1; i < end; ++i)
  214. {
  215. parseFile(&ft, argv[i]);
  216. }
  217.  
  218.  
  219. typedef FreqsTable::const_iterator Item;
  220.  
  221. vector<Item> sorted;
  222. /* simple variant
  223. sorted.reserve(ft.size());
  224. for(Item it = ft.begin(), end = ft.end(); it != end; ++it)
  225. {
  226. sorted.push_back(it);
  227. }
  228. */
  229. {
  230. sorted.resize(ft.size());
  231. auto begin = ft.begin();
  232. generate(sorted.begin(), sorted.end(), [&begin]() { return begin++; });
  233. }
  234.  
  235. sort(sorted.begin(), sorted.end(), [](Item const& left, Item const& right)->bool
  236. {
  237. return left->second == right->second ? left->first < right->first : left->second > right->second;
  238. /*
  239. if (left->second == right->second) {
  240. return left->first < right->first;
  241. } else {
  242. return left->second > right->second;
  243. }
  244. */
  245. });
  246.  
  247.  
  248. ofstream out(argv[argc-1], std::ofstream::out | std::ofstream::trunc);
  249. if (out.bad())
  250. {
  251. return 2;
  252. }
  253.  
  254. for (Item const& it : sorted)
  255. {
  256. out << it->second << " " << it->first << endl;
  257. }
  258. out.close();
  259.  
  260. return 0;
  261. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement