Advertisement
Guest User

Untitled

a guest
Feb 25th, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.93 KB | None | 0 0
#include <stdio.h>
#include <sys/stat.h>

#include <algorithm>
#include <clocale>
#include <codecvt>
#include <fstream>
#include <iostream>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
  13.  
  14. using namespace std;
  15.  
  16. // Map of found words + number of occasions
  17. typedef map<wstring, int> WordMap;
  18. // For ordering
  19. typedef pair<wstring, int> WordFrequency;
  20.  
  21. // Since we don't know which locales current system have installed let's try this list
  22. const vector<string> RU_LOCALES = { "ru_RU.UTF-8", "ru_RU", "ru" };
  23.  
  24. // Yep, I googled this one
  25. bool FileExists(const char* name) {
  26. struct stat buffer;
  27. return (stat(name, &buffer) == 0);
  28. }
  29.  
  30. // Just in case basic checks for input parameters
  31. int CheckParams(int argc, char *argv[]) {
  32. if (argc < 3) {
  33. throw invalid_argument("Invalid parameters. Two arguments required: input filename and output filename");
  34. }
  35. if (!FileExists(argv[1])) {
  36. char* msg = new char[256];
  37. snprintf(msg, 256, "Cannot open file \"%s\"", argv[1]);
  38. throw invalid_argument(msg);
  39. }
  40. return 0;
  41. }
  42.  
  43. int main(int argc, char *argv[]) {
  44. // Enterprise level code here we go
  45. try {
  46. CheckParams(argc, argv);
  47. }
  48. catch (const exception& e) {
  49. cerr << e.what();
  50. return 0;
  51. }
  52. // Just kidding
  53. for (unsigned int i = 0; i < RU_LOCALES.size(); i++) {
  54. try {
  55. locale::global(locale(locale(RU_LOCALES[i]), new codecvt_utf8<wchar_t>));
  56. break;
  57. }
  58. catch (const exception&) {
  59. if (RU_LOCALES.size()-1 == i) {
  60. cout << "Warning: Cannot set locale \"ru\". Cyrillic characters may not be detected correctly" << endl;
  61. }
  62. }
  63. }
  64. wstring token;
  65. WordMap words;
  66. wifstream input;
  67. wofstream output;
  68. input.open(argv[1]);
  69. output.open(argv[2]);
  70. // Variables for tranforming and parsing text
  71. wregex rx(L"[[:alpha:]]+");
  72. wsmatch word;
  73. locale currentLocale("");
  74. while (!input.eof()) {
  75. input >> token;
  76. // To lowercase
  77. transform(
  78. token.begin(),
  79. token.end(),
  80. token.begin(),
  81. [&currentLocale] (wchar_t c) {
  82. return tolower(c, currentLocale);
  83. }
  84. );
  85. auto from(token.cbegin());
  86. // Find words with regex and shove them into a map
  87. while (regex_search(from, token.cend(), word, rx))
  88. {
  89. auto node = words.find(word[0]);
  90. // if it's already in there increment counter
  91. if (node != words.cend()) {
  92. node->second++;
  93. }
  94. else {
  95. words.insert(make_pair(word[0], 1));
  96. }
  97. from += word.position() + word.length();
  98. }
  99. }
  100. // Now make an ordered list of words
  101. vector<WordFrequency> ordered;
  102. for (auto it = words.cbegin(); it != words.cend(); it++) {
  103. ordered.push_back(*it);
  104. }
  105. sort(
  106. ordered.begin(),
  107. ordered.end(),
  108. [](WordFrequency elem1, WordFrequency elem2) -> bool {
  109. // first sort by frequency, then lexicographically
  110. return elem1.second > elem2.second
  111. || elem1.second == elem2.second && elem1.first < elem2.first;
  112. }
  113. );
  114. // We did it boys
  115. for (auto it = ordered.cbegin(); it != ordered.cend(); it++) {
  116. output << it->second << ' ' << it->first << endl;
  117. }
  118. return 0;
  119. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement