Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <stdio.h>
#include <sys/stat.h>

#include <algorithm>
#include <clocale>
#include <codecvt>
#include <fstream>
#include <iostream>
#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
using namespace std;

// Associates each word that was found with the number of times it occurred.
using WordMap = std::map<std::wstring, int>;

// One (word, count) entry; used when the results are put in order.
using WordFrequency = std::pair<std::wstring, int>;

// Candidate names for a Russian locale. Which locales are installed on the
// current system is not known in advance, so several spellings are listed.
const std::vector<std::string> RU_LOCALES{ "ru_RU.UTF-8", "ru_RU", "ru" };
// Reports whether `name` refers to an existing filesystem entry.
//
// Anything stat() can resolve counts as existing, so a directory or other
// non-regular file also yields true. A null or empty path yields false.
bool FileExists(const char* name) {
    // Guard first: stat() must never be handed a null pointer.
    if (name == nullptr || *name == '\0') {
        return false;
    }
    struct stat info;
    return stat(name, &info) == 0;
}
- // Just in case basic checks for input parameters
- int CheckParams(int argc, char *argv[]) {
- if (argc < 3) {
- throw invalid_argument("Invalid parameters. Two arguments required: input filename and output filename");
- }
- if (!FileExists(argv[1])) {
- char* msg = new char[256];
- snprintf(msg, 256, "Cannot open file \"%s\"", argv[1]);
- throw invalid_argument(msg);
- }
- return 0;
- }
- int main(int argc, char *argv[]) {
- // Enterprise level code here we go
- try {
- CheckParams(argc, argv);
- }
- catch (const exception& e) {
- cerr << e.what();
- return 0;
- }
- // Just kidding
- for (unsigned int i = 0; i < RU_LOCALES.size(); i++) {
- try {
- locale::global(locale(locale(RU_LOCALES[i]), new codecvt_utf8<wchar_t>));
- break;
- }
- catch (const exception&) {
- if (RU_LOCALES.size()-1 == i) {
- cout << "Warning: Cannot set locale \"ru\". Cyrillic characters may not be detected correctly" << endl;
- }
- }
- }
- wstring token;
- WordMap words;
- wifstream input;
- wofstream output;
- input.open(argv[1]);
- output.open(argv[2]);
- // Variables for tranforming and parsing text
- wregex rx(L"[[:alpha:]]+");
- wsmatch word;
- locale currentLocale("");
- while (!input.eof()) {
- input >> token;
- // To lowercase
- transform(
- token.begin(),
- token.end(),
- token.begin(),
- [¤tLocale] (wchar_t c) {
- return tolower(c, currentLocale);
- }
- );
- auto from(token.cbegin());
- // Find words with regex and shove them into a map
- while (regex_search(from, token.cend(), word, rx))
- {
- auto node = words.find(word[0]);
- // if it's already in there increment counter
- if (node != words.cend()) {
- node->second++;
- }
- else {
- words.insert(make_pair(word[0], 1));
- }
- from += word.position() + word.length();
- }
- }
- // Now make an ordered list of words
- vector<WordFrequency> ordered;
- for (auto it = words.cbegin(); it != words.cend(); it++) {
- ordered.push_back(*it);
- }
- sort(
- ordered.begin(),
- ordered.end(),
- [](WordFrequency elem1, WordFrequency elem2) -> bool {
- // first sort by frequency, then lexicographically
- return elem1.second > elem2.second
- || elem1.second == elem2.second && elem1.first < elem2.first;
- }
- );
- // We did it boys
- for (auto it = ordered.cbegin(); it != ordered.cend(); it++) {
- output << it->second << ' ' << it->first << endl;
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement