Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //
- // main.cpp
- // freqs
- //
- // Created by Alexandr Shchukin on 27/02/2017.
- // Copyright © 2017 Alexandr Shchukin. All rights reserved.
- //
- #include <iostream>
- #include <map>
- #include <vector>
- #include <sstream>
- #include <fstream>
- #include <algorithm>
- #include <initializer_list>
- using namespace std;
// A word stored as its raw (UTF-8) bytes.
using String = std::vector<uint8_t>;
- /*
- Not needed for map<String, ...>: the default less<> already orders vectors lexicographically.
- template<>
- struct less<String> {
- bool operator()(String const& left, String const& right) const
- {
- auto lIt = left.begin(), lEnd = left.end();
- auto rIt = right.begin(), rEnd = right.end();
- for(; lIt != lEnd && rIt != rEnd; ++lIt, ++rIt)
- {
- if (*lIt == *rIt)
- {
- continue;
- }
- return *lIt < *rIt;
- }
- return left.size() < right.size();
- }
- };
- */
- typedef map<String, size_t> FreqsTable;
- ostream& operator<<(ostream& stream, String const& s)
- {
- stream.write(reinterpret_cast<const char*>(s.data()), s.size());
- return stream;
- }
// Reads the next rune from `f`.
//
// Only 1-byte and 2-byte UTF-8 sequences are delivered (Russian letters are
// encoded with 2 bytes):
//   * 1-byte: `r` receives the byte value;
//   * 2-byte: `r` receives (lead byte << 8) | continuation byte.
// Longer sequences (3..6 byte lead patterns) are consumed and dropped.
//
// Returns true when `r` holds a rune. Returns false, leaving `r` untouched,
// at end of input, on a read error, after skipping a long sequence, or when a
// 2-byte sequence is truncated or malformed.
bool getRune(std::ifstream& f, uint16_t& r)
{
    // Lead-byte bit patterns for sequences of 2..6 bytes.
    int const L2 = 0xc0;
    int const L3 = 0xe0;
    int const L4 = 0xf0;
    int const L5 = 0xf8;
    int const L6 = 0xfc;

    int const Eof = std::ifstream::traits_type::eof();

    int const lead = f.get();
    if (lead == Eof)
    {
        // Covers both end of file and a failed read.
        return false;
    }

    if ((lead & L2) != L2)
    {
        // Single byte: plain ASCII (or a stray continuation byte, passed through).
        r = static_cast<uint16_t>(lead);
        return true;
    }

    // Long runes are skipped: drop the bytes that follow the lead byte.
    std::streamsize skip = 0;
    if ((lead & L6) == L6)
    {
        skip = 5;
    }
    else if ((lead & L5) == L5)
    {
        skip = 4;
    }
    else if ((lead & L4) == L4)
    {
        skip = 3;
    }
    else if ((lead & L3) == L3)
    {
        skip = 2;
    }
    if (skip > 0)
    {
        f.ignore(skip);
        return false;
    }

    // 2-byte rune: the second byte must exist and look like 10xxxxxx.
    int const trail = f.get();
    if (trail == Eof)
    {
        // Truncated sequence at end of input; do not fabricate a rune from EOF.
        return false;
    }
    if ((trail & 0xc0) != 0x80)
    {
        // Not a continuation byte: put it back so the next call reads it as
        // the start of its own rune instead of losing it.
        f.unget();
        return false;
    }

    r = static_cast<uint16_t>((lead << 8) | trail);
    return true;
}
// Checks whether `r` is a letter and folds it to lower case in place.
//
// `r` is either a single byte (ASCII) or a 2-byte UTF-8 sequence packed as
// (lead byte << 8) | continuation byte. Accepted letters are English a-z/A-Z
// and Russian а-я/А-Я plus ё/Ё.
//
// Returns true if `r` is an accepted letter (upper-case input is replaced by
// its lower-case form); returns false otherwise and leaves `r` unchanged.
bool processRune(uint16_t& r)
{
    uint16_t const Utf8Rus_a  = 0xd0b0; // а
    uint16_t const Utf8Rus_ya = 0xd18f; // я
    uint16_t const Utf8Rus_yo = 0xd191; // ё
    uint16_t const Utf8Rus_A  = 0xd090; // А
    uint16_t const Utf8Rus_Ya = 0xd0af; // Я
    uint16_t const Utf8Rus_Yo = 0xd081; // Ё

    if (r >= 'a' && r <= 'z') // ascii English lower
    {
        return true;
    }
    if (r >= 'A' && r <= 'Z') // ascii English upper
    {
        r = static_cast<uint16_t>('a' + (r - 'A'));
        return true;
    }

    // Everything below is a packed 2-byte sequence, so the low byte has to be
    // a UTF-8 continuation byte (10xxxxxx). Without this check the numeric
    // range а..я would also admit malformed pairs such as 0xd141 or 0xd0c5.
    if ((r & 0x00c0) != 0x0080)
    {
        return false;
    }

    if ((r >= Utf8Rus_a && r <= Utf8Rus_ya) || r == Utf8Rus_yo) // utf-8 Russian lower
    {
        return true;
    }

    if (r >= Utf8Rus_A && r <= Utf8Rus_Ya) // utf-8 Russian upper
    {
        // Packs a code point below U+0800 as its 2-byte UTF-8 encoding.
        auto packTwoByte = [](unsigned cp) -> uint16_t
        {
            unsigned const leadByte  = 0xc0u | (cp >> 6);
            unsigned const trailByte = 0x80u | (cp & 0x3fu);
            return static_cast<uint16_t>((leadByte << 8) | trailByte);
        };

        // А..Я and а..я are contiguous as code points (U+0410.. / U+0430..)
        // but not as packed bytes, so do the case shift on code points.
        unsigned const CodePoint_a = 0x0430;
        unsigned const offset = static_cast<unsigned>(r - Utf8Rus_A);
        r = packTwoByte(CodePoint_a + offset);
        return true;
    }

    if (r == Utf8Rus_Yo) // Ё sits outside the contiguous А..Я block
    {
        r = Utf8Rus_yo; // to lower
        return true;
    }

    return false;
}
- void parseFile(FreqsTable* ft, const char* filename)
- {
- std::ifstream f(filename);
- String s;
- auto tryAdd = [&s, &ft]()
- {
- if (s.size() > 0)
- {
- auto it = ft->find(s);
- if (it != ft->end())
- {
- ++(it->second);
- } else {
- ft->insert(make_pair(s, 1));
- }
- s.clear();
- }
- };
- uint16_t r;
- while (f.good()) {
- if (getRune(f, r) && processRune(r))
- {
- uint16_t const UTF8_PREFIX = 0xc000;
- if ((r & UTF8_PREFIX) == UTF8_PREFIX)
- {
- // utf-8 letter
- s.push_back(static_cast<uint8_t>(r >> 8)); // hi part
- s.push_back(static_cast<uint8_t>(r & 0xff)); // lo part
- } else {
- // ascii letter
- s.push_back(static_cast<uint8_t>(r));
- }
- continue;
- }
- tryAdd();
- }
- tryAdd(); // may be last word
- f.close();
- }
- int main(int argc, const char * argv[])
- {
- FreqsTable ft;
- if (argc < 2)
- {
- cout << "freqs <input file> [<input file> ...] <output file>" << endl;
- return 1;
- }
- for (int i = 1, end = argc-1; i < end; ++i)
- {
- parseFile(&ft, argv[i]);
- }
- typedef FreqsTable::const_iterator Item;
- vector<Item> sorted;
- /* simple variant
- sorted.reserve(ft.size());
- for(Item it = ft.begin(), end = ft.end(); it != end; ++it)
- {
- sorted.push_back(it);
- }
- */
- {
- sorted.resize(ft.size());
- auto begin = ft.begin();
- generate(sorted.begin(), sorted.end(), [&begin]() { return begin++; });
- }
- sort(sorted.begin(), sorted.end(), [](Item const& left, Item const& right)->bool
- {
- return left->second == right->second ? left->first < right->first : left->second > right->second;
- /*
- if (left->second == right->second) {
- return left->first < right->first;
- } else {
- return left->second > right->second;
- }
- */
- });
- ofstream out(argv[argc-1], std::ofstream::out | std::ofstream::trunc);
- if (out.bad())
- {
- return 2;
- }
- for (Item const& it : sorted)
- {
- out << it->second << " " << it->first << endl;
- }
- out.close();
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement