Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- #include <vector>
- #include <sstream>
- #include <stdexcept>
- #include <random>
- #include <unordered_map>
- #include <unordered_set>
- using namespace std;
/// Prints the one-line command-line synopsis to standard output.
void usage() {
    static const char synopsis[] = "Usage: program VOCAB_FILE N";
    std::cout << synopsis << '\n';
    std::cout.flush();
}
// Lowercase Cyrillic letters in alphabetical order; ё, й, ъ, ы and ь are not
// listed. No function visible in this file reads this variable.
std::string rusLetters("абвгдежзиклмнопрстуфхцчшщэюя");
/// Reads a vocabulary file, one entry per line.
///
/// Empty lines are skipped. A trailing carriage return is removed from each
/// line so that files with CRLF line endings yield the same entries as files
/// with LF line endings.
///
/// @param vocabFilename Path of the file to read.
/// @return The non-empty lines of the file, in file order.
/// @throws std::runtime_error if the file cannot be opened.
std::vector<std::string> readVocab(const std::string& vocabFilename) {
    std::ifstream infile(vocabFilename);
    if (!infile.is_open())
        throw std::runtime_error("Cannot open vocabulary file: " + vocabFilename);

    std::vector<std::string> vocab;
    std::string line;
    while (std::getline(infile, line)) {
        // getline only strips '\n'; drop the '\r' left behind by CRLF files.
        if (!line.empty() && line.back() == '\r')
            line.pop_back();
        if (!line.empty())
            vocab.push_back(line);
    }
    return vocab;
}
/// Parses a decimal integer from the whole of @p str.
///
/// Leading and trailing whitespace is tolerated; any other trailing
/// characters (as in "12abc" or "3.5") make the input invalid.
///
/// @param str Text to parse.
/// @return The parsed value.
/// @throws std::runtime_error if @p str is not exactly one integer that fits
///         in an int.
int parseInt(const std::string& str) {
    std::istringstream input(str);
    int value = 0;
    if (!(input >> value))
        throw std::runtime_error("Expected integer, got " + str);

    // operator>> skips whitespace, so this succeeds only if a non-space
    // character follows the number.
    char leftover = '\0';
    if (input >> leftover)
        throw std::runtime_error("Expected integer, got " + str);

    return value;
}
/// Maps the first byte of an encoded character to a length in bytes.
///
/// Bytes up to 0x7F map to 1, up to 0xDF to 2, up to 0xEF to 3, and every
/// higher byte to 4. These bounds line up with UTF-8 lead-byte ranges, but no
/// validation is done: bytes 0x80-0xBF are reported as 2 and bytes above 0xF7
/// as 4.
///
/// @param first First byte of the character.
/// @return Length in bytes, between 1 and 4.
size_t characterLength(char first) {
    // Go through unsigned char so bytes >= 0x80 compare as 128..255.
    const unsigned int lead = static_cast<unsigned char>(first);
    if (lead < 0x80u)
        return 1;
    if (lead < 0xE0u)
        return 2;
    if (lead < 0xF0u)
        return 3;
    return 4;
}
- string firstLetter(const string& str) {
- size_t length = characterLength(str.at(0));
- string letter = str.substr(0, length);
- if(letter.size() != length)
- throw runtime_error("Invalid encoding");
- return letter;
- }
/// Writes one "<letter> <weight>" line per weight to standard output,
/// followed by an empty line.
///
/// @param alphabete Letters; must hold at least weights.size() entries,
///                  because it is indexed without bounds checking.
/// @param weights   Weight of the letter at the same index.
void print_weights(const std::vector<std::string>& alphabete, const std::vector<double>& weights) {
    std::size_t position = 0;
    for (const double weight : weights) {
        std::cout << alphabete[position] << " " << weight << std::endl;
        ++position;
    }
    std::cout << std::endl;
}
/// Scales @p weights in place so that the elements sum to 1.
///
/// An empty vector is left untouched.
///
/// @param weights Values to rescale.
/// @throws std::invalid_argument if the vector is non-empty and its elements
///         sum to zero; dividing would otherwise fill it with NaN/inf.
void normalize(std::vector<double>& weights) {
    if (weights.empty())
        return;

    double total = 0;
    for (const double weight : weights)
        total += weight;

    if (total == 0)
        throw std::invalid_argument("Cannot normalize weights whose sum is zero");

    for (double& weight : weights)
        weight /= total;
}
- vector<string> getLetters(int nLetters,
- double alpha,
- const vector<string>& alphabete,
- const vector<string>& vocab) {
- vector<double> weights(alphabete.size());
- std::fill(weights.begin(), weights.end(), alpha);
- unordered_map<string, size_t> letter2id;
- for(size_t i = 0; i < alphabete.size(); ++i)
- letter2id[alphabete[i]] = i;
- for(const string& definition: vocab) {
- string letter = firstLetter(definition);
- weights[letter2id.at(letter)] += 1;
- }
- normalize(weights);
- print_weights(alphabete, weights);
- std::random_device rd;
- std::mt19937 gen(rd());
- std::discrete_distribution<size_t> ds(weights.begin(), weights.end());
- vector<string> result;
- for(int i = 0; i < nLetters; ++i) {
- size_t idx = ds(gen);
- result.push_back(alphabete.at(idx));
- }
- return result;
- }
- vector<string> inferAlphabete(const vector<string>& vocab) {
- std::unordered_set<string> unique_letters;
- for(const string& def : vocab)
- unique_letters.insert(firstLetter(def));
- return std::vector<string>(unique_letters.begin(), unique_letters.end());
- }
- int main(int argc, char* argv[])
- {
- if(argc < 3) {
- usage();
- return 0;
- }
- string vocabFilename = argv[1];
- vector<string> vocab = readVocab(vocabFilename);
- int nLetters = parseInt(argv[2]);
- for(const string& letter : getLetters(nLetters, 0, inferAlphabete(vocab), vocab)) {
- cout << letter << " ";
- }
- cout << endl;
- return 0;
- }
Add Comment
Please, Sign In to add comment