// g++ -O2 -Wall -std=c++11 ctrans.cpp -o ctrans // ./ctrans data/alexa_scrape.txt > inv.txt #include #include #include #include #include #include #include #include #include #include #include using namespace std; struct cat_rank { char *cat; unsigned int rank; }; struct cp_hash { inline size_t operator()(const char *val) const { size_t hash = 5381; char c; while ((c = *val++)) { hash += (hash << 5) + (size_t) c; } return hash; } }; struct cp_equal { inline bool operator()(const char *a, const char *b) const { return strcmp(a,b) == 0; } }; int main(int argc, char **argv) { if (argc != 2) { cout << "expecting input file as argument" << endl; return -1; } // setup for mmap int fd; if ((fd = open(argv[1], O_RDONLY)) == -1) { perror("open"); return 1; } // stat for file size struct stat stats; if (fstat(fd, &stats) == -1) { perror("fstat"); return 1; } // map into memory void *data = nullptr; if ((data = mmap(0, stats.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) == MAP_FAILED) { perror("mmap"); return 1; } // sequential pass madvise(data, stats.st_size, MADV_SEQUENTIAL); // represent each string as an offset and a length, to avoid copying we only save the address char *begin, *end; begin = end = (char *)data; // indexing data cat_rank current_cat = { .cat = nullptr, .rank = 0 }; unordered_map category_totals; unordered_map *, cp_hash, cp_equal> keyvals; while ((end = (char *)memchr(end, '\n', stats.st_size - (end - (char *)data))) != nullptr) { // replace the new line *end = '\0'; // do the real work here if (*begin == '/') { if (current_cat.cat != nullptr) { category_totals[current_cat.cat] = current_cat.rank; } current_cat = { .cat = begin, .rank = 0 }; } else { // parse out the numbers const char *name = (const char *)memchr(begin, ' ', end - begin) + 1; if (keyvals.find(name) == keyvals.end()) { keyvals[name] = new vector(); } keyvals[name]->push_back(current_cat); ++current_cat.rank; } // reset for next line begin = ++end; } // random access madvise(data, stats.st_size, MADV_RANDOM); cout.sync_with_stdio(false); // output results for(auto keypair : keyvals) { cout << keypair.first << "\n"; for (auto rank : *keypair.second) { cout << "(" << rank.rank << "/" << category_totals[rank.cat] << ") " << rank.cat << "\n"; } } cout.flush(); // clean up for (auto keypair : keyvals) { delete keypair.second; } keyvals.clear(); // unmap if (munmap(data, stats.st_size) == -1) { perror("munmap"); return 1; } // close if (close(fd) == -1) { perror("close"); return 1; } return 0; }