Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // g++ -O2 -Wall -std=c++11 ctrans.cpp -o ctrans
- // ./ctrans data/alexa_scrape.txt > inv.txt
- #include <sys/mman.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include <iostream>
- #include <unordered_map>
- #include <vector>
- #include <cstring>
- #include <cstdlib>
- #include <cstdio>
- using namespace std;
- struct cat_rank {
- char *cat;
- unsigned int rank;
- };
- struct cp_hash {
- inline size_t operator()(const char *val) const {
- size_t hash = 5381;
- char c;
- while ((c = *val++)) {
- hash += (hash << 5) + (size_t) c;
- }
- return hash;
- }
- };
- struct cp_equal {
- inline bool operator()(const char *a, const char *b) const {
- return strcmp(a,b) == 0;
- }
- };
- int main(int argc, char **argv) {
- if (argc != 2) {
- cout << "expecting input file as argument" << endl;
- return -1;
- }
- // setup for mmap
- int fd;
- if ((fd = open(argv[1], O_RDONLY)) == -1) {
- perror("open");
- return 1;
- }
- // stat for file size
- struct stat stats;
- if (fstat(fd, &stats) == -1) {
- perror("fstat");
- return 1;
- }
- // map into memory
- void *data = nullptr;
- if ((data = mmap(0, stats.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) == MAP_FAILED) {
- perror("mmap");
- return 1;
- }
- // sequential pass
- madvise(data, stats.st_size, MADV_SEQUENTIAL);
- // represent each string as an offset and a length, to avoid copying we only save the address
- char *begin, *end;
- begin = end = (char *)data;
- // indexing data
- cat_rank current_cat = { .cat = nullptr, .rank = 0 };
- unordered_map<const char *, unsigned int> category_totals;
- unordered_map<const char *, vector<cat_rank> *, cp_hash, cp_equal> keyvals;
- while ((end = (char *)memchr(end, '\n', stats.st_size - (end - (char *)data))) != nullptr) {
- // replace the new line
- *end = '\0';
- // do the real work here
- if (*begin == '/') {
- if (current_cat.cat != nullptr) {
- category_totals[current_cat.cat] = current_cat.rank;
- }
- current_cat = { .cat = begin, .rank = 0 };
- } else {
- // parse out the numbers
- const char *name = (const char *)memchr(begin, ' ', end - begin) + 1;
- if (keyvals.find(name) == keyvals.end()) {
- keyvals[name] = new vector<cat_rank>();
- }
- keyvals[name]->push_back(current_cat);
- ++current_cat.rank;
- }
- // reset for next line
- begin = ++end;
- }
- // random access
- madvise(data, stats.st_size, MADV_RANDOM);
- cout.sync_with_stdio(false);
- // output results
- for(auto keypair : keyvals) {
- cout << keypair.first << "\n";
- for (auto rank : *keypair.second) {
- cout << "(" << rank.rank << "/" << category_totals[rank.cat] << ") " << rank.cat << "\n";
- }
- }
- cout.flush();
- // clean up
- for (auto keypair : keyvals) {
- delete keypair.second;
- }
- keyvals.clear();
- // unmap
- if (munmap(data, stats.st_size) == -1) {
- perror("munmap");
- return 1;
- }
- // close
- if (close(fd) == -1) {
- perror("close");
- return 1;
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement