Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Dec 9th, 2012  |  syntax: C++  |  size: 3.40 KB  |  views: 87  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. // g++ -O2 -Wall -std=c++11 ctrans.cpp -o ctrans
  2. // ./ctrans data/alexa_scrape.txt > inv.txt
  3.  
  4. #include <sys/mman.h>
  5. #include <sys/types.h>
  6. #include <sys/stat.h>
  7. #include <fcntl.h>
  8. #include <unistd.h>
  9.  
  10. #include <iostream>
  11. #include <unordered_map>
  12. #include <vector>
  13.  
  14. #include <cstring>
  15. #include <cstdlib>
  16. #include <cstdio>
  17.  
  18. using namespace std;
  19.  
  20. struct cat_rank {
  21.     char *cat;
  22.     unsigned int rank;
  23. };
  24.  
  25. struct cp_hash {
  26.     inline size_t operator()(const char *val) const {
  27.         size_t hash = 5381;
  28.         char c;
  29.         while ((c = *val++)) {
  30.             hash += (hash << 5) + (size_t) c;
  31.         }
  32.         return hash;
  33.     }
  34. };
  35.  
  36. struct cp_equal {
  37.     inline bool operator()(const char *a, const char *b) const {
  38.         return strcmp(a,b) == 0;
  39.     }
  40. };
  41.  
  42. int main(int argc, char **argv) {
  43.     if (argc != 2) {
  44.         cout << "expecting input file as argument" << endl;
  45.         return -1;
  46.     }
  47.  
  48.     // setup for mmap
  49.     int fd;
  50.     if ((fd = open(argv[1], O_RDONLY)) == -1) {
  51.         perror("open");
  52.         return 1;
  53.     }
  54.  
  55.     // stat for file size
  56.     struct stat stats;
  57.     if (fstat(fd, &stats) == -1) {
  58.         perror("fstat");
  59.         return 1;
  60.     }
  61.  
  62.     // map into memory
  63.     void *data = nullptr;
  64.     if ((data = mmap(0, stats.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) == MAP_FAILED) {
  65.         perror("mmap");
  66.         return 1;
  67.     }
  68.  
  69.     // sequential pass
  70.     madvise(data, stats.st_size, MADV_SEQUENTIAL);
  71.  
  72.     // represent each string as an offset and a length, to avoid copying we only save the address
  73.     char *begin, *end;
  74.     begin = end = (char *)data;
  75.  
  76.     // indexing data
  77.     cat_rank current_cat = { .cat = nullptr, .rank = 0 };
  78.  
  79.     unordered_map<const char *, unsigned int> category_totals;
  80.     unordered_map<const char *, vector<cat_rank> *, cp_hash, cp_equal> keyvals;
  81.  
  82.     while ((end = (char *)memchr(end, '\n', stats.st_size - (end - (char *)data))) != nullptr) {
  83.         // replace the new line
  84.         *end = '\0';
  85.  
  86.         // do the real work here
  87.         if (*begin == '/') {
  88.             if (current_cat.cat != nullptr) {
  89.                 category_totals[current_cat.cat] = current_cat.rank;
  90.             }
  91.             current_cat = { .cat = begin, .rank = 0 };
  92.         } else {
  93.             // parse out the numbers
  94.             const char *name = (const char *)memchr(begin, ' ', end - begin) + 1;
  95.             if (keyvals.find(name) == keyvals.end()) {
  96.                 keyvals[name] = new vector<cat_rank>();
  97.             }
  98.             keyvals[name]->push_back(current_cat);
  99.  
  100.             ++current_cat.rank;
  101.         }
  102.  
  103.         // reset for next line
  104.         begin = ++end;
  105.     }
  106.  
  107.     // random access
  108.     madvise(data, stats.st_size, MADV_RANDOM);
  109.     cout.sync_with_stdio(false);
  110.  
  111.     // output results
  112.     for(auto keypair : keyvals) {
  113.         cout << keypair.first << "\n";
  114.         for (auto rank : *keypair.second) {
  115.             cout << "(" << rank.rank << "/" << category_totals[rank.cat] << ") " << rank.cat << "\n";
  116.         }
  117.     }
  118.     cout.flush();
  119.  
  120.     // clean up
  121.     for (auto keypair : keyvals) {
  122.         delete keypair.second;
  123.     }
  124.     keyvals.clear();
  125.  
  126.     // unmap
  127.     if (munmap(data, stats.st_size) == -1) {
  128.         perror("munmap");
  129.         return 1;
  130.     }
  131.  
  132.     // close
  133.     if (close(fd) == -1) {
  134.         perror("close");
  135.         return 1;
  136.     }
  137.  
  138.     return 0;
  139. }
clone this paste RAW Paste Data