// g++ -O2 -Wall -std=c++11 ctrans.cpp -o ctrans
// ./ctrans data/alexa_scrape.txt > inv.txt
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <iostream>
#include <unordered_map>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <cstdio>
using namespace std;
struct cat_rank {
char *cat;
unsigned int rank;
};
struct cp_hash {
inline size_t operator()(const char *val) const {
size_t hash = 5381;
char c;
while ((c = *val++)) {
hash += (hash << 5) + (size_t) c;
}
return hash;
}
};
struct cp_equal {
inline bool operator()(const char *a, const char *b) const {
return strcmp(a,b) == 0;
}
};
int main(int argc, char **argv) {
if (argc != 2) {
cout << "expecting input file as argument" << endl;
return -1;
}
// setup for mmap
int fd;
if ((fd = open(argv[1], O_RDONLY)) == -1) {
perror("open");
return 1;
}
// stat for file size
struct stat stats;
if (fstat(fd, &stats) == -1) {
perror("fstat");
return 1;
}
// map into memory
void *data = nullptr;
if ((data = mmap(0, stats.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) == MAP_FAILED) {
perror("mmap");
return 1;
}
// sequential pass
madvise(data, stats.st_size, MADV_SEQUENTIAL);
// represent each string as an offset and a length, to avoid copying we only save the address
char *begin, *end;
begin = end = (char *)data;
// indexing data
cat_rank current_cat = { .cat = nullptr, .rank = 0 };
unordered_map<const char *, unsigned int> category_totals;
unordered_map<const char *, vector<cat_rank> *, cp_hash, cp_equal> keyvals;
while ((end = (char *)memchr(end, '\n', stats.st_size - (end - (char *)data))) != nullptr) {
// replace the new line
*end = '\0';
// do the real work here
if (*begin == '/') {
if (current_cat.cat != nullptr) {
category_totals[current_cat.cat] = current_cat.rank;
}
current_cat = { .cat = begin, .rank = 0 };
} else {
// parse out the numbers
const char *name = (const char *)memchr(begin, ' ', end - begin) + 1;
if (keyvals.find(name) == keyvals.end()) {
keyvals[name] = new vector<cat_rank>();
}
keyvals[name]->push_back(current_cat);
++current_cat.rank;
}
// reset for next line
begin = ++end;
}
// random access
madvise(data, stats.st_size, MADV_RANDOM);
cout.sync_with_stdio(false);
// output results
for(auto keypair : keyvals) {
cout << keypair.first << "\n";
for (auto rank : *keypair.second) {
cout << "(" << rank.rank << "/" << category_totals[rank.cat] << ") " << rank.cat << "\n";
}
}
cout.flush();
// clean up
for (auto keypair : keyvals) {
delete keypair.second;
}
keyvals.clear();
// unmap
if (munmap(data, stats.st_size) == -1) {
perror("munmap");
return 1;
}
// close
if (close(fd) == -1) {
perror("close");
return 1;
}
return 0;
}