/**
* bitpacking.cpp
*
* Original 6199 line source by Daniel Lemire, http://lemire.me/blog/
* Question: if you pack and unpack bits, is it much faster if you
* pack into 8 or 16 bits than, say, 31 or 7 bits?
*
* Modified 354 line source by willcodeforfood@reddit
* g++ -DMAIN -DBENCH -O3 bitpacking.cpp -o bench
* g++ -DMAIN -DSPECS -O1 bitpacking.cpp -o specs
* g++ -O3 bitpacking.cpp -c -o bitpack.o
*/
extern "C" {
# include <sys/time.h>
}
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <vector>
#ifdef __USE_GNU
# define RESTRICT __restrict__
# define NOINLINE __attribute__ ((noinline))
#else
# define RESTRICT
# define NOINLINE
#endif
#define PARAMS cuint * RESTRICT i, uint * RESTRICT o
#define FOREACH_1to31(F) \
F (1) F (2) F (3) F (4) F (5) F (6) F (7) F (8) \
F (9) F (10) F (11) F (12) F (13) F (14) F (15) F (16) \
F (17) F (18) F (19) F (20) F (21) F (22) F (23) F (24) \
F (25) F (26) F (27) F (28) F (29) F (30) F (31)
#define FOREACH_1to32(F) FOREACH_1to31(F) F (32)
#define SUB(a, b) ((a) - (b))
#define LSH(a, b) ((a) << (b))
#define RSH(a, b) ((a) >> (b))
#define MASK(x, k) ((x) % (1U << (k)))
#define die(why) do { cerr << why << endl; assert(0); \
exit(EXIT_FAILURE); } while(0)
using namespace std;
typedef const bool cbool;
typedef const uint cuint;
typedef vector<uint> vuint;
typedef const vuint cvuint;
// P = pack, U = unpack, S = spec
// defines 96 functions: {p,u}{1..32}(PARAMS) and s{1..32}()
namespace metaprog {
#define TPAR cuint * RESTRICT &i, uint * RESTRICT &o
#define TCALL(f) (f (i, o))
#define IGNORE(x) ((void)x)
// in this context, dirty means that some bits weren't processed.
// [PU][CD][CD] = (pack|unpack) start:(clean|dirty) end:(clean|dirty)
// [PU]Dx = called when dirty at the start; handles remainders
// [PU]Rx = adds the remainder bits to the output
// these functional macros aren't fully general; some aren't expr-safe,
// and some evaluate their params more than once.
#define PCD(n, k) do { (*o) |= LSH (MASK ((*i), n), k); } while (0)
#define UCD(n, k) do { (*o) = MASK (RSH ((*i), k), n); } while (0)
#define PCC(n, k) do { PCD(n, k); (++i); } while (0)
#define UCC(n, k) do { UCD(n, k); (++o); } while (0)
#define PRx(n, k) do { (*o) |= RSH (MASK ((*i), n), SUB (n, k)); } while (0)
#define URx(n, k) do { (*o) |= LSH (MASK ((*i), k), SUB (n, k)); } while (0)
#define PDx(n, k) do { (++o); PRx(n, k); (++i); } while (0)
#define UDx(n, k) do { (++i); URx(n, k); (++o); } while (0)
#define PDD(n, k) do { PDx(n, k); PCD(n, k); } while (0)
#define UDD(n, k) do { UDx(n, k); UCD(n, k); } while (0)
#define PDC(n, k) do { PDD(n, k); (++i); } while (0)
#define UDC(n, k) do { UDD(n, k); (++o); } while (0)
// before and after refer to dirty state relative to X
template <cuint n, cuint b, cbool before, cbool after>
struct X {
static inline void P (TPAR);
static inline void U (TPAR);
};
#define SPECIALIZE_X(x, y, F) \
template <cuint n, cuint b> \
struct X <n, b, (x), (y)> { \
static inline void P (TPAR) { P##F (n, b); } \
static inline void U (TPAR) { U##F (n, b); } \
static inline void S () { cout << " " << #F << " (" << n \
<< ", " << b << ");" << endl; } \
}
// real execution work hides here
SPECIALIZE_X(true, true, CC);
SPECIALIZE_X(true, false, CD);
SPECIALIZE_X(false, true, DC);
SPECIALIZE_X(false, false, DD);
static inline void nop(TPAR) {
IGNORE(i);
IGNORE(o);
}
template <cbool end>
struct E {
static inline void P(TPAR) {
TCALL(nop);
}
static inline void U(TPAR) {
TCALL(nop);
}
static inline void S() { }
};
// end states advance the input or output
template <>
struct E<true> {
static inline void P(TPAR) {
IGNORE(i);
++o;
}
static inline void U(TPAR) {
IGNORE(o);
++i;
}
static inline void S() {
cout << " EE ();" << endl;
}
};
template <cuint n, cuint prev, cuint current, cuint next>
struct W {
// WX = execute single step
// WE = end at alignment 0 (mod 32)
#define WX X<n, current, before, after>
#define WE E<end>
// dirty state before and after the W routine
static cbool before = (current == 0 || prev < current);
static cbool end = (next == 0);
static cbool after = (end || current < next);
static inline void P (TPAR) {
TCALL(WX::P);
TCALL(WE::P);
}
static inline void U (TPAR) {
TCALL(WX::U);
TCALL(WE::U);
}
static inline void S () {
WX::S();
WE::S();
}
};
// recursive metaprogram; assumes 0th step is specialized
template <cuint n, cuint step>
struct R {
// RR = recursion
// RW = does the work for this step
#define RR R<n, (step - 1)>
#define RW W<n, prev, current, next>
static cuint prev = ((step - 1) * n) % 32U;
static cuint current = (step * n) % 32U;
static cuint next = ((step + 1) * n) % 32U;
static inline void P (TPAR) {
TCALL(RR::P);
TCALL(RW::P);
}
static inline void U (TPAR) {
TCALL(RR::U);
TCALL(RW::U);
}
static inline void S () {
RR::S ();
RW::S ();
}
};
// each bit size recurses from 31 to 0
#define FR(n) R<n, 31>
#define SPECIALIZE_R(n) \
void p##n (PARAMS) { TCALL(FR(n)::P); } \
void u##n (PARAMS) { TCALL(FR(n)::U); } \
void s##n () { FR(n)::S(); } \
template<> struct R <(n), 0> { \
static inline void P (TPAR) { PCC (n, 0); } \
static inline void U (TPAR) { UCC (n, 0); } \
static inline void S () { \
cout << " CC (" << n << ", 0);" << endl; } };
// primarily defines {p,u}{1..31}
FOREACH_1to31(SPECIALIZE_R)
// {p,u}32 calls slowcpy
static void slowcpy (PARAMS) {
for (int j = 0; j < 32; ++j)
*o++ = *i++;
}
void p32(PARAMS) {
TCALL(slowcpy);
}
void u32(PARAMS) {
TCALL(slowcpy);
}
void s32() {
for (int j = 0; j < 32; ++j)
cout << " *o++ = *i++;" << endl;
}
}
namespace switchfns {
// pack and unpack switch functions call the recursive metaprograms
#define CASE_P(n) case (n): do { TCALL(metaprog::p##n); return; } while(0);
#define CASE_U(n) case (n): do { TCALL(metaprog::u##n); return; } while(0);
#define CASE_S(n) case (n): do { metaprog::s##n(); return; } while (0);
#define DEF_SWITCH(fname, cases) \
void fname (PARAMS, cuint bit) { \
switch (bit) { \
FOREACH_1to32 (cases); \
default: die("invalid case"); } }
DEF_SWITCH(pack, CASE_P)
DEF_SWITCH(unpack, CASE_U)
void spec (cuint bit) {
switch (bit) {
FOREACH_1to32 (CASE_S);
default:
die("invalid case");
}
}
}
#ifdef BENCH
namespace bench {
struct ZTimer {
struct timeval t1, t2;
ZTimer() : t1(), t2() {
reset();
}
int elapsed () {
return ((t2.tv_sec - t1.tv_sec) * 1000) +
((t2.tv_usec - t1.tv_usec) / 1000);
}
void reset () {
gettimeofday (&t1, 0);
t2 = t1;
}
int split () {
gettimeofday (&t2, 0);
return elapsed ();
}
};
NOINLINE void fast_pack (cvuint & i, vuint & o, cuint bit) {
cuint N = i.size () / 32;
for (uint k = 0; k < N; ++k)
switchfns::pack (&i[0] + 32 * k,
&o[0] + (32 * bit) * k / 32, bit);
}
NOINLINE void fast_unpack (cvuint & i, vuint & o, cuint bit) {
cuint N = o.size () / 32;
for (uint k = 0; k < N; ++k)
switchfns::unpack (&i[0] + (32 * bit) * k / 32,
&o[0] + 32 * k, bit);
}
bool validate (cvuint & i, cvuint & o, cuint bit) {
if (bit == 32)
return i == o;
cuint N = i.size ();
for (uint k = 0; k < N; ++k)
if (MASK(i[k], bit) != MASK(o[k], bit)) {
cerr << " Error at k = " << k << " i[k]= " << i[k]
<< " o[k]=" << o[k] << endl;
return false;
}
return true;
}
vuint generateArray (cuint N) {
vuint ans (N);
for (uint k = 0; k < N; ++k)
ans[k] = rand ();
return ans;
}
void simplebenchmark (cuint N = 1U << 26, cuint T = 5) {
ZTimer z;
cvuint data = generateArray (N);
vuint compressed (N, 0), recovered (N, 0);
cout << "bits" << "\t" << "pack" << "\t" << "unpack" << endl;
for (uint bit = 1; bit <= 32; ++bit) {
double packtime = 0, unpacktime = 0;
for (uint t = 0; t < T; ++t) {
compressed.clear ();
compressed.resize (N * bit / 32, 0);
recovered.clear ();
recovered.resize (N, 0);
z.reset ();
fast_pack (data, compressed, bit);
packtime += z.split ();
z.reset ();
fast_unpack (compressed, recovered, bit);
unpacktime += z.split ();
if (!validate (data, recovered, bit))
die("failed validation");
}
cout << bit << "\t" << packtime << "\t" << unpacktime << endl;
}
}
}
#endif
#ifdef SPECS
namespace specs {
void print() {
for (uint bit = 1; bit<= 32; ++bit) {
cout << "SPEC(" << bit << ") {" << endl;
switchfns::spec(bit);
cout << "}" << endl;
}
}
}
#endif
#ifdef MAIN
int main () {
#ifdef SPECS
specs::print();
#endif
#ifdef BENCH
bench::simplebenchmark ();
#endif
return 0;
}
#endif