/** * bitpacking.cpp * * Original 6199 line source by Daniel Lemire, http://lemire.me/blog/ * Question: if you pack and unpack bits, is it much faster if you * pack into 8 or 16 bits than, say, 31 or 7 bits? * * Modified 354 line source by willcodeforfood@reddit * g++ -DMAIN -DBENCH -O3 bitpacking.cpp -o bench * g++ -DMAIN -DSPECS -O1 bitpacking.cpp -o specs * g++ -O3 bitpacking.cpp -c -o bitpack.o */ extern "C" { # include } #include #include #include #include #ifdef __USE_GNU # define RESTRICT __restrict__ # define NOINLINE __attribute__ ((noinline)) #else # define RESTRICT # define NOINLINE #endif #define PARAMS cuint * RESTRICT i, uint * RESTRICT o #define FOREACH_1to31(F) \ F (1) F (2) F (3) F (4) F (5) F (6) F (7) F (8) \ F (9) F (10) F (11) F (12) F (13) F (14) F (15) F (16) \ F (17) F (18) F (19) F (20) F (21) F (22) F (23) F (24) \ F (25) F (26) F (27) F (28) F (29) F (30) F (31) #define FOREACH_1to32(F) FOREACH_1to31(F) F (32) #define SUB(a, b) ((a) - (b)) #define LSH(a, b) ((a) << (b)) #define RSH(a, b) ((a) >> (b)) #define MASK(x, k) ((x) % (1U << (k))) #define die(why) do { cerr << why << endl; assert(0); \ exit(EXIT_FAILURE); } while(0) using namespace std; typedef const bool cbool; typedef const uint cuint; typedef vector vuint; typedef const vuint cvuint; // P = pack, U = unpack, S = spec // defines 96 functions: {p,u}{1..32}(PARAMS) and s{1..32}() namespace metaprog { #define TPAR cuint * RESTRICT &i, uint * RESTRICT &o #define TCALL(f) (f (i, o)) #define IGNORE(x) ((void)x) // in this context, dirty means that some bits weren't processed. // [PU][CD][CD] = (pack|unpack) start:(clean|dirty) end:(clean|dirty) // [PU]Dx = called when dirty at the start; handles remainders // [PU]Rx = adds the remainder bits to the output // these functional macros aren't fully general; some aren't expr-safe, // and some evaluate their params more than once. #define PCD(n, k) do { (*o) |= LSH (MASK ((*i), n), k); } while (0) #define UCD(n, k) do { (*o) = MASK (RSH ((*i), k), n); } while (0) #define PCC(n, k) do { PCD(n, k); (++i); } while (0) #define UCC(n, k) do { UCD(n, k); (++o); } while (0) #define PRx(n, k) do { (*o) |= RSH (MASK ((*i), n), SUB (n, k)); } while (0) #define URx(n, k) do { (*o) |= LSH (MASK ((*i), k), SUB (n, k)); } while (0) #define PDx(n, k) do { (++o); PRx(n, k); (++i); } while (0) #define UDx(n, k) do { (++i); URx(n, k); (++o); } while (0) #define PDD(n, k) do { PDx(n, k); PCD(n, k); } while (0) #define UDD(n, k) do { UDx(n, k); UCD(n, k); } while (0) #define PDC(n, k) do { PDD(n, k); (++i); } while (0) #define UDC(n, k) do { UDD(n, k); (++o); } while (0) // before and after refer to dirty state relative to X template struct X { static inline void P (TPAR); static inline void U (TPAR); }; #define SPECIALIZE_X(x, y, F) \ template \ struct X { \ static inline void P (TPAR) { P##F (n, b); } \ static inline void U (TPAR) { U##F (n, b); } \ static inline void S () { cout << " " << #F << " (" << n \ << ", " << b << ");" << endl; } \ } // real execution work hides here SPECIALIZE_X(true, true, CC); SPECIALIZE_X(true, false, CD); SPECIALIZE_X(false, true, DC); SPECIALIZE_X(false, false, DD); static inline void nop(TPAR) { IGNORE(i); IGNORE(o); } template struct E { static inline void P(TPAR) { TCALL(nop); } static inline void U(TPAR) { TCALL(nop); } static inline void S() { } }; // end states advance the input or output template <> struct E { static inline void P(TPAR) { IGNORE(i); ++o; } static inline void U(TPAR) { IGNORE(o); ++i; } static inline void S() { cout << " EE ();" << endl; } }; template struct W { // WX = execute single step // WE = end at alignment 0 (mod 32) #define WX X #define WE E // dirty state before and after the W routine static cbool before = (current == 0 || prev < current); static cbool end = (next == 0); static cbool after = (end || current < next); static inline void P (TPAR) { TCALL(WX::P); TCALL(WE::P); } static inline void U (TPAR) { TCALL(WX::U); TCALL(WE::U); } static inline void S () { WX::S(); WE::S(); } }; // recursive metaprogram; assumes 0th step is specialized template struct R { // RR = recursion // RW = does the work for this step #define RR R #define RW W static cuint prev = ((step - 1) * n) % 32U; static cuint current = (step * n) % 32U; static cuint next = ((step + 1) * n) % 32U; static inline void P (TPAR) { TCALL(RR::P); TCALL(RW::P); } static inline void U (TPAR) { TCALL(RR::U); TCALL(RW::U); } static inline void S () { RR::S (); RW::S (); } }; // each bit size recurses from 31 to 0 #define FR(n) R #define SPECIALIZE_R(n) \ void p##n (PARAMS) { TCALL(FR(n)::P); } \ void u##n (PARAMS) { TCALL(FR(n)::U); } \ void s##n () { FR(n)::S(); } \ template<> struct R <(n), 0> { \ static inline void P (TPAR) { PCC (n, 0); } \ static inline void U (TPAR) { UCC (n, 0); } \ static inline void S () { \ cout << " CC (" << n << ", 0);" << endl; } }; // primarily defines {p,u}{1..31} FOREACH_1to31(SPECIALIZE_R) // {p,u}32 calls slowcpy static void slowcpy (PARAMS) { for (int j = 0; j < 32; ++j) *o++ = *i++; } void p32(PARAMS) { TCALL(slowcpy); } void u32(PARAMS) { TCALL(slowcpy); } void s32() { for (int j = 0; j < 32; ++j) cout << " *o++ = *i++;" << endl; } } namespace switchfns { // pack and unpack switch functions call the recursive metaprograms #define CASE_P(n) case (n): do { TCALL(metaprog::p##n); return; } while(0); #define CASE_U(n) case (n): do { TCALL(metaprog::u##n); return; } while(0); #define CASE_S(n) case (n): do { metaprog::s##n(); return; } while (0); #define DEF_SWITCH(fname, cases) \ void fname (PARAMS, cuint bit) { \ switch (bit) { \ FOREACH_1to32 (cases); \ default: die("invalid case"); } } DEF_SWITCH(pack, CASE_P) DEF_SWITCH(unpack, CASE_U) void spec (cuint bit) { switch (bit) { FOREACH_1to32 (CASE_S); default: die("invalid case"); } } } #ifdef BENCH namespace bench { struct ZTimer { struct timeval t1, t2; ZTimer() : t1(), t2() { reset(); } int elapsed () { return ((t2.tv_sec - t1.tv_sec) * 1000) + ((t2.tv_usec - t1.tv_usec) / 1000); } void reset () { gettimeofday (&t1, 0); t2 = t1; } int split () { gettimeofday (&t2, 0); return elapsed (); } }; NOINLINE void fast_pack (cvuint & i, vuint & o, cuint bit) { cuint N = i.size () / 32; for (uint k = 0; k < N; ++k) switchfns::pack (&i[0] + 32 * k, &o[0] + (32 * bit) * k / 32, bit); } NOINLINE void fast_unpack (cvuint & i, vuint & o, cuint bit) { cuint N = o.size () / 32; for (uint k = 0; k < N; ++k) switchfns::unpack (&i[0] + (32 * bit) * k / 32, &o[0] + 32 * k, bit); } bool validate (cvuint & i, cvuint & o, cuint bit) { if (bit == 32) return i == o; cuint N = i.size (); for (uint k = 0; k < N; ++k) if (MASK(i[k], bit) != MASK(o[k], bit)) { cerr << " Error at k = " << k << " i[k]= " << i[k] << " o[k]=" << o[k] << endl; return false; } return true; } vuint generateArray (cuint N) { vuint ans (N); for (uint k = 0; k < N; ++k) ans[k] = rand (); return ans; } void simplebenchmark (cuint N = 1U << 26, cuint T = 5) { ZTimer z; cvuint data = generateArray (N); vuint compressed (N, 0), recovered (N, 0); cout << "bits" << "\t" << "pack" << "\t" << "unpack" << endl; for (uint bit = 1; bit <= 32; ++bit) { double packtime = 0, unpacktime = 0; for (uint t = 0; t < T; ++t) { compressed.clear (); compressed.resize (N * bit / 32, 0); recovered.clear (); recovered.resize (N, 0); z.reset (); fast_pack (data, compressed, bit); packtime += z.split (); z.reset (); fast_unpack (compressed, recovered, bit); unpacktime += z.split (); if (!validate (data, recovered, bit)) die("failed validation"); } cout << bit << "\t" << packtime << "\t" << unpacktime << endl; } } } #endif #ifdef SPECS namespace specs { void print() { for (uint bit = 1; bit<= 32; ++bit) { cout << "SPEC(" << bit << ") {" << endl; switchfns::spec(bit); cout << "}" << endl; } } } #endif #ifdef MAIN int main () { #ifdef SPECS specs::print(); #endif #ifdef BENCH bench::simplebenchmark (); #endif return 0; } #endif