Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- *
- * Daniel Lemire, February 2012, http://lemire.me/
- *
- * Suppose that you want to copy an array over
- * another, except for some exceptions.
- * What is faster?
- *
- * g++ -Ofast bitmapslow bitmapslow.cpp
- *
- *
- * Published on blog.
- *
- * ------------------------------------------------------------------------------
- * This is an updated version of the original code (Feb 26 2012).
- * Martin Trenkmann, martin.trenkmann@googlemail.com
- *
- * $ uname -a
- * Linux 2.6.32-5-amd64 #1 SMP Mon Jan 9 20:49:59 UTC 2012 x86_64 GNU/Linux
- *
- * $ cat /proc/cpuinfo | grep name
- * model name : Intel(R) Core(TM)2 CPU 6300 @ 1.86GHz
- *
- * $ g++ --version
- * g++ (Debian 4.4.5-8) 4.4.5
- * ------------------------------------------------------------------------------
- */
- #include <iostream>
- #include <vector>
- #include <cassert>
- #include <cstdlib>
- #include <cstring>
- #include <iomanip>
- #include <algorithm>
- #include <sys/stat.h>
- #include <sys/time.h>
- #include <sys/types.h>
- using namespace std;
- typedef unsigned int uint;
- typedef unsigned char byte;
- // mtrenkmann
- size_t NUMBER_OF_UPDATES;
- // mtrenkmann
- inline void
- init_num_updates()
- {
- assert((NUMBER_OF_UPDATES = 0, true));
- }
- // mtrenkmann
- inline void
- increment_num_updates()
- {
- assert(++NUMBER_OF_UPDATES);
- }
- // mtrenkmann
- inline size_t
- num_updates()
- {
- return NUMBER_OF_UPDATES;
- }
- class ZTimer
- {
- public:
- struct timeval t1, t2;
- public:
- ZTimer() : t1(), t2() { gettimeofday(&t1,0); t2 = t1; }
- void reset() {gettimeofday(&t1,0); t2 = t1;}
- int elapsed() { return ((t2.tv_sec - t1.tv_sec) * 1000) + ((t2.tv_usec - t1.tv_usec) / 1000); }
- int split() { gettimeofday(&t2,0); return elapsed(); }
- };
- // mtrenkmann
- // Copies num elements of uint from src to dst.
- void
- memcopy(const vector<uint>& src, vector<uint>& dst)
- {
- memcpy(dst.data(), src.data(), src.size() * sizeof(uint));
- }
- /**
- * This copies the data from begin to end. We write to "out".
- * mtrenkmann: justcopy -> loopcopy
- */
- void
- loopcopy(
- const uint * const begin,
- const uint * const end,
- uint * out)
- {
- for(const uint * i = begin; i!=end; ++i) {
- *(out++) = *i;
- }
- /* we could use this instead of the previous loop, but it is no faster:
- memcpy ( out, begin, (end-begin)*sizeof(uint) );
- at least under gcc 4.6
- */
- }
- /**
- * This copies the data from begin to end, except at some locations
- * given by exceptionbitmap where it must use the values from
- * exceptionval. We write to "out".
- */
- void
- bitmapcopy(
- const uint * const begin,
- const uint * const end,
- const uint * exceptionval,
- const vector<uint> & exceptionbitmap,
- uint * out)
- {
- init_num_updates();
- vector<uint>::const_iterator j = exceptionbitmap.begin();
- for(const uint * i = begin; i<end; ++i) {
- for(uint k = 0; k<32; ++k, ++i)
- {
- *(out++) = ((*j) & (1U<<k))? (increment_num_updates(), *(exceptionval++)) : *i;
- }
- --i; // mtrenkmann: off by one error
- ++j;
- }
- }
- /**
- * This copies the data from begin to end, except at some locations
- * given by exceptionbitmap where it must use the values from
- * exceptionval. We write to "out".
- */
- void
- bitmapcopy8bits(
- const uint * const begin,
- const uint * const end,
- const uint * exceptionval,
- const vector<uint> & exceptionbitmap,
- uint * out)
- {
- init_num_updates();
- const byte * j = reinterpret_cast<const byte *>(&exceptionbitmap[0]);
- for(const uint * i = begin; i<end; ++i) {
- for(uint k = 0; k<8; ++k, ++i)
- {
- *(out++) = ((*j) & (1U<<k))? (increment_num_updates(), *(exceptionval++)) : *i;
- }
- --i; // mtrenkmann: off by one error
- ++j;
- }
- }
- /**
- * This copies the data from begin to end, except at some locations
- * given by exceptionpos where it must use the values from
- * exceptionval. We write to "out".
- */
- void
- patchedcopy(
- const uint * const begin,
- const uint * const end,
- const uint * exceptionval,
- const vector<uint> & exceptionpos,
- uint * out)
- {
- init_num_updates();
- uint * tmpout = out;
- for(const uint * i = begin; i!=end; ++i) {
- *(tmpout++) = *i;
- }
- for(vector<uint>::const_iterator pos = exceptionpos.begin(); pos!= exceptionpos.end(); ++pos) {
- *(out + *pos) = *(exceptionval++);
- increment_num_updates();
- }
- }
- /**
- * This copies the data from begin to end, except at some locations
- * given by exceptionpos where it must use the values from
- * exceptionval. We write to "out".
- *
- * Proposed by Luc Golden
- */
- void
- patchedcopy2(
- const uint * const begin,
- const uint * const end,
- const uint * exceptionval,
- const vector<uint> & exceptionpos,
- uint * out)
- {
- init_num_updates();
- vector<uint>::const_iterator pos = exceptionpos.begin();
- const uint * i = begin;
- for(; i!=end;++i) {
- if(i-begin == *pos) {
- increment_num_updates();
- *(out++) = *(exceptionval++);
- ++pos;
- if(pos==exceptionpos.end()) {
- // no more exceptions
- for(;i!=end;++i) {
- *(out++) = *i;
- }
- break;
- }
- } else
- *(out++) = *i;
- }
- }
- // mtrenkmann
- // Copies values from src vector to dst vector with updates.
- // Preconditions:
- // - !positions.empty()
- // - positions.size() <= replacements.size()
- // - is_sorted_in_ascending_order(positions)
- void
- updatingcopy(
- const vector<uint>& src,
- vector<uint>& dst,
- const vector<uint>& positions,
- const vector<uint>& replacements)
- {
- init_num_updates();
- vector<uint>::const_iterator pos = positions.begin();
- vector<uint>::const_iterator rep = replacements.begin();
- for (size_t i = 0; i != src.size(); ++i)
- {
- if (i == *pos)
- {
- increment_num_updates();
- dst[i] = *rep;
- ++pos;
- ++rep;
- }
- else
- {
- dst[i] = src[i];
- }
- }
- }
- // mtrenkmann
- // Copies values from src vector to dst vector with updates.
- // Position and replacement values are together for better locality.
- // Preconditions:
- // - !updates.empty()
- // - is_sorted_in_ascending_order(updates.first)
- void
- updatingcopy2(
- const vector<uint>& src,
- vector<uint>& dst,
- const vector<pair<uint, uint> >& updates)
- {
- init_num_updates();
- vector<pair<uint, uint> >::const_iterator uit = updates.begin();
- for (size_t i = 0; i != src.size(); ++i)
- {
- if (i == uit->first)
- {
- increment_num_updates();
- dst[i] = uit->second;
- ++uit;
- }
- else
- {
- dst[i] = src[i];
- }
- }
- }
- // mtrenkmann
- // Copies values from src vector to dst vector with updates.
- // To avoid branching the update vector becomes an outer loop.
- // Preconditions:
- // - is_sorted_in_ascending_order(updates.first)
- void
- updatingcopy3(
- const vector<uint>& src,
- vector<uint>& dst,
- const vector<pair<uint, uint> >& updates)
- {
- init_num_updates();
- size_t i = 0;
- for (vector<pair<uint, uint> >::const_iterator it = updates.begin(); it != updates.end(); ++it)
- {
- for (; i != it->first; ++i)
- {
- dst[i] = src[i];
- }
- increment_num_updates();
- dst[i] = it->second;
- ++i;
- }
- for (; i != src.size(); ++i)
- {
- dst[i] = src[i];
- }
- }
- // mtrenkmann
- // Generates a vector with pairs containing update information.
- // pair.first indicates the index to be updated.
- // pair.second stores the replacement value.
- const vector<pair<uint, uint> >
- generate_update_pairs(uint n, uint density)
- {
- vector<pair<uint, uint> > updates;
- for (uint i = 0; i < n; i += density)
- {
- updates.push_back(make_pair(i, rand()));
- }
- return updates;
- }
- vector<uint>
- generateArray(uint N)
- {
- vector<uint> ans(N);
- for(uint k = 0; k<N; ++k)
- ans[k] = rand();
- return ans;
- }
- pair<vector<uint>, vector<uint> >
- generateExceptLocations(uint N, uint density)
- {
- vector<uint> positions;
- vector<uint> bitmap(N/32);
- for(uint k = 0; k<N; k+= density) {
- positions.push_back(k);
- #ifdef FIXME
- bitmap[k/32] |= (1<<(k%32)); // mtrenkmann: correct
- #else
- bitmap[k/32] &= (1<<(k%32)); // mtrenkmann: incorrect
- #endif
- }
- return pair<vector<uint>, vector<uint> >(positions,bitmap);
- }
- // mtrenkmann
- template <typename ContainerT> void
- check_equal(const ContainerT& lhs, const ContainerT& rhs)
- {
- assert(lhs.size() == rhs.size());
- assert(std::equal(lhs.begin(), lhs.end(), rhs.begin()));
- }
- // mtrenkmann
- template <typename T> void
- print(const T& value, size_t width = 16)
- {
- cout << setw(width) << left << value;
- }
- // mtrenkmann
- template <typename T1, typename T2> void
- print(const T1& v1, size_t w1, const T2& v2, size_t w2)
- {
- cout << setw(w1) << right << v1 << ' ' << setw(w2) << left << v2;
- }
- int main()
- {
- const uint N = 1000000*32;//1024*1024*100;
- // uint density = 100; // unused
- const vector<uint> data = generateArray(N);
- vector<uint> out(N);
- ZTimer z;
- print("density(%)", 12);
- print("memcopy", 9);
- print("loopcopy", 10);
- print("bitmapcopy");
- print("bitmapcopy8bits");
- print("patchedcopy");
- print("patchedcopy2");
- print("updatingcopy");
- print("updatingcopy2");
- print("updatingcopy3");
- cout << endl;
- const vector<uint> exceptionvals = generateArray(N);// more than we need
- for(uint density = 1; density<50; density+=5) {
- const pair<vector<uint>, vector<uint> > exceptionpos = generateExceptLocations(N, density);
- // mtrenkmann
- const vector<pair<uint, uint> > updates = generate_update_pairs(N, density);
- print(100.0 / density, 12);
- z.reset();
- memcopy(data, out);
- print(z.split(), 9);
- check_equal(data, out);
- z.reset();
- loopcopy(&data[0], (&data[0]) + N, &out[0]);
- print(z.split(), 10);
- check_equal(data, out);
- z.reset();
- bitmapcopy(&data[0], (&data[0]) + N, &exceptionvals[0], exceptionpos.second, &out[0]);
- print(z.split(), 3, num_updates(), 12);
- // check_equal(data, out);
- z.reset();
- bitmapcopy8bits(&data[0], (&data[0]) + N, &exceptionvals[0], exceptionpos.second, &out[0]);
- print(z.split(), 3, num_updates(), 12);
- // check_equal(data, out);
- z.reset();
- patchedcopy(&data[0], (&data[0]) + N, &exceptionvals[0], exceptionpos.first, &out[0]);
- print(z.split(), 3, num_updates(), 12);
- z.reset();
- patchedcopy2(&data[0], (&data[0]) + N, &exceptionvals[0], exceptionpos.first, &out[0]);
- print(z.split(), 3, num_updates(), 12);
- z.reset();
- updatingcopy(data, out, exceptionpos.first, exceptionvals);
- print(z.split(), 3, num_updates(), 12);
- z.reset();
- updatingcopy2(data, out, updates);
- print(z.split(), 3, num_updates(), 12);
- z.reset();
- updatingcopy3(data, out, updates);
- print(z.split(), 3, num_updates(), 12);
- cout << endl;
- }
- return 0;
- }
- /*
- --------------------------------------------------------------------------------
- TEST RUN 1
- --------------------------------------------------------------------------------
- $ g++ -Wall -O3 -o bitmapslow2 bitmapslow2.cpp
- density(%) memcopy loopcopy bitmapcopy bitmapcopy8bits patchedcopy patchedcopy2 updatingcopy updatingcopy2 updatingcopy3
- 100 52 82 87 0 85 0 232 32000000 125 32000000 111 32000000 109 32000000 109 32000000
- 16.6667 53 82 89 0 86 0 152 5333334 99 5333334 95 5333334 94 5333334 93 5333334
- 9.09091 52 82 88 0 85 0 148 2909091 94 2909091 92 2909091 91 2909091 91 2909091
- 6.25 52 81 88 0 86 0 148 2000000 92 2000000 89 2000000 89 2000000 89 2000000
- 4.7619 52 82 88 0 86 0 135 1523810 91 1523810 90 1523810 89 1523810 88 1523810
- 3.84615 52 82 89 0 86 0 129 1230770 91 1230770 89 1230770 88 1230770 87 1230770
- 3.22581 52 82 88 0 85 0 116 1032259 90 1032259 88 1032259 88 1032259 88 1032259
- 2.77778 52 82 88 0 85 0 111 888889 90 888889 88 888889 89 888889 87 888889
- 2.43902 52 82 87 0 87 0 108 780488 90 780488 88 780488 88 780488 86 780488
- 2.17391 52 82 88 0 85 0 104 695653 94 695653 89 695653 87 695653 86 695653
- -> memcopy is indeed faster than loopcopy
- -> the bitmapcopy versions do not update anything at all (bug)
- -> the updatingcopy versions beat the patchedcopy versions
- --------------------------------------------------------------------------------
- TEST RUN 2
- --------------------------------------------------------------------------------
- $ g++ -Wall -O3 -DFIXME -o bitmapslow2 bitmapslow2.cpp
- density(%) memcopy loopcopy bitmapcopy bitmapcopy8bits patchedcopy patchedcopy2 updatingcopy updatingcopy2 updatingcopy3
- 100 52 82 102 32000000 91 32000000 232 32000000 125 32000000 111 32000000 109 32000000 109 32000000
- 16.6667 52 82 93 5333334 90 5333334 153 5333334 99 5333334 95 5333334 94 5333334 93 5333334
- 9.09091 52 83 91 2909091 88 2909091 148 2909091 95 2909091 91 2909091 91 2909091 91 2909091
- 6.25 52 82 90 2000000 88 2000000 147 2000000 92 2000000 90 2000000 89 2000000 89 2000000
- 4.7619 52 83 90 1523810 87 1523810 135 1523810 93 1523810 89 1523810 89 1523810 88 1523810
- 3.84615 52 82 89 1230770 87 1230770 129 1230770 90 1230770 89 1230770 88 1230770 87 1230770
- 3.22581 52 82 89 1032259 86 1032259 117 1032259 90 1032259 88 1032259 90 1032259 87 1032259
- 2.77778 52 82 89 888889 88 888889 110 888889 90 888889 89 888889 88 888889 87 888889
- 2.43902 52 82 89 780488 86 780488 107 780488 90 780488 88 780488 88 780488 86 780488
- 2.17391 52 83 89 695653 86 695653 105 695653 90 695653 89 695653 88 695653 86 695653
- -> -DFIXME: now the bitmapcopy versions perform updates and are quite fast for high density
- -> the updatingcopy and bitmapcopy version have equal performance for low density
- -> raw pointer arithmetic does not perform better than operator[]
- -> (iterators were tested as well with same performance results)
- --------------------------------------------------------------------------------
- TEST RUN 3
- --------------------------------------------------------------------------------
- $ g++ -Wall -O3 -DFIXME -DNDEBUG -o bitmapslow2 bitmapslow2.cpp
- density(%) memcopy loopcopy bitmapcopy bitmapcopy8bits patchedcopy patchedcopy2 updatingcopy updatingcopy2 updatingcopy3
- 100 52 83 87 0 85 0 233 0 117 0 111 0 109 0 109 0
- 16.6667 53 82 94 0 89 0 152 0 98 0 94 0 95 0 93 0
- 9.09091 52 82 93 0 88 0 148 0 93 0 91 0 92 0 90 0
- 6.25 53 82 89 0 87 0 147 0 92 0 90 0 89 0 90 0
- 4.7619 52 83 93 0 87 0 135 0 91 0 89 0 88 0 89 0
- 3.84615 52 82 92 0 90 0 128 0 90 0 89 0 89 0 88 0
- 3.22581 52 83 90 0 86 0 118 0 90 0 88 0 88 0 88 0
- 2.77778 52 82 91 0 86 0 110 0 90 0 88 0 88 0 87 0
- 2.43902 52 82 91 0 86 0 109 0 89 0 88 0 88 0 87 0
- 2.17391 52 82 91 0 86 0 105 0 89 0 89 0 87 0 87 0
- -> -DNDEBUG: the update operation counter is disabled to get real performance, although it makes no difference
- -> the bitmapcopy versions are not to beat for high density, but the approach is more error-prone
- --------------------------------------------------------------------------------
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement