Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <algorithm>
- #include <cstdint>
- #include <cstdio>
- #include <xmmintrin.h>
- typedef int32_t v4 __attribute__ ((vector_size(sizeof(int32_t) * 4)));
- inline bool v4equal(v4 a, v4 b)
- {
- __m128i v1 = _mm_load_si128((__m128i*)&a);
- __m128i v2 = _mm_load_si128((__m128i*)&b);
- __m128i vcmp = _mm_cmpeq_epi32(v1, v2);
- uint16_t mask = _mm_movemask_epi8(vcmp);
- return mask == 0xffff;
- }
- inline bool v4anyGreater(v4 a, v4 b)
- {
- __m128i v1 = _mm_load_si128((__m128i*)&a);
- __m128i v2 = _mm_load_si128((__m128i*)&b);
- __m128i vcmp = _mm_cmpgt_epi32(v1, v2);
- uint16_t mask = _mm_movemask_epi8(vcmp);
- return mask != 0;
- }
- class Sujiko
- {
- private:
- uint16_t _usedSet;
- uint16_t _usedCells;
- uint8_t _cells[9];
- v4 _sums;
- uint8_t _diag1, _diag2;
- v4 _clues;
- inline bool filled();
- inline bool used(uint8_t v);
- inline void unset(size_t cell, uint8_t value);
- inline bool cantBeValid();
- inline bool valid();
- public:
- Sujiko(uint8_t clue1, uint8_t clue2, uint8_t clue3, uint8_t clue4)
- : _usedSet(0),
- _usedCells(0),
- _cells {0},
- _sums {0,0,0,0},
- _diag1(clue1 + clue3),
- _diag2(clue2 + clue4),
- _clues { clue4, clue3, clue2, clue1 }
- {}
- inline void set(size_t cell, uint8_t value);
- bool solve();
- void print();
- };
- static const v4 multipliers[] = {
- { 0, 0, 0, 1 },
- { 0, 0, 1, 1 },
- { 0, 0, 1, 0 },
- { 0, 1, 0, 1 },
- { 1, 1, 1, 1 },
- { 1, 0, 1, 0 },
- { 0, 1, 0, 0 },
- { 1, 1, 0, 0 },
- { 1, 0, 0, 0 },
- };
- void Sujiko::set(size_t cell, uint8_t value)
- {
- _sums += multipliers[cell] * value;
- _cells[cell] = value;
- _usedSet |= 1 << value;
- _usedCells |= 1 << cell;
- }
- void Sujiko::unset(size_t cell, uint8_t value)
- {
- _usedSet ^= 1 << value;
- _usedCells ^= 1 << cell;
- _cells[cell] = 0;
- _sums -= multipliers[cell] * value;
- }
- bool Sujiko::filled()
- {
- return _usedSet == 0b1111111110;
- }
- bool Sujiko::valid()
- {
- return filled() && v4equal(_sums, _clues);
- }
- bool Sujiko::cantBeValid()
- {
- return
- // These identities are from Wikipedia and allow us to bail earlier
- (_cells[0] + _cells[8]) + _diag2 > 45 + _cells[4] ||
- (_cells[2] + _cells[6]) + _diag1 > 45 + _cells[4] ||
- // General properties
- v4anyGreater(_sums, _clues);
- }
- bool Sujiko::used(uint8_t v)
- {
- return (_usedSet & (1 << v)) != 0;
- }
- // Precondition: not filled
- bool Sujiko::solve()
- {
- int firstUnusedCell = 4;
- // This loop gets unrolled, experiments with running from a higher known
- // index didn't make it faster. This static order works from most
- // constrained to least constrained.
- static const size_t constrainedOrder[] = { 4, 1, 5, 7, 3, 0, 2, 6, 8 };
- for (int i = 0; i < 9; i++) {
- // The compiler inlines the test values from above
- if ((_usedCells & (1 << constrainedOrder[i])) == 0) {
- firstUnusedCell = constrainedOrder[i];
- break;
- }
- }
- v4 diff = _clues - _sums;
- if (firstUnusedCell == 0 || firstUnusedCell == 2 || firstUnusedCell == 6 || firstUnusedCell == 8) {
- uint16_t toUse = (((1 << diff[0]) | (1 << diff[1])) | ((1 << diff[2]) | (1 << diff[3]))) & 0b1111111110;
- if ((_usedSet ^ toUse) != 0b1111111110) {
- // Means we would need to duplicate a value
- return false;
- } else {
- if ((_usedCells & (1 << 0)) == 0)
- set(0, diff[3]);
- if ((_usedCells & (1 << 2)) == 0)
- set(2, diff[2]);
- if ((_usedCells & (1 << 6)) == 0)
- set(6, diff[1]);
- if ((_usedCells & (1 << 8)) == 0)
- set(8, diff[0]);
- return true;
- }
- }
- uint8_t highestUnused = 63 - __builtin_clzl(~_usedSet & 0b1111111111);
- const uint16_t originalSet = _usedSet;
- for (uint8_t testValue = highestUnused; testValue != 0; --testValue) {
- // Testing against the original set is a bit faster because there is no
- // longer a data dependency
- if ((originalSet & (1 << testValue)) == 0) {
- set(firstUnusedCell, testValue);
- if (!cantBeValid()) {
- // This is the same as not filled
- if (firstUnusedCell != 8) {
- if (solve()) {
- return true;
- }
- }
- if (valid()) {
- return true;
- }
- }
- unset(firstUnusedCell, testValue);
- }
- }
- return false;
- }
- void Sujiko::print()
- {
- printf("%d,%d,%d,%d\n%d%d%d\n%d%d%d\n%d%d%d\n",
- _clues[3], _clues[2], _clues[1], _clues[0],
- _cells[0], _cells[1], _cells[2],
- _cells[3], _cells[4], _cells[5],
- _cells[6], _cells[7], _cells[8]);
- }
- static __inline__ unsigned long long rdtsc(void)
- {
- unsigned hi, lo;
- __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
- return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
- }
- static const int ATTEMPTS = 300000;
- int main(int argc, char* argv[])
- {
- Sujiko board(22, 21, 24, 24);
- board.set(1, 4);
- board.set(2, 3);
- board.solve();
- unsigned long long start = rdtsc();
- for (int i = 0; i < ATTEMPTS; i++) {
- Sujiko board(22, 21, 24, 24);
- board.set(1, 4);
- board.set(2, 3);
- board.solve();
- }
- unsigned long long end = rdtsc();
- unsigned long long cycles = end - start;
- board.print();
- printf("Average time is %f cycles, %f us\n", (double)cycles / ATTEMPTS, (double)cycles / 2.8e9 * 1e6 / (double)ATTEMPTS);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement