Advertisement
Guest User

Game of Life bitsliced

a guest
Nov 2nd, 2014
301
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 6.71 KB | None | 0 0
  1. // Compile with gcc -msse2 -O3
  2.  
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <emmintrin.h>
  6.  
  7. // Width/Height of one tile, in cells
  8. #define WIDTH 2048
  9. #define HEIGHT 2048
  10.  
  11. // Width/Height of board, in tiles
  12. #define TILES_WIDE 16
  13. #define TILES_TALL 8
  14.  
  15. // Width/Height of board, in cells
  16. #define CELLS_WIDE (WIDTH*TILES_WIDE)
  17. #define CELLS_TALL (HEIGHT*TILES_TALL)
  18.  
  19. // Fast bit-sliced implementation
  20.  
  21. static __m128i ComputeCell(
  22.     const __m128i* row0,
  23.     const __m128i* row1,
  24.     const __m128i* row2)
  25. {
  26.   /*
  27.    * Cell numbering:
  28.    *   row0  0 1 2
  29.    *   row1  3   4
  30.    *   row2  5 6 7
  31.    * Intermediate variable naming pattern:
  32.    *   s470 is bit 0 of the sum of cells 4 through 7.
  33.    *   c470 is the carry out of the same sum.
  34.    * Optimizations:
  35.    *   Sums of two cells can be 0, 1, or 2, but never 3, which allows
  36.    *   us to omit some carry logic when computing s032 and s472.
  37.    *   Also, sums >= 4 are treated the same later, so a sum that should
  38.    *   be 8 is represented as 4 instead to fit the result in 3 bits.
  39.    */
  40.   __m128i s010 = row0[0] ^ row0[1];
  41.   __m128i s011 = row0[0] & row0[1];
  42.   __m128i s230 = row0[2] ^ row1[0];
  43.   __m128i s231 = row0[2] & row1[0];
  44.  
  45.   __m128i s030 = s010 ^ s230;
  46.   __m128i c030 = s010 & s230;
  47.   __m128i s031 = c030 ^ s011 ^ s231;
  48.   __m128i s032 = s011 & s231;
  49.  
  50.   __m128i s450 = row1[2] ^ row2[0];
  51.   __m128i s451 = row1[2] & row2[0];
  52.   __m128i s670 = row2[1] ^ row2[2];
  53.   __m128i s671 = row2[1] & row2[2];
  54.  
  55.   __m128i s470 = s450 ^ s670;
  56.   __m128i c470 = s450 & s670;
  57.   __m128i s471 = c470 ^ s451 ^ s671;
  58.   __m128i s472 = s451 & s671;
  59.  
  60.   __m128i s070 = s030 ^ s470;
  61.   __m128i c070 = s030 & s470;
  62.   __m128i s071i = s031 ^ s471;  // Intermediate values
  63.   __m128i c071i = s031 & s471;  // to simplify carry calculation.
  64.   __m128i s071 = s071i ^ c070;
  65.   __m128i c071 = c071i | (s071i & c070);
  66.   __m128i s072 = s032 | s472 | c071;
  67.  
  68.   // Cell is live if:
  69.   // (sum == 3) || ((sum == 2) && cell), which is the same as:
  70.   // (!sum[2]) && ((sum[1] && sum[0]) || (sum[1] && cell)),
  71.   // which can be further simplified to:
  72.   // (!sum[2]) && sum[1] && (sum[0] || cell)
  73.   return _mm_andnot_si128(s072, s071 & (s070 | row1[1]));
  74. }
  75.  
  76. static void ComputeCells(const __m128i* oldGeneration, __m128i* newGeneration) {
  77.   int row;
  78.   const __m128i* pold;
  79.   __m128i* pnew;
  80.   __m128i* pend;
  81.   for (row = 1; row < HEIGHT-1; ++row) {
  82.     pnew = newGeneration + row*WIDTH + 1;
  83.     pold = oldGeneration + row*WIDTH;
  84.     for(pend = pnew + WIDTH - 2; pnew != pend; ++pold, ++pnew) {
  85.       *pnew = ComputeCell(pold - WIDTH, pold, pold + WIDTH);
  86.     }
  87.   }
  88. }
  89.  
  90. // Slow accessors
  91.  
  92. static __m128i* GetPointerFor(__m128i* buffer, int row, int col) {
  93.   int row_index = row % HEIGHT;
  94.   int col_index = col % WIDTH;
  95.   return &(buffer[row_index * WIDTH + col_index]);
  96. }
  97.  
  98. static int GetBitOffsetFor(int row, int col) {
  99.   return (row / HEIGHT) * TILES_WIDE + (col / WIDTH);
  100. }
  101.  
  102. static char* GetBytePointerFor(__m128i* buffer, int row, int col) {
  103.   char* base = (char*)GetPointerFor(buffer, row, col);
  104.   return base + GetBitOffsetFor(row, col) / 8;
  105. }
  106.  
  107. static char GetByteMaskFor(int row, int col) {
  108.   return 1 << (GetBitOffsetFor(row, col) % 8);
  109. }
  110.  
  111. static char GetValue(__m128i* buffer, int row, int col) {
  112.   char val = GetByteMaskFor(row, col) & *GetBytePointerFor(buffer, row, col);
  113.   return val ? 1 : 0;
  114. }
  115.  
  116. static void SetValue(__m128i* buffer, int row, int col, char val) {
  117.   char mask = GetByteMaskFor(row, col);
  118.   char* ptr = GetBytePointerFor(buffer, row, col);
  119.   val = val ? 0 : mask;
  120.   *ptr = val ^ (mask | *ptr);
  121. }
  122.  
  123. // Border logic
  124.  
  125. static void UpdateOne(__m128i* oldGeneration, __m128i* newGeneration, int row, int col) {
  126.   int sum =
  127.       GetValue(oldGeneration, row-1, col-1) +
  128.       GetValue(oldGeneration, row-1, col  ) +
  129.       GetValue(oldGeneration, row-1, col+1) +
  130.       GetValue(oldGeneration, row  , col-1) +
  131.       GetValue(oldGeneration, row  , col+1) +
  132.       GetValue(oldGeneration, row+1, col-1) +
  133.       GetValue(oldGeneration, row+1, col  ) +
  134.       GetValue(oldGeneration, row+1, col+1);
  135.   char newVal = (sum == 3 || (sum == 2 && GetValue(oldGeneration, row, col)));
  136.   SetValue(newGeneration, row, col, newVal);
  137. }
  138.  
  139. static void UpdateBorders(__m128i* oldGeneration, __m128i* newGeneration) {
  140.   int row, col;
  141.   // Vertical borders
  142.   for (col = WIDTH-1; col < CELLS_WIDE-1; col += WIDTH) {
  143.     for(row = 1; row < CELLS_TALL-1; ++row) {
  144.       UpdateOne(oldGeneration, newGeneration, row, col);
  145.       UpdateOne(oldGeneration, newGeneration, row, col+1);
  146.     }
  147.   }
  148.   // Horizontal borders
  149.   for (row = HEIGHT-1; row < CELLS_TALL-1; row += HEIGHT) {
  150.     for(col = 1; col < CELLS_WIDE-1; ++col) {
  151.       UpdateOne(oldGeneration, newGeneration, row, col);
  152.       UpdateOne(oldGeneration, newGeneration, row+1, col);
  153.     }
  154.   }
  155. }
  156.  
  157. // Operations
  158.  
  159. void ComputeNextGeneration(__m128i* oldGeneration, __m128i* newGeneration) {
  160.   ComputeCells(oldGeneration, newGeneration);
  161.   UpdateBorders(oldGeneration, newGeneration);
  162. }
  163.  
  164. void Init(__m128i* buffer, const char* data) {
  165.   int r, c;
  166.   memset(buffer, 0, WIDTH*HEIGHT*sizeof(__m128i));
  167.   r=0, c=0;
  168.   while(*data) {
  169.     if('|' == *data) { ++r; c=-1; }
  170.     else if('#' == *data) { SetValue(buffer, r, c, 1); }
  171.     ++c;
  172.     ++data;
  173.   }
  174. }
  175.  
  176. void Print(__m128i* buffer, int row, int col, int width, int height) {
  177.   int r, c;
  178.   for (r = 0; r < height; ++r) {
  179.     for (c = 0; c < width; ++c) {
  180.       if (GetValue(buffer, row+r, col+c))
  181.         putchar('#');
  182.       else
  183.         putchar('.');
  184.     }
  185.     putchar('\n');
  186.   }
  187. }
  188.  
  189. static const char* GOSPER_GUN =
  190. "                                                |"
  191. "                                                |"
  192. "                           #                    |"
  193. "                         # #                    |"
  194. "               ##      ##            ##         |"
  195. "              #   #    ##            ##         |"
  196. "   ##        #     #   ##                       |"
  197. "   ##        #   # ##    # #                    |"
  198. "             #     #       #                    |"
  199. "              #   #                             |"
  200. "               ##                               |"
  201. "                                                |"
  202. ;
  203.  
  204. // Double-buffered data
  205. static __m128i tiles0[WIDTH*HEIGHT];
  206. static __m128i tiles1[WIDTH*HEIGHT];
  207.  
  208. int main() {
  209.   __m128i* oldGeneration = tiles0;
  210.   __m128i* newGeneration = tiles1;
  211.   __m128i* t;
  212.   int i;
  213.  
  214.   Init(oldGeneration, GOSPER_GUN);
  215.   // Simulates for 100 generations.
  216.   for (i = 0; i < 100; ++i) {
  217.     putchar('\n');
  218.     Print(oldGeneration, 0, 0, 79, 30);
  219.     ComputeNextGeneration(oldGeneration, newGeneration);
  220.     t = oldGeneration;
  221.     oldGeneration = newGeneration;
  222.     newGeneration = t;
  223.   }
  224.   return 0;
  225. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement