Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Compile with gcc -msse2 -O3
- #include <stdio.h>
- #include <string.h>
- #include <emmintrin.h>
- // Width/Height of one tile, in cells
- #define WIDTH 2048
- #define HEIGHT 2048
- // Width/Height of board, in tiles
- #define TILES_WIDE 16
- #define TILES_TALL 8
- // Width/Height of board, in cells
- #define CELLS_WIDE (WIDTH*TILES_WIDE)
- #define CELLS_TALL (HEIGHT*TILES_TALL)
- // Fast bit-sliced implementation
- static __m128i ComputeCell(
- const __m128i* row0,
- const __m128i* row1,
- const __m128i* row2)
- {
- /*
- * Cell numbering:
- * row0 0 1 2
- * row1 3 4
- * row2 5 6 7
- * Intermediate variable naming pattern:
- * s470 is bit 0 of the sum of cells 4 through 7.
- * c470 is the carry out of the same sum.
- * Optimizations:
- * Sums of two cells can be 0, 1, or 2, but never 3, which allows
- * us to omit some carry logic when computing s032 and s472.
- * Also, sums >= 4 are treated the same later, so a sum that should
- * be 8 is represented as 4 instead to fit the result in 3 bits.
- */
- __m128i s010 = row0[0] ^ row0[1];
- __m128i s011 = row0[0] & row0[1];
- __m128i s230 = row0[2] ^ row1[0];
- __m128i s231 = row0[2] & row1[0];
- __m128i s030 = s010 ^ s230;
- __m128i c030 = s010 & s230;
- __m128i s031 = c030 ^ s011 ^ s231;
- __m128i s032 = s011 & s231;
- __m128i s450 = row1[2] ^ row2[0];
- __m128i s451 = row1[2] & row2[0];
- __m128i s670 = row2[1] ^ row2[2];
- __m128i s671 = row2[1] & row2[2];
- __m128i s470 = s450 ^ s670;
- __m128i c470 = s450 & s670;
- __m128i s471 = c470 ^ s451 ^ s671;
- __m128i s472 = s451 & s671;
- __m128i s070 = s030 ^ s470;
- __m128i c070 = s030 & s470;
- __m128i s071i = s031 ^ s471; // Intermediate values
- __m128i c071i = s031 & s471; // to simplify carry calculation.
- __m128i s071 = s071i ^ c070;
- __m128i c071 = c071i | (s071i & c070);
- __m128i s072 = s032 | s472 | c071;
- // Cell is live if:
- // (sum == 3) || ((sum == 2) && cell), which is the same as:
- // (!sum[2]) && ((sum[1] && sum[0]) || (sum[1] && cell)),
- // which can be further simplified to:
- // (!sum[2]) && sum[1] && (sum[0] || cell)
- return _mm_andnot_si128(s072, s071 & (s070 | row1[1]));
- }
- static void ComputeCells(const __m128i* oldGeneration, __m128i* newGeneration) {
- int row;
- const __m128i* pold;
- __m128i* pnew;
- __m128i* pend;
- for (row = 1; row < HEIGHT-1; ++row) {
- pnew = newGeneration + row*WIDTH + 1;
- pold = oldGeneration + row*WIDTH;
- for(pend = pnew + WIDTH - 2; pnew != pend; ++pold, ++pnew) {
- *pnew = ComputeCell(pold - WIDTH, pold, pold + WIDTH);
- }
- }
- }
- // Slow accessors
- static __m128i* GetPointerFor(__m128i* buffer, int row, int col) {
- int row_index = row % HEIGHT;
- int col_index = col % WIDTH;
- return &(buffer[row_index * WIDTH + col_index]);
- }
- static int GetBitOffsetFor(int row, int col) {
- return (row / HEIGHT) * TILES_WIDE + (col / WIDTH);
- }
- static char* GetBytePointerFor(__m128i* buffer, int row, int col) {
- char* base = (char*)GetPointerFor(buffer, row, col);
- return base + GetBitOffsetFor(row, col) / 8;
- }
- static char GetByteMaskFor(int row, int col) {
- return 1 << (GetBitOffsetFor(row, col) % 8);
- }
- static char GetValue(__m128i* buffer, int row, int col) {
- char val = GetByteMaskFor(row, col) & *GetBytePointerFor(buffer, row, col);
- return val ? 1 : 0;
- }
- static void SetValue(__m128i* buffer, int row, int col, char val) {
- char mask = GetByteMaskFor(row, col);
- char* ptr = GetBytePointerFor(buffer, row, col);
- val = val ? 0 : mask;
- *ptr = val ^ (mask | *ptr);
- }
- // Border logic
- static void UpdateOne(__m128i* oldGeneration, __m128i* newGeneration, int row, int col) {
- int sum =
- GetValue(oldGeneration, row-1, col-1) +
- GetValue(oldGeneration, row-1, col ) +
- GetValue(oldGeneration, row-1, col+1) +
- GetValue(oldGeneration, row , col-1) +
- GetValue(oldGeneration, row , col+1) +
- GetValue(oldGeneration, row+1, col-1) +
- GetValue(oldGeneration, row+1, col ) +
- GetValue(oldGeneration, row+1, col+1);
- char newVal = (sum == 3 || (sum == 2 && GetValue(oldGeneration, row, col)));
- SetValue(newGeneration, row, col, newVal);
- }
- static void UpdateBorders(__m128i* oldGeneration, __m128i* newGeneration) {
- int row, col;
- // Vertical borders
- for (col = WIDTH-1; col < CELLS_WIDE-1; col += WIDTH) {
- for(row = 1; row < CELLS_TALL-1; ++row) {
- UpdateOne(oldGeneration, newGeneration, row, col);
- UpdateOne(oldGeneration, newGeneration, row, col+1);
- }
- }
- // Horizontal borders
- for (row = HEIGHT-1; row < CELLS_TALL-1; row += HEIGHT) {
- for(col = 1; col < CELLS_WIDE-1; ++col) {
- UpdateOne(oldGeneration, newGeneration, row, col);
- UpdateOne(oldGeneration, newGeneration, row+1, col);
- }
- }
- }
- // Operations
- void ComputeNextGeneration(__m128i* oldGeneration, __m128i* newGeneration) {
- ComputeCells(oldGeneration, newGeneration);
- UpdateBorders(oldGeneration, newGeneration);
- }
- void Init(__m128i* buffer, const char* data) {
- int r, c;
- memset(buffer, 0, WIDTH*HEIGHT*sizeof(__m128i));
- r=0, c=0;
- while(*data) {
- if('|' == *data) { ++r; c=-1; }
- else if('#' == *data) { SetValue(buffer, r, c, 1); }
- ++c;
- ++data;
- }
- }
- void Print(__m128i* buffer, int row, int col, int width, int height) {
- int r, c;
- for (r = 0; r < height; ++r) {
- for (c = 0; c < width; ++c) {
- if (GetValue(buffer, row+r, col+c))
- putchar('#');
- else
- putchar('.');
- }
- putchar('\n');
- }
- }
- static const char* GOSPER_GUN =
- " |"
- " |"
- " # |"
- " # # |"
- " ## ## ## |"
- " # # ## ## |"
- " ## # # ## |"
- " ## # # ## # # |"
- " # # # |"
- " # # |"
- " ## |"
- " |"
- ;
- // Double-buffered data
- static __m128i tiles0[WIDTH*HEIGHT];
- static __m128i tiles1[WIDTH*HEIGHT];
- int main() {
- __m128i* oldGeneration = tiles0;
- __m128i* newGeneration = tiles1;
- __m128i* t;
- int i;
- Init(oldGeneration, GOSPER_GUN);
- // Simulates for 100 generations.
- for (i = 0; i < 100; ++i) {
- putchar('\n');
- Print(oldGeneration, 0, 0, 79, 30);
- ComputeNextGeneration(oldGeneration, newGeneration);
- t = oldGeneration;
- oldGeneration = newGeneration;
- newGeneration = t;
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement