Advertisement
stgatilov

Transpose array of arrays

Nov 30th, 2015
408
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.93 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <time.h>
  4. #include <assert.h>
  5. #include <algorithm>
  6. using namespace std;
  7.  
  8. #ifdef _MSC_VER
  9.     #define FORCEINLINE __forceinline
  10.     #define ALIGN(n) __declspec(align(n))
  11.     #define RESTRICT __restrict
  12. #else
  13.     #define FORCEINLINE __attribute__((always_inline)) inline
  14.     #define ALIGN(n) __attribute__((aligned(n)))
  15.     #define RESTRICT __restrict__
  16. #endif
  17.  
  18.  
  19. struct Matrix {
  20.     int r, c;
  21.     float **data;
  22.  
  23.     static Matrix Create(int _r, int _c) {
  24.         Matrix res;
  25.         res.r = _r, res.c = _c;
  26.         res.data = new float*[res.r];
  27.         for (int i = 0; i < res.r; i++) {
  28.             //res.data[i] = (float*)_mm_malloc(res.c * sizeof(float), 16);
  29.             res.data[i] = new float[res.c];
  30.             for (int j = 0; j < res.c; j++)
  31.                 res.data[i][j] = (float)rand();
  32.         }
  33.         return res;
  34.     }
  35. };
  36.  
  37. //===============================================================
  38.  
  39. void Transpose(Matrix &dst, const Matrix &src) {
  40.     int r = dst.r, c = dst.c;
  41.     assert(r == src.c && c == src.r);
  42.     for (int i = 0; i < r; i++) {
  43.         for (int j = 0; j < c; j++)
  44.             dst.data[i][j] = src.data[j][i];
  45.     }
  46. }
  47.  
  48. //===============================================================
  49.  
  50. template<int BLOCK>
  51. void ProcessFullBlock(float **dst, float **src, int a, int b) {
  52.     const float *RESTRICT srcLines[BLOCK];
  53.     for (int j = 0; j < BLOCK; j++)
  54.         srcLines[j] = src[b + j] + a;
  55.     for (int i = 0; i < BLOCK; i++) {
  56.         float *RESTRICT dstLine = dst[a + i] + b;
  57.         for (int j = 0; j < BLOCK; j++)
  58.             dstLine[j] = srcLines[j][i];
  59.     }
  60. }
  61.  
  62. void ProcessPartialBlock(float **dst, float **src, int r, int c, int a, int b, int block) {
  63.     for (int i = a; i < min(r, a + block); i++)
  64.         for (int j = b; j < min(c, b + block); j++)
  65.             dst[i][j] = src[j][i];
  66. }
  67.  
  68. template<int BLOCK>
  69. void TransposeBlocked(Matrix &dst, const Matrix &src) {
  70.     int r = dst.r, c = dst.c;
  71.     assert(r == src.c && c == src.r);
  72.     for (int i = 0; i < r; i += BLOCK)
  73.         for (int j = 0; j < c; j += BLOCK) {
  74.             if (i + BLOCK <= r && j + BLOCK <= c)
  75.                 ProcessFullBlock<BLOCK>(dst.data, src.data, i, j);
  76.             else
  77.                 ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
  78.         }
  79. }
  80.  
  81. //===============================================================
  82.  
  83. #include <xmmintrin.h>
  84.  
  85. template<int BLOCK>
  86. void ProcessFullBlockSSE(float **dst, float **src, int a, int b) {
  87.     static ALIGN(16) float buffer[BLOCK][BLOCK];
  88.     for (int j = 0; j < BLOCK; j++)
  89.         memcpy(buffer[j], src[b + j] + a, sizeof(buffer[j]));
  90.     for (int i = 0; i < BLOCK; i += 4) {
  91.         float *RESTRICT row0 = dst[a + i+0] + b;
  92.         float *RESTRICT row1 = dst[a + i+1] + b;
  93.         float *RESTRICT row2 = dst[a + i+2] + b;
  94.         float *RESTRICT row3 = dst[a + i+3] + b;
  95.         for (int j = 0; j < BLOCK; j += 4) {
  96.             __m128 r0 = _mm_load_ps(&buffer[j+0][i]);
  97.             __m128 r1 = _mm_load_ps(&buffer[j+1][i]);
  98.             __m128 r2 = _mm_load_ps(&buffer[j+2][i]);
  99.             __m128 r3 = _mm_load_ps(&buffer[j+3][i]);
  100.             _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  101.             _mm_storeu_ps(&row0[j], r0);
  102.             _mm_storeu_ps(&row1[j], r1);
  103.             _mm_storeu_ps(&row2[j], r2);
  104.             _mm_storeu_ps(&row3[j], r3);
  105.         }
  106.     }
  107. }
  108.  
  109. template<int BLOCK>
  110. void TransposeBlockedSSE(Matrix &dst, const Matrix &src) {
  111.     int r = dst.r, c = dst.c;
  112.     assert(r == src.c && c == src.r);
  113.     for (int i = 0; i < r; i += BLOCK)
  114.         for (int j = 0; j < c; j += BLOCK) {
  115.             if (i + BLOCK <= r && j + BLOCK <= c)
  116.                 ProcessFullBlockSSE<BLOCK>(dst.data, src.data, i, j);
  117.             else
  118.                 ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
  119.         }
  120. }
  121.  
  122. //===============================================================
  123.  
  124. void AssertTransposed(const Matrix &dst, const Matrix &src) {
  125.     assert(dst.r == src.c);
  126.     assert(dst.c == src.r);
  127.     for (int i = 0; i < dst.r; i++)
  128.         for (int j = 0; j < dst.c; j++)
  129.             assert(dst.data[i][j] == src.data[j][i]);
  130. }
  131.  
  132.  
  133. int main() {
  134.     static const int LAUNCHES = 100;
  135.     Matrix src = Matrix::Create(10000, 500);
  136.     Matrix dst = Matrix::Create(500, 10000);
  137.  
  138.    
  139.     int startTime = clock();
  140.     for (int q = 0; q < LAUNCHES; q++) Transpose(dst, src);
  141.     printf("Trivial: %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  142.     AssertTransposed(dst, src);
  143.  
  144.  
  145.     startTime = clock();
  146.     for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<4>(dst, src);
  147.     printf("Blocked(4): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  148.     AssertTransposed(dst, src);
  149.  
  150.     startTime = clock();
  151.     for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<16>(dst, src);
  152.     printf("Blocked(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  153.     AssertTransposed(dst, src);
  154.  
  155.     startTime = clock();
  156.     for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<64>(dst, src);
  157.     printf("Blocked(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  158.     AssertTransposed(dst, src);
  159.  
  160.     startTime = clock();
  161.     for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<128>(dst, src);
  162.     printf("Blocked(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  163.     AssertTransposed(dst, src);
  164.  
  165.     startTime = clock();
  166.     for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<256>(dst, src);
  167.     printf("Blocked(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  168.     AssertTransposed(dst, src);
  169.  
  170.  
  171.     startTime = clock();
  172.     for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<16>(dst, src);
  173.     printf("BlockedSSE(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  174.     AssertTransposed(dst, src);
  175.  
  176.     startTime = clock();
  177.     for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<64>(dst, src);
  178.     printf("BlockedSSE(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  179.     AssertTransposed(dst, src);
  180.  
  181.     startTime = clock();
  182.     for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<128>(dst, src);
  183.     printf("BlockedSSE(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  184.     AssertTransposed(dst, src);
  185.  
  186.     startTime = clock();
  187.     for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<256>(dst, src);
  188.     printf("BlockedSSE(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
  189.     AssertTransposed(dst, src);
  190.  
  191.     return 0;
  192. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement