Transpose array of arrays

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <algorithm>
using namespace std;

#ifdef _MSC_VER
    #define FORCEINLINE __forceinline
    #define ALIGN(n) __declspec(align(n))
    #define RESTRICT __restrict
#else
    #define FORCEINLINE __attribute__((always_inline)) inline
    #define ALIGN(n) __attribute__((aligned(n)))
    #define RESTRICT __restrict__
#endif


struct Matrix {
    int r, c;
    float **data;

    static Matrix Create(int _r, int _c) {
        Matrix res;
        res.r = _r, res.c = _c;
        res.data = new float*[res.r];
        for (int i = 0; i < res.r; i++) {
            //res.data[i] = (float*)_mm_malloc(res.c * sizeof(float), 16);
            res.data[i] = new float[res.c];
            for (int j = 0; j < res.c; j++)
                res.data[i][j] = (float)rand();
        }
        return res;
    }
};

//===============================================================

void Transpose(Matrix &dst, const Matrix &src) {
    int r = dst.r, c = dst.c;
    assert(r == src.c && c == src.r);
    for (int i = 0; i < r; i++) {
        for (int j = 0; j < c; j++)
            dst.data[i][j] = src.data[j][i];
    }
}

//===============================================================

template<int BLOCK>
void ProcessFullBlock(float **dst, float **src, int a, int b) {
    const float *RESTRICT srcLines[BLOCK];
    for (int j = 0; j < BLOCK; j++)
        srcLines[j] = src[b + j] + a;
    for (int i = 0; i < BLOCK; i++) {
        float *RESTRICT dstLine = dst[a + i] + b;
        for (int j = 0; j < BLOCK; j++)
            dstLine[j] = srcLines[j][i];
    }
}

void ProcessPartialBlock(float **dst, float **src, int r, int c, int a, int b, int block) {
    for (int i = a; i < min(r, a + block); i++)
        for (int j = b; j < min(c, b + block); j++)
            dst[i][j] = src[j][i];
}

template<int BLOCK>
void TransposeBlocked(Matrix &dst, const Matrix &src) {
    int r = dst.r, c = dst.c;
    assert(r == src.c && c == src.r);
    for (int i = 0; i < r; i += BLOCK)
        for (int j = 0; j < c; j += BLOCK) {
            if (i + BLOCK <= r && j + BLOCK <= c)
                ProcessFullBlock<BLOCK>(dst.data, src.data, i, j);
            else
                ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
        }
}

//===============================================================

#include <xmmintrin.h>

template<int BLOCK>
void ProcessFullBlockSSE(float **dst, float **src, int a, int b) {
    static ALIGN(16) float buffer[BLOCK][BLOCK];
    for (int j = 0; j < BLOCK; j++)
        memcpy(buffer[j], src[b + j] + a, sizeof(buffer[j]));
    for (int i = 0; i < BLOCK; i += 4) {
        float *RESTRICT row0 = dst[a + i+0] + b;
        float *RESTRICT row1 = dst[a + i+1] + b;
        float *RESTRICT row2 = dst[a + i+2] + b;
        float *RESTRICT row3 = dst[a + i+3] + b;
        for (int j = 0; j < BLOCK; j += 4) {
            __m128 r0 = _mm_load_ps(&buffer[j+0][i]);
            __m128 r1 = _mm_load_ps(&buffer[j+1][i]);
            __m128 r2 = _mm_load_ps(&buffer[j+2][i]);
            __m128 r3 = _mm_load_ps(&buffer[j+3][i]);
            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
            _mm_storeu_ps(&row0[j], r0);
            _mm_storeu_ps(&row1[j], r1);
            _mm_storeu_ps(&row2[j], r2);
            _mm_storeu_ps(&row3[j], r3);
        }
    }
}

template<int BLOCK>
void TransposeBlockedSSE(Matrix &dst, const Matrix &src) {
    int r = dst.r, c = dst.c;
    assert(r == src.c && c == src.r);
    for (int i = 0; i < r; i += BLOCK)
        for (int j = 0; j < c; j += BLOCK) {
            if (i + BLOCK <= r && j + BLOCK <= c)
                ProcessFullBlockSSE<BLOCK>(dst.data, src.data, i, j);
            else
                ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
        }
}

//===============================================================

void AssertTransposed(const Matrix &dst, const Matrix &src) {
    assert(dst.r == src.c);
    assert(dst.c == src.r);
    for (int i = 0; i < dst.r; i++)
        for (int j = 0; j < dst.c; j++)
            assert(dst.data[i][j] == src.data[j][i]);
}


int main() {
    static const int LAUNCHES = 100;
    Matrix src = Matrix::Create(10000, 500);
    Matrix dst = Matrix::Create(500, 10000);


    int startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) Transpose(dst, src);
    printf("Trivial: %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);


    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<4>(dst, src);
    printf("Blocked(4): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<16>(dst, src);
    printf("Blocked(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<64>(dst, src);
    printf("Blocked(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<128>(dst, src);
    printf("Blocked(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<256>(dst, src);
    printf("Blocked(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);


    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<16>(dst, src);
    printf("BlockedSSE(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<64>(dst, src);
    printf("BlockedSSE(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<128>(dst, src);
    printf("BlockedSSE(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    startTime = clock();
    for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<256>(dst, src);
    printf("BlockedSSE(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
    AssertTransposed(dst, src);

    return 0;
}