Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <assert.h>
- #include <algorithm>
- using namespace std;
- #ifdef _MSC_VER
- #define FORCEINLINE __forceinline
- #define ALIGN(n) __declspec(align(n))
- #define RESTRICT __restrict
- #else
- #define FORCEINLINE __attribute__((always_inline)) inline
- #define ALIGN(n) __attribute__((aligned(n)))
- #define RESTRICT __restrict__
- #endif
- struct Matrix {
- int r, c;
- float **data;
- static Matrix Create(int _r, int _c) {
- Matrix res;
- res.r = _r, res.c = _c;
- res.data = new float*[res.r];
- for (int i = 0; i < res.r; i++) {
- //res.data[i] = (float*)_mm_malloc(res.c * sizeof(float), 16);
- res.data[i] = new float[res.c];
- for (int j = 0; j < res.c; j++)
- res.data[i][j] = (float)rand();
- }
- return res;
- }
- };
- //===============================================================
- void Transpose(Matrix &dst, const Matrix &src) {
- int r = dst.r, c = dst.c;
- assert(r == src.c && c == src.r);
- for (int i = 0; i < r; i++) {
- for (int j = 0; j < c; j++)
- dst.data[i][j] = src.data[j][i];
- }
- }
- //===============================================================
- template<int BLOCK>
- void ProcessFullBlock(float **dst, float **src, int a, int b) {
- const float *RESTRICT srcLines[BLOCK];
- for (int j = 0; j < BLOCK; j++)
- srcLines[j] = src[b + j] + a;
- for (int i = 0; i < BLOCK; i++) {
- float *RESTRICT dstLine = dst[a + i] + b;
- for (int j = 0; j < BLOCK; j++)
- dstLine[j] = srcLines[j][i];
- }
- }
- void ProcessPartialBlock(float **dst, float **src, int r, int c, int a, int b, int block) {
- for (int i = a; i < min(r, a + block); i++)
- for (int j = b; j < min(c, b + block); j++)
- dst[i][j] = src[j][i];
- }
- template<int BLOCK>
- void TransposeBlocked(Matrix &dst, const Matrix &src) {
- int r = dst.r, c = dst.c;
- assert(r == src.c && c == src.r);
- for (int i = 0; i < r; i += BLOCK)
- for (int j = 0; j < c; j += BLOCK) {
- if (i + BLOCK <= r && j + BLOCK <= c)
- ProcessFullBlock<BLOCK>(dst.data, src.data, i, j);
- else
- ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
- }
- }
- //===============================================================
- #include <xmmintrin.h>
- template<int BLOCK>
- void ProcessFullBlockSSE(float **dst, float **src, int a, int b) {
- static ALIGN(16) float buffer[BLOCK][BLOCK];
- for (int j = 0; j < BLOCK; j++)
- memcpy(buffer[j], src[b + j] + a, sizeof(buffer[j]));
- for (int i = 0; i < BLOCK; i += 4) {
- float *RESTRICT row0 = dst[a + i+0] + b;
- float *RESTRICT row1 = dst[a + i+1] + b;
- float *RESTRICT row2 = dst[a + i+2] + b;
- float *RESTRICT row3 = dst[a + i+3] + b;
- for (int j = 0; j < BLOCK; j += 4) {
- __m128 r0 = _mm_load_ps(&buffer[j+0][i]);
- __m128 r1 = _mm_load_ps(&buffer[j+1][i]);
- __m128 r2 = _mm_load_ps(&buffer[j+2][i]);
- __m128 r3 = _mm_load_ps(&buffer[j+3][i]);
- _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
- _mm_storeu_ps(&row0[j], r0);
- _mm_storeu_ps(&row1[j], r1);
- _mm_storeu_ps(&row2[j], r2);
- _mm_storeu_ps(&row3[j], r3);
- }
- }
- }
- template<int BLOCK>
- void TransposeBlockedSSE(Matrix &dst, const Matrix &src) {
- int r = dst.r, c = dst.c;
- assert(r == src.c && c == src.r);
- for (int i = 0; i < r; i += BLOCK)
- for (int j = 0; j < c; j += BLOCK) {
- if (i + BLOCK <= r && j + BLOCK <= c)
- ProcessFullBlockSSE<BLOCK>(dst.data, src.data, i, j);
- else
- ProcessPartialBlock(dst.data, src.data, r, c, i, j, BLOCK);
- }
- }
- //===============================================================
- void AssertTransposed(const Matrix &dst, const Matrix &src) {
- assert(dst.r == src.c);
- assert(dst.c == src.r);
- for (int i = 0; i < dst.r; i++)
- for (int j = 0; j < dst.c; j++)
- assert(dst.data[i][j] == src.data[j][i]);
- }
- int main() {
- static const int LAUNCHES = 100;
- Matrix src = Matrix::Create(10000, 500);
- Matrix dst = Matrix::Create(500, 10000);
- int startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) Transpose(dst, src);
- printf("Trivial: %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<4>(dst, src);
- printf("Blocked(4): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<16>(dst, src);
- printf("Blocked(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<64>(dst, src);
- printf("Blocked(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<128>(dst, src);
- printf("Blocked(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlocked<256>(dst, src);
- printf("Blocked(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<16>(dst, src);
- printf("BlockedSSE(16): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<64>(dst, src);
- printf("BlockedSSE(64): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<128>(dst, src);
- printf("BlockedSSE(128): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- startTime = clock();
- for (int q = 0; q < LAUNCHES; q++) TransposeBlockedSSE<256>(dst, src);
- printf("BlockedSSE(256): %0.3lf sec\n", double(clock() - startTime) / CLOCKS_PER_SEC);
- AssertTransposed(dst, src);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement