Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <arm_neon.h>
- #include <sys/time.h>
- #include <stdio.h>
- #include <stdint.h>
- #include <stdlib.h>
- #include <inttypes.h>
/**
 * Reference (scalar) transpose of a 4x4 row-major float matrix.
 *
 * For every row/column pair this stores dst[col * 4 + row] = m[row * 4 + col].
 * The call is also valid when dst and m are the very same array: that case
 * swaps each element above the diagonal with its mirror image instead of
 * overwriting values that still have to be read. Arrays that overlap only
 * partially are not supported.
 *
 * @param m    source matrix, 16 floats; only modified when it is also dst
 * @param dst  destination matrix, 16 floats; may be the same array as m
 */
static void transpose(const float m[16], float dst[16])
{
    if (dst == m) {
        /* In place: only the six off-diagonal pairs need to move. */
        for (uint32_t row = 0; row < 4; ++row) {
            for (uint32_t col = row + 1; col < 4; ++col) {
                const float upper = dst[row * 4 + col];
                dst[row * 4 + col] = dst[col * 4 + row];
                dst[col * 4 + row] = upper;
            }
        }
        return;
    }

    for (uint32_t row = 0; row < 4; ++row) {
        for (uint32_t col = 0; col < 4; ++col) {
            dst[col * 4 + row] = m[row * 4 + col];
        }
    }
}
- static void transposeAsm(float m[16], float dst[16])
- {
- asm volatile
- (
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]\n\t" // dst.m[0, 4, 8, 12] = m[0-3]
- "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]\n\t" // dst.m[1, 5, 9, 13] = m[4-7]
- : // output
- : "r"(dst), "r"(m) // input
- : "v0", "v1", "v2", "v3", "memory"
- );
- }
/**
 * Transposes a 4x4 row-major float matrix using NEON intrinsics.
 *
 * The four rows are loaded into q-registers, adjacent rows are interleaved
 * with vtrnq_f32, and the resulting 64-bit halves are recombined into the
 * four output rows.
 *
 * @param m    source matrix, 16 floats
 * @param dst  destination matrix, 16 floats
 */
static void transposeIntrin(float m[16], float dst[16])
{
    /* One vector per source row. */
    const float32x4_t row0 = vld1q_f32(m);
    const float32x4_t row1 = vld1q_f32(m + 4);
    const float32x4_t row2 = vld1q_f32(m + 8);
    const float32x4_t row3 = vld1q_f32(m + 12);

    /*
     * For inputs a and b, vtrnq_f32 yields
     *   val[0] = { a0, b0, a2, b2 }
     *   val[1] = { a1, b1, a3, b3 }
     * i.e. the low halves hold output rows 0/1, the high halves rows 2/3.
     */
    const float32x4x2_t top    = vtrnq_f32(row0, row1);
    const float32x4x2_t bottom = vtrnq_f32(row2, row3);

    /* Glue the matching halves of the top and bottom pairs together. */
    vst1q_f32(dst,
              vcombine_f32(vget_low_f32(top.val[0]), vget_low_f32(bottom.val[0])));
    vst1q_f32(dst + 4,
              vcombine_f32(vget_low_f32(top.val[1]), vget_low_f32(bottom.val[1])));
    vst1q_f32(dst + 8,
              vcombine_f32(vget_high_f32(top.val[0]), vget_high_f32(bottom.val[0])));
    vst1q_f32(dst + 12,
              vcombine_f32(vget_high_f32(top.val[1]), vget_high_f32(bottom.val[1])));
}
/**
 * Returns the current time of day in microseconds, as reported by
 * gettimeofday().
 *
 * NOTE(review): gettimeofday() reports wall-clock time, so a difference of
 * two readings can be distorted if the system clock is adjusted in between.
 *
 * @return tv_sec * 1000000 + tv_usec, or 0 if gettimeofday() fails
 */
static inline uint64_t getMicrosec(void)
{
    struct timeval now;

    if (gettimeofday(&now, NULL) != 0) {
        /* `now` was not filled in; reading it would be undefined. */
        return 0;
    }

    return (uint64_t)now.tv_sec * 1000000u + (uint64_t)now.tv_usec;
}
/* Fills m with 1.0, 2.0, ..., 16.0 in storage order. */
static void fillSequential(float m[16])
{
    float val = 0.0F;
    for (size_t i = 0; i < 16; ++i) {
        m[i] = (val += 1.0f);
    }
}

/* Sets every element of m to 0.0 so that stale results cannot pass a check. */
static void clearMatrix(float m[16])
{
    for (size_t i = 0; i < 16; ++i) {
        m[i] = 0.0F;
    }
}

/* Returns 1 if t is the transpose of the matrix made by fillSequential(). */
static int isTransposeOfSequential(const float t[16])
{
    float expected = 0.0F;
    for (size_t x = 0; x < 4; ++x) {
        for (size_t y = 0; y < 4; ++y) {
            if (t[y * 4 + x] != (expected += 1.0f)) {
                return 0;
            }
        }
    }
    return 1;
}

/*
 * Produces the per-iteration benchmark input: each element is the running
 * value *val after multiplying by 100 and dividing by 3.14.
 * NOTE(review): every benchmark starts *val at 0.0, so this arithmetic keeps
 * producing 0.0 -- confirm whether non-trivial data was intended.
 */
static inline void fillBenchmarkInput(float m[16], float *val)
{
    for (size_t i = 0; i < 16; ++i) {
        *val *= 100;
        *val /= 3.14;
        m[i] = *val;
    }
}

/*
 * Compiler barrier that makes `result` count as used. Without it nothing ever
 * reads the benchmark output, so an optimising compiler may drop the scalar
 * and intrinsic transposes as dead stores while the volatile asm variant has
 * to stay, which would make the three timings incomparable.
 */
static inline void keepResult(const float result[16])
{
    __asm__ __volatile__("" : : "r"(result) : "memory");
}

/**
 * Self-checks the three transpose implementations, then times COUNT
 * iterations of each and prints the elapsed microseconds to stdout.
 *
 * @return EXIT_SUCCESS, or EXIT_FAILURE if any implementation produces a
 *         wrong result (the failing variant is named on stderr).
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    float m1[16];
    float m2[16];

    /*
     * Correctness checks. m2 is cleared before every variant; otherwise a
     * variant that writes nothing would still pass on the previous result.
     */
    fillSequential(m1);
    clearMatrix(m2);
    transpose(m1, m2);
    if (!isTransposeOfSequential(m2)) {
        fprintf(stderr, "transpose: wrong result\n");
        return EXIT_FAILURE;
    }

    fillSequential(m1);
    clearMatrix(m2);
    transposeAsm(m1, m2);
    if (!isTransposeOfSequential(m2)) {
        fprintf(stderr, "transposeAsm: wrong result\n");
        return EXIT_FAILURE;
    }

    fillSequential(m1);
    clearMatrix(m2);
    transposeIntrin(m1, m2);
    if (!isTransposeOfSequential(m2)) {
        fprintf(stderr, "transposeIntrin: wrong result\n");
        return EXIT_FAILURE;
    }

    /* Benchmarks: identical loop shape for all three variants. */
    const uint32_t COUNT = 10000000;
    uint64_t start, finish;
    float val;

    val = 0.0F;
    start = getMicrosec();
    for (uint32_t i = 0; i < COUNT; ++i) {
        fillBenchmarkInput(m1, &val);
        transpose(m1, m2);
        keepResult(m2);
    }
    finish = getMicrosec();
    printf("%" PRIu64 "\n", finish - start);

    val = 0.0F;
    start = getMicrosec();
    for (uint32_t i = 0; i < COUNT; ++i) {
        fillBenchmarkInput(m1, &val);
        transposeAsm(m1, m2);
        keepResult(m2);
    }
    finish = getMicrosec();
    printf("ASM: %" PRIu64 "\n", finish - start);

    val = 0.0F;
    start = getMicrosec();
    for (uint32_t i = 0; i < COUNT; ++i) {
        fillBenchmarkInput(m1, &val);
        transposeIntrin(m1, m2);
        keepResult(m2);
    }
    finish = getMicrosec();
    printf("Intrin: %" PRIu64 "\n", finish - start);

    return EXIT_SUCCESS;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement