Advertisement
elvman

NEON transpose test

Apr 17th, 2019
201
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 4.40 KB | None | 0 0
  1. #include <arm_neon.h>
  2. #include <sys/time.h>
  3. #include <stdio.h>
  4. #include <stdint.h>
  5. #include <stdlib.h>
  6. #include <inttypes.h>
  7.  
  // Reference (scalar) transpose of a 4x4 row-major float matrix.
  //
  // Reads the 16 elements of `m` in memory order and writes each one to the
  // mirrored position in `dst`, i.e. dst[col * 4 + row] = m[row * 4 + col].
  // `m` and `dst` are expected to be distinct 16-element arrays.
  static void transpose(float m[16], float dst[16])
  {
      for (size_t i = 0; i < 16; ++i)
      {
          const size_t row = i / 4;
          const size_t col = i % 4;

          dst[col * 4 + row] = m[i];
      }
  }
  14.  
  15. static void transposeAsm(float m[16], float dst[16])
  16. {
  17.     asm volatile
  18.     (
  19.      "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1]\n\t" // dst.m[0, 4, 8, 12] = m[0-3]
  20.      "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0]\n\t" // dst.m[1, 5, 9, 13] = m[4-7]
  21.      : // output
  22.      : "r"(dst), "r"(m) // input
  23.      : "v0", "v1", "v2", "v3", "memory"
  24.      );
  25. }
  26.  
  // Transposes the 4x4 row-major float matrix `m` into `dst` with NEON
  // intrinsics.
  //
  // Steps:
  //   1. load the four rows;
  //   2. vtrnq_f32 on each pair of rows swaps the off-diagonal elements of every
  //      2x2 sub-block;
  //   3. recombine the low/high halves of those results so that each output
  //      vector holds one column of the input, and store them as rows of `dst`.
  // All four loads happen before the first store.
  static void transposeIntrin(float m[16], float dst[16])
  {
      const float32x4_t row0 = vld1q_f32(m);
      const float32x4_t row1 = vld1q_f32(m + 4);
      const float32x4_t row2 = vld1q_f32(m + 8);
      const float32x4_t row3 = vld1q_f32(m + 12);

      // top.val[0]    = {m0, m4,  m2,  m6},  top.val[1]    = {m1, m5,  m3,  m7}
      // bottom.val[0] = {m8, m12, m10, m14}, bottom.val[1] = {m9, m13, m11, m15}
      const float32x4x2_t top = vtrnq_f32(row0, row1);
      const float32x4x2_t bottom = vtrnq_f32(row2, row3);

      // Low halves carry columns 0 and 1, high halves carry columns 2 and 3.
      const float32x4_t col0 = vcombine_f32(vget_low_f32(top.val[0]), vget_low_f32(bottom.val[0]));
      const float32x4_t col1 = vcombine_f32(vget_low_f32(top.val[1]), vget_low_f32(bottom.val[1]));
      const float32x4_t col2 = vcombine_f32(vget_high_f32(top.val[0]), vget_high_f32(bottom.val[0]));
      const float32x4_t col3 = vcombine_f32(vget_high_f32(top.val[1]), vget_high_f32(bottom.val[1]));

      vst1q_f32(dst, col0);
      vst1q_f32(dst + 4, col1);
      vst1q_f32(dst + 8, col2);
      vst1q_f32(dst + 12, col3);
  }
  55.  
  // Returns the current time reported by gettimeofday(), converted to
  // microseconds (seconds * 1,000,000 + microseconds).
  //
  // Returns 0 if gettimeofday() reports failure, rather than reading a
  // timeval that was never filled in.
  //
  // The parameter list is spelled `(void)` so the definition is a real
  // prototype and accidental arguments are diagnosed by the compiler.
  static inline uint64_t getMicrosec(void)
  {
      struct timeval now;

      if (gettimeofday(&now, NULL) != 0)
      {
          return 0;
      }

      return ((uint64_t)now.tv_sec * 1000000u) + (uint64_t)now.tv_usec;
  }
  62.  
  63. int main(int argc, char * argv[])
  64. {
  65.     float m1[16];
  66.     float m2[16];
  67.     float val = 0.0F;
  68.  
  69.     for (size_t x = 0; x < 4; ++x)
  70.         for (size_t y = 0; y < 4; ++y)
  71.             m1[x * 4 + y] = (val += 1.0f);
  72.  
  73.     transpose(m1, m2);
  74.  
  75.     val = 0.0F;
  76.     for (size_t x = 0; x < 4; ++x)
  77.         for (size_t y = 0; y < 4; ++y)
  78.             if (m2[y * 4 + x] != (val += 1.0f))
  79.                 return EXIT_FAILURE;
  80.  
  81.     val = 0.0F;
  82.     for (size_t x = 0; x < 4; ++x)
  83.         for (size_t y = 0; y < 4; ++y)
  84.             m1[x * 4 + y] = (val += 1.0f);
  85.  
  86.     transposeAsm(m1, m2);
  87.  
  88.     val = 0.0F;
  89.     for (size_t x = 0; x < 4; ++x)
  90.         for (size_t y = 0; y < 4; ++y)
  91.             if (m2[y * 4 + x] != (val += 1.0f))
  92.                 return EXIT_FAILURE;
  93.  
  94.     val = 0.0F;
  95.  
  96.     for (size_t x = 0; x < 4; ++x)
  97.         for (size_t y = 0; y < 4; ++y)
  98.             m1[x * 4 + y] = (val += 1.0f);
  99.  
  100.     transposeIntrin(m1, m2);
  101.  
  102.     val = 0.0F;
  103.     for (size_t x = 0; x < 4; ++x)
  104.         for (size_t y = 0; y < 4; ++y)
  105.             if (m2[y * 4 + x] != (val += 1.0f))
  106.                 return EXIT_FAILURE;
  107.  
  108.     const uint32_t COUNT = 10000000;
  109.  
  110.     uint64_t start, finish;
  111.  
  112.     val = 0.0F;
  113.     start = getMicrosec();
  114.  
  115.     for (uint32_t i = 0; i < COUNT; ++i)
  116.     {
  117.         for (size_t x = 0; x < 4; ++x)
  118.             for (size_t y = 0; y < 4; ++y)
  119.             {
  120.                 val *= 100;
  121.                 val /= 3.14;
  122.                 m1[x * 4 + y] = val;
  123.             }
  124.  
  125.         transpose(m1, m2);
  126.     }
  127.  
  128.     finish = getMicrosec();
  129.  
  130.     printf("%" PRIu64 "\n", finish - start);
  131.  
  132.     val = 0.0F;
  133.     start = getMicrosec();
  134.  
  135.     for (uint32_t i = 0; i < COUNT; ++i)
  136.     {
  137.         for (size_t x = 0; x < 4; ++x)
  138.             for (size_t y = 0; y < 4; ++y)
  139.             {
  140.                 val *= 100;
  141.                 val /= 3.14;
  142.                 m1[x * 4 + y] = val;
  143.             }
  144.  
  145.         transposeAsm(m1, m2);
  146.     }
  147.  
  148.     finish = getMicrosec();
  149.  
  150.     printf("ASM: %" PRIu64 "\n", finish - start);
  151.  
  152.     val = 0.0F;
  153.     start = getMicrosec();
  154.  
  155.     for (uint32_t i = 0; i < COUNT; ++i)
  156.     {
  157.         for (size_t x = 0; x < 4; ++x)
  158.             for (size_t y = 0; y < 4; ++y)
  159.             {
  160.                 val *= 100;
  161.                 val /= 3.14;
  162.                 m1[x * 4 + y] = val;
  163.             }
  164.  
  165.         transposeIntrin(m1, m2);
  166.     }
  167.  
  168.     finish = getMicrosec();
  169.  
  170.     printf("Intrin: %" PRIu64 "\n", finish - start);
  171.  
  172.     return EXIT_SUCCESS;
  173. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement