Advertisement
Guest User

Untitled

a guest
Jun 22nd, 2014
226
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 10.34 KB | None | 0 0
  1. diff --git a/Makefile b/Makefile
  2. index 38d3910..c17f68e 100644
  3. --- a/Makefile
  4. +++ b/Makefile
  5. @@ -8,13 +8,13 @@ BINDIR        = bin
  6.  OBJDIR         = obj
  7.  VPATH      = src
  8.  
  9. -C_FILES        = main rgb2yuv_wiki rgb2yuv_novell_ch rgb2yuv_tables rgb2yuv_sse rgb2yuv_gstreamer
  10. +C_FILES        = main rgb2yuv_wiki rgb2yuv_novell_ch rgb2yuv_tables rgb2yuv_sse rgb2yuv_sse2 rgb2yuv_gstreamer
  11.  OBJS       = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(C_FILES)))
  12.  
  13.  CFLAGS     = -fmessage-length=0 -fstack-protector -O2 -Wall -D_FORTIFY_SOURCE=2 -funwind-tables -fasynchronous-unwind-tables -g \
  14.               -Wall -Wextra -Wno-unused-parameter -Wformat-y2k -Winit-self -msse4.1 \
  15. -             -Wstrict-prototypes -Winline -Wnested-externs -Wbad-function-cast -Wshadow
  16. -LDFLAGS        = -lrt
  17. +             -Wstrict-prototypes -Winline -Wnested-externs -Wbad-function-cast -Wshadow -std=gnu11
  18. +LDFLAGS        = -lrt -lgomp
  19.  
  20.  #
  21.  # Targets
  22. diff --git a/src/main.c b/src/main.c
  23. index 4689649..7368fab 100644
  24. --- a/src/main.c
  25. +++ b/src/main.c
  26. @@ -3,69 +3,56 @@
  27.  #include "rgb2yuv.h"
  28.  #include <stdlib.h>
  29.  #include <stdio.h>
  30. +#include <omp.h>
  31.  
  32. -#define PIXELS_COUNT   (100 * 1000 * 1000)
  33. +#define PIXELS_COUNT  1080
  34.  
  35.  static uint8_t pixels_wiki[4 * PIXELS_COUNT];
  36.  static uint8_t pixels_novel_ch[4 * PIXELS_COUNT];
  37.  static uint8_t pixels_tables[4 * PIXELS_COUNT];
  38.  static uint8_t pixels_gstreamer[4 * PIXELS_COUNT];
  39.  static uint8_t pixels_see[4 * PIXELS_COUNT];
  40. -
  41. -int main(int argc, char *argv[])
  42. -{
  43. -   struct timespec now;
  44. -   clock_t start, finish;
  45. -
  46. -   clock_gettime(CLOCK_REALTIME, &now);
  47. -   srand(now.tv_nsec);
  48. -
  49. -   // prepare input buffers
  50. -
  51. -   int i;
  52. -   for (i = 0; i < 4 * PIXELS_COUNT; i++)
  53. -   {
  54. -       uint8_t x = ((i % 4) * rand());
  55. -       pixels_wiki[i] = x;
  56. -       pixels_novel_ch[i] = x;
  57. -       pixels_tables[i] = x;
  58. -                pixels_gstreamer[i] = x;
  59. -                pixels_see[i] = x;
  60. -   }
  61. -
  62. -   // run conversations
  63. -
  64. -   start = clock();
  65. -   rgb2yuv_wiki(pixels_wiki, PIXELS_COUNT);
  66. -   finish = clock();
  67. -   printf("rgb2yuv_wiki: %.3f sec\n", (float) (finish - start) / CLOCKS_PER_SEC);
  68. -
  69. -   start = clock();
  70. -   rgb2yuv_novell_ch(pixels_novel_ch, PIXELS_COUNT);
  71. -   finish = clock();
  72. -   printf("rgb2yuv_novell_ch: %.3f sec\n", (float) (finish - start) / CLOCKS_PER_SEC);
  73. -
  74. -   start = clock();
  75. -   rgb2yuv_tables_init();
  76. -   rgb2yuv_tables(pixels_tables, PIXELS_COUNT);
  77. -   finish = clock();
  78. -   printf("rgb2yuv_tables: %.3f sec\n", (float) (finish - start) / CLOCKS_PER_SEC);
  79. -        
  80. -        start = clock();
  81. -        rgb2yuv_gstreamer(pixels_gstreamer, PIXELS_COUNT);
  82. -        finish = clock();
  83. -        printf("rgb2yuv_gstreamer: %.3f sec\n", (float) (finish - start) / CLOCKS_PER_SEC);
  84. -        
  85. -        start = clock();
  86. -        rgb2yuv_sse(pixels_see, PIXELS_COUNT);
  87. -        finish = clock();
  88. -        printf("rgb2yuv_sse: %.3f sec\n", (float) (finish - start) / CLOCKS_PER_SEC);
  89. -
  90. -
  91. -   for (i = 0; i < 100; i++)
  92. -   {
  93. -       printf("%d: %d %d %d %d %d\n", i, pixels_wiki[i], pixels_novel_ch[i], pixels_tables[i], pixels_gstreamer[i], pixels_see[i]);
  94. -   }
  95. -
  96. -   return 0;
  97. +static uint8_t pixels_see2[4 * PIXELS_COUNT];
  98. +
  99. +enum {KB = 1024, MB = KB * 1024, GB = MB * 1024};
  100. +#define Mbench(name, calls, data_len, call) ({\
  101. +  uint64_t i = calls;\
  102. +  double start = omp_get_wtime();\
  103. +  do {\
  104. +    call;\
  105. +  } while(--i);\
  106. +  double time = omp_get_wtime() - start;\
  107. +  fprintf(stderr, "%s: (%lu)%luKB: total %luGB: %fGB/s\n", name, (uint64_t)calls, (uint64_t)data_len/KB, (data_len * calls)/GB, ((double)(data_len * calls)/time)/GB);\
  108. +})
  109. +
  110. +int main(int argc, char * argv[]) {
  111. +  struct timespec now;
  112. +  clock_gettime(CLOCK_REALTIME, &now);
  113. +  srand(now.tv_nsec);
  114. +  // prepare input buffers
  115. +
  116. +  for(uint32_t i = 0; i < 4 * PIXELS_COUNT; i++) {
  117. +    uint8_t x = ((i % 4) * rand());
  118. +    pixels_wiki[i] = x;
  119. +    pixels_novel_ch[i] = x;
  120. +    pixels_tables[i] = x;
  121. +    pixels_gstreamer[i] = x;
  122. +    pixels_see[i] = x;
  123. +    pixels_see2[i] = x;
  124. +  }
  125. +
  126. +  // run conversions
  127. +  Mbench("rgb2yuv_wiki", (1024 * 1024), sizeof(pixels_wiki), rgb2yuv_wiki(pixels_wiki, PIXELS_COUNT));
  128. +  Mbench("rgb2yuv_novell_ch", (1024 * 1024), sizeof(pixels_novel_ch), rgb2yuv_novell_ch(pixels_novel_ch, PIXELS_COUNT));
  129. +  Mbench("rgb2yuv_tables", (1024 * 1024), sizeof(pixels_tables), rgb2yuv_tables(pixels_tables, PIXELS_COUNT));
  130. +  Mbench("rgb2yuv_gstreamer", (1024 * 1024), sizeof(pixels_gstreamer), rgb2yuv_gstreamer(pixels_gstreamer, PIXELS_COUNT));
  131. +  Mbench("rgb2yuv_sse", (1024 * 1024), sizeof(pixels_see), rgb2yuv_sse(pixels_see, PIXELS_COUNT));
  132. +  Mbench("rgb2yuv_sse2", (1024 * 1024), sizeof(pixels_see2), rgb2yuv_sse2(pixels_see2, PIXELS_COUNT));
  133. +
  134. +
  135. +//   for(uint32_t i = 0; i < 100; i++) {
  136. +//     printf("%d: %d %d %d %d %d\n", i, pixels_wiki[i], pixels_novel_ch[i], pixels_tables[i], pixels_gstreamer[i], pixels_see[i]);
  137. +//   }
  138. +
  139. +  return 0;
  140.  }
  141. diff --git a/src/rgb2yuv_sse2.c b/src/rgb2yuv_sse2.c
  142. new file mode 100644
  143. index 0000000..4dc53a8
  144. --- /dev/null
  145. +++ b/src/rgb2yuv_sse2.c
  146. @@ -0,0 +1,130 @@
  147. +/**
  148. + * @file rgb2yuv_sse2.c
  149. + *
  150. + * RGB to YUV conversion using SSE/SSE2/SSSE3 intrinsics
  151. + */
  152. +
  153. +#include "rgb2yuv.h"
  154. +#include <xmmintrin.h>
  155. +#include <emmintrin.h>
  156. +#include <tmmintrin.h>
  157. +
  158. +typedef struct {
  159. +  __m128i r, g, b;
  160. +} r8g8b8_t;
  161. +
  162. +typedef struct {
  163. +  __m128i y, u, v;
  164. +} y8u8v8_t;
  165. +
  166. +typedef struct {
  167. +  __m128i r_coef, g_coef, b_coef, x_coef;
  168. +} rgb_to_yuv_mat_v8_t;
  169. +
  170. +static inline rgb_to_yuv_mat_v8_t create_vecm(int32_t r_coef, int32_t g_coef, int32_t b_coef, int32_t x_coef) {
  171. +  return (rgb_to_yuv_mat_v8_t) {
  172. +    _mm_set1_epi16(r_coef),
  173. +    _mm_set1_epi16(g_coef),
  174. +    _mm_set1_epi16(b_coef),
  175. +    _mm_set1_epi16(x_coef)
  176. +  };
  177. +}
  178. +
  179. +static inline __m128i op(r8g8b8_t v, rgb_to_yuv_mat_v8_t m) {
  180. +  __m128i a = _mm_mullo_epi16(v.r, m.r_coef);
  181. +  __m128i b = _mm_mullo_epi16(v.g, m.g_coef);
  182. +  __m128i c = _mm_mullo_epi16(v.b, m.b_coef);
  183. +  __m128i ret = _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, m.x_coef));
  184. +  return _mm_srai_epi16(ret, 8);
  185. +}
  186. +
  187. +static inline __m128i r8g8b8_to_y8(r8g8b8_t v) {
  188. +  return op(v, create_vecm(47, 157, 16, 4096));
  189. +}
  190. +static inline __m128i r8g8b8_to_u8(r8g8b8_t v) {
  191. +  return op(v, create_vecm(-26, -87, 112, 32768));
  192. +}
  193. +static inline __m128i r8g8b8_to_v8(r8g8b8_t v) {
  194. +  return op(v, create_vecm(112, -102, -10, 32768));
  195. +}
  196. +
  197. +static inline y8u8v8_t r8g8b8_to_y8u8v8(r8g8b8_t v) {
  198. +  return (y8u8v8_t) {r8g8b8_to_y8(v), r8g8b8_to_u8(v), r8g8b8_to_v8(v)};
  199. +}
  200. +
  201. +// static inline r8g8b8_t p16x2_to_r8g8b8(__m128i a, __m128i b) {
  202. +//   __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
  203. +//   __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
  204. +//   __m128i r8 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a, 1), v_byte1), _mm_and_si128(_mm_slli_si128(b, 1), v_byte3));
  205. +//   __m128i g8 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a, 2), v_byte1), _mm_and_si128(b, v_byte3));
  206. +//   __m128i b8 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a, 3), v_byte1), _mm_and_si128(_mm_srli_si128(b, 1), v_byte3));
  207. +//   return (r8g8b8_t) {r8, g8, b8};
  208. +// }
  209. +
  210. +static inline r8g8b8_t p16x2_to_r8g8b8(__m128i a, __m128i b) {
  211. +  __m128i shuff_ra  = _mm_setr_epi8(1, 255, 255, 255, 5, 255, 255, 255, 9, 255, 255, 255, 13, 255, 255, 255);
  212. +  __m128i shuff_ga  = _mm_setr_epi8(2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255, 255);
  213. +  __m128i shuff_ba  = _mm_setr_epi8(3, 255, 255, 255, 7, 255, 255, 255, 11, 255, 255, 255, 15, 255, 255, 255);
  214. +  __m128i shuff_rb  = _mm_setr_epi8(255, 255, 1, 255, 255, 255, 5, 255, 255, 255, 9, 255, 255, 255, 13, 255);
  215. +  __m128i shuff_gb  = _mm_setr_epi8(255, 255, 2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255);
  216. +  __m128i shuff_bb  = _mm_setr_epi8(255, 255, 3, 255, 255, 255, 7, 255, 255, 255, 11, 255, 255, 255, 15, 255);
  217. +  __m128i r8 = _mm_or_si128(_mm_shuffle_epi8(a, shuff_ra), _mm_shuffle_epi8(b, shuff_rb));
  218. +  __m128i g8 = _mm_or_si128(_mm_shuffle_epi8(a, shuff_ga), _mm_shuffle_epi8(b, shuff_gb));
  219. +  __m128i b8 = _mm_or_si128(_mm_shuffle_epi8(a, shuff_ba), _mm_shuffle_epi8(b, shuff_bb));
  220. +  return (r8g8b8_t) {r8, g8, b8};
  221. +}
  222. +
  223. +static inline __m128i y8u8v8_to_p16a(y8u8v8_t v) {
  224. +  __m128i shuff_ya  = _mm_setr_epi8(255, 0, 255, 255, 255, 4, 255, 255, 255, 8, 255, 255, 255, 12, 255, 255);
  225. +  __m128i shuff_ua  = _mm_setr_epi8(255, 255, 0, 255, 255, 255, 4, 255, 255, 255, 8, 255, 255, 255, 12, 255);
  226. +  __m128i shuff_va  = _mm_setr_epi8(255, 255, 255, 0, 255, 255, 255, 4, 255, 255, 255, 8, 255, 255, 255, 12);
  227. +  return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v.y, shuff_ya), _mm_shuffle_epi8(v.u, shuff_ua)), _mm_shuffle_epi8(v.v, shuff_va));
  228. +}
  229. +
  230. +static inline __m128i y8u8v8_to_p16b(y8u8v8_t v) {
  231. +  __m128i shuff_yb  = _mm_setr_epi8(255, 2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255, 255);
  232. +  __m128i shuff_ub  = _mm_setr_epi8(255, 255, 2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14, 255);
  233. +  __m128i shuff_vb  = _mm_setr_epi8(255, 255, 255, 2, 255, 255, 255, 6, 255, 255, 255, 10, 255, 255, 255, 14);
  234. +  return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(v.y, shuff_yb), _mm_shuffle_epi8(v.u, shuff_ub)), _mm_shuffle_epi8(v.v, shuff_vb));
  235. +}
  236. +
  237. +void rgb2yuv_sse2(void * pixels, uint64_t len) {
  238. +  __m128i * it = pixels, * end = (__m128i *)((int32_t *)pixels + len);
  239. +  do {
  240. +    {
  241. +    __m128i a = *(it + 0), b = *(it + 1);
  242. +    y8u8v8_t v = r8g8b8_to_y8u8v8(p16x2_to_r8g8b8(a, b));
  243. +    a = y8u8v8_to_p16a(v), b = y8u8v8_to_p16b(v);
  244. +    *(it + 0) = a, *(it + 1) = b;
  245. +    it += 2;
  246. +    }
  247. +//     {
  248. +//     __m128i a = *(it + 0), b = *(it + 1);
  249. +//     y8u8v8_t v = r8g8b8_to_y8u8v8(p16x2_to_r8g8b8(a, b));
  250. +//     a = y8u8v8_to_p16a(v), b = y8u8v8_to_p16b(v);
  251. +//     *(it + 0) = a, *(it + 1) = b;
  252. +//     it += 2;
  253. +//     }
  254. +//     {
  255. +//     __m128i a = *(it + 0), b = *(it + 1);
  256. +//     y8u8v8_t v = r8g8b8_to_y8u8v8(p16x2_to_r8g8b8(a, b));
  257. +//     a = y8u8v8_to_p16a(v), b = y8u8v8_to_p16b(v);
  258. +//     *(it + 0) = a, *(it + 1) = b;
  259. +//     it += 2;
  260. +//     }
  261. +//     {
  262. +//     __m128i a = *(it + 0), b = *(it + 1);
  263. +//     y8u8v8_t v = r8g8b8_to_y8u8v8(p16x2_to_r8g8b8(a, b));
  264. +//     a = y8u8v8_to_p16a(v), b = y8u8v8_to_p16b(v);
  265. +//     *(it + 0) = a, *(it + 1) = b;
  266. +//     it += 2;
  267. +//     }
  268. +//     {
  269. +//     __m128i a = *(it + 0), b = *(it + 1);
  270. +//     y8u8v8_t v = r8g8b8_to_y8u8v8(p16x2_to_r8g8b8(a, b));
  271. +//     a = y8u8v8_to_p16a(v), b = y8u8v8_to_p16b(v);
  272. +//     *(it + 0) = a, *(it + 1) = b;
  273. +//     it += 2;
  274. +//     }
  275. +  } while(it < end);
  276. +}
  277. \ No newline at end of file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement