Advertisement
Guest User

rgb2yuv with SSSE3

a guest
Jun 21st, 2014
344
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 3.55 KB | None | 0 0
  1. void
  2. videoconvert_convert_matrix8 (VideoConvert * convert, gpointer pixels)
  3. {
  4.     int i;
  5.     guint8 *p = pixels;
  6.  
  7.     __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
  8.     __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
  9.     __m128i v_mat_yr = _mm_set1_epi16(47);
  10.     __m128i v_mat_yg = _mm_set1_epi16(157);
  11.     __m128i v_mat_yb = _mm_set1_epi16(16);
  12.     __m128i v_mat_ur = _mm_set1_epi16(-26);
  13.     __m128i v_mat_ug = _mm_set1_epi16(-87);
  14.     __m128i v_mat_ub_vr = _mm_set1_epi16(112);
  15.     __m128i v_mat_vg = _mm_set1_epi16(-102);
  16.     __m128i v_mat_vb = _mm_set1_epi16(-10);
  17.  
  18.     __m128i cnst128 = _mm_set1_epi16(128);
  19.     __m128i cnst4096  = _mm_set1_epi16(4096);
  20.     __m128i cnst32768 = _mm_set1_epi16((short int)32768);
  21.  
  22.     __m128i mask2   = _mm_set1_epi32(0x00ff00ff);
  23.  
  24.     __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128,   (char)128, (char)128, 8, (char)128,
  25.                     (char)128, (char)128, 4, (char)128,   (char)128, (char)128, 0, (char)128);
  26.  
  27.     __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,  (char)128,  (char)128, (char)128, 10, (char)128,
  28.                     (char)128, (char)128, 6, (char)128,   (char)128, (char)128, 2, (char)128);
  29.  
  30.     __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128,   (char)128, 8, (char)128, (char)128,
  31.                     (char)128, 4, (char)128, (char)128,   (char)128, 0, (char)128, (char)128);
  32.  
  33.     __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128,   (char)128, 10, (char)128, (char)128,
  34.                     (char)128, 6, (char)128, (char)128,   (char)128, 2, (char)128, (char)128);
  35.  
  36.     __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128,   8, (char)128, (char)128, (char)128,
  37.                     4, (char)128, (char)128, (char)128,   0, (char)128, (char)128, (char)128);
  38.  
  39.     __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128,   10, (char)128, (char)128, (char)128,
  40.                     6, (char)128, (char)128, (char)128,   2, (char)128, (char)128, (char)128);
  41.  
  42.     #pragma omp parallel for
  43.     for (i=0; i<convert->width / 8; i++) {
  44.         __m128i a1, a2, r, g, b, y, u, v, res;
  45.  
  46.         a1 = _mm_loadu_si128((__m128i *)&p[i*32]);
  47.         a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);
  48.  
  49.         r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1), v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));
  50.         g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2), v_byte1), _mm_and_si128(a2, v_byte3));
  51.         b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3), v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));
  52.  
  53.  
  54.         y = _mm_add_epi16(
  55.             _mm_add_epi16(
  56.                 _mm_mullo_epi16(r, v_mat_yr),
  57.                 _mm_mullo_epi16(g, v_mat_yg)),
  58.             _mm_add_epi16(
  59.                 _mm_mullo_epi16(b, v_mat_yb),
  60.                 cnst4096));
  61.  
  62.         y = _mm_and_si128(_mm_srai_epi16(y, 8), mask2);
  63.  
  64.         u = _mm_add_epi16(
  65.             _mm_add_epi16(
  66.                 _mm_mullo_epi16(r, v_mat_ur),
  67.                 _mm_mullo_epi16(g, v_mat_ug)),
  68.             _mm_add_epi16(
  69.                 _mm_mullo_epi16(b, v_mat_ub_vr),
  70.                 cnst32768));
  71.  
  72.         u = _mm_and_si128(
  73.             _mm_add_epi16(
  74.                 _mm_srai_epi16(u, 8),
  75.                 cnst128),
  76.             mask2);
  77.  
  78.         v = _mm_add_epi16(
  79.             _mm_add_epi16(
  80.                 _mm_mullo_epi16(r, v_mat_ub_vr),
  81.                 _mm_mullo_epi16(g, v_mat_vg)),
  82.             _mm_add_epi16(
  83.                 _mm_mullo_epi16(b, v_mat_vb),
  84.                 cnst32768));
  85.  
  86.         v = _mm_and_si128(
  87.             _mm_add_epi16(
  88.                 _mm_srai_epi16(v, 8),
  89.                 cnst128),
  90.             mask2);
  91.  
  92.  
  93.         res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1), _mm_shuffle_epi8(u, mask_u1));
  94.         res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));
  95.  
  96.         _mm_storeu_si128((__m128i *)&p[i*32], res);
  97.  
  98.         res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2), _mm_shuffle_epi8(u, mask_u2));
  99.         res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));
  100.  
  101.         _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);
  102.     }
  103. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement