Advertisement
Guest User

rgb2yuv with sse3

a guest
Jun 21st, 2014
365
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 3.54 KB | None | 0 0
  1. void
  2. videoconvert_convert_matrix8_sse2 (VideoConvert * convert, gpointer pixels)
  3. {
  4.     int i;
  5.     guint8 *p = pixels;
  6.  
  7.     __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
  8.     __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
  9.     __m128i v_mat_yr = _mm_set1_epi16(66);
  10.     __m128i v_mat_yg = _mm_set1_epi16(129);
  11.     __m128i v_mat_yb = _mm_set1_epi16(25);
  12.     __m128i v_mat_ur = _mm_set1_epi16(-38);
  13.     __m128i v_mat_ug = _mm_set1_epi16(-74);
  14.     __m128i v_mat_ub_vr = _mm_set1_epi16(112);
  15.     __m128i v_mat_vg = _mm_set1_epi16(-94);
  16.     __m128i v_mat_vb = _mm_set1_epi16(-18);
  17.  
  18.     __m128i cnst128 = _mm_set1_epi16(128);
  19.     __m128i cnst16  = _mm_set1_epi16(16);
  20.  
  21.     __m128i mask2   = _mm_set1_epi32(0x00ff00ff);
  22.  
  23.     __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128,   (char)128, (char)128, 8, (char)128,
  24.                     (char)128, (char)128, 4, (char)128,   (char)128, (char)128, 0, (char)128);
  25.  
  26.     __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14,  (char)128,  (char)128, (char)128, 10, (char)128,
  27.                     (char)128, (char)128, 6, (char)128,   (char)128, (char)128, 2, (char)128);
  28.  
  29.     __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128,   (char)128, 8, (char)128, (char)128,
  30.                     (char)128, 4, (char)128, (char)128,   (char)128, 0, (char)128, (char)128);
  31.  
  32.     __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128,   (char)128, 10, (char)128, (char)128,
  33.                     (char)128, 6, (char)128, (char)128,   (char)128, 2, (char)128, (char)128);
  34.  
  35.     __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128,   8, (char)128, (char)128, (char)128,
  36.                     4, (char)128, (char)128, (char)128,   0, (char)128, (char)128, (char)128);
  37.  
  38.     __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128,   10, (char)128, (char)128, (char)128,
  39.                     6, (char)128, (char)128, (char)128,   2, (char)128, (char)128, (char)128);
  40.  
  41.     /*#pragma omp parallel for*/
  42.     for (i=0; i<convert->width / 8; i++) {
  43.         __m128i a1, a2, r, g, b, y, u, v, res;
  44.  
  45.         a1 = _mm_loadu_si128((__m128i *)&p[i*32]);
  46.         a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);
  47.  
  48.         r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1), v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));
  49.         g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2), v_byte1), _mm_and_si128(a2, v_byte3));
  50.         b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3), v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));
  51.  
  52.  
  53.         y = _mm_add_epi16(
  54.             _mm_add_epi16(
  55.                 _mm_mullo_epi16(r, v_mat_yr),
  56.                 _mm_mullo_epi16(g, v_mat_yg)),
  57.             _mm_add_epi16(
  58.                 _mm_mullo_epi16(b, v_mat_yb),
  59.                 cnst128));
  60.  
  61.         y = _mm_and_si128(
  62.             _mm_add_epi16(
  63.                 _mm_srai_epi16(y, 8),
  64.                 cnst16),
  65.             mask2);
  66.  
  67.         u = _mm_add_epi16(
  68.             _mm_add_epi16(
  69.                 _mm_mullo_epi16(r, v_mat_ur),
  70.                 _mm_mullo_epi16(g, v_mat_ug)),
  71.             _mm_add_epi16(
  72.                 _mm_mullo_epi16(b, v_mat_ub_vr),
  73.                 cnst128));
  74.  
  75.         u = _mm_and_si128(
  76.             _mm_add_epi16(
  77.                 _mm_srai_epi16(u, 8),
  78.                 cnst128),
  79.             mask2);
  80.  
  81.         v = _mm_add_epi16(
  82.             _mm_add_epi16(
  83.                 _mm_mullo_epi16(r, v_mat_ub_vr),
  84.                 _mm_mullo_epi16(g, v_mat_vg)),
  85.             _mm_add_epi16(
  86.                 _mm_mullo_epi16(b, v_mat_vb),
  87.                 cnst128));
  88.  
  89.         v = _mm_and_si128(
  90.             _mm_add_epi16(
  91.                 _mm_srai_epi16(v, 8),
  92.                 cnst128),
  93.             mask2);
  94.  
  95.  
  96.         res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1), _mm_shuffle_epi8(u, mask_u1));
  97.         res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));
  98.  
  99.         _mm_storeu_si128((__m128i *)&p[i*32], res);
  100.  
  101.         res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2), _mm_shuffle_epi8(u, mask_u2));
  102.         res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));
  103.  
  104.         _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);
  105.     }
  106. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement