Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- void
- videoconvert_convert_matrix8_sse2 (VideoConvert * convert, gpointer pixels)
- {
- int i;
- guint8 *p = pixels;
- __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
- __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
- __m128i v_mat_yr = _mm_set1_epi16(66);
- __m128i v_mat_yg = _mm_set1_epi16(129);
- __m128i v_mat_yb = _mm_set1_epi16(25);
- __m128i v_mat_ur = _mm_set1_epi16(-38);
- __m128i v_mat_ug = _mm_set1_epi16(-74);
- __m128i v_mat_ub_vr = _mm_set1_epi16(112);
- __m128i v_mat_vg = _mm_set1_epi16(-94);
- __m128i v_mat_vb = _mm_set1_epi16(-18);
- __m128i cnst128 = _mm_set1_epi16(128);
- __m128i cnst16 = _mm_set1_epi16(16);
- __m128i mask2 = _mm_set1_epi32(0x00ff00ff);
- __m128i mask_y1 = _mm_set_epi8((char)128, (char)128, 12, (char)128, (char)128, (char)128, 8, (char)128,
- (char)128, (char)128, 4, (char)128, (char)128, (char)128, 0, (char)128);
- __m128i mask_y2 = _mm_set_epi8((char)128, (char)128, 14, (char)128, (char)128, (char)128, 10, (char)128,
- (char)128, (char)128, 6, (char)128, (char)128, (char)128, 2, (char)128);
- __m128i mask_u1 = _mm_set_epi8((char)128, 12, (char)128, (char)128, (char)128, 8, (char)128, (char)128,
- (char)128, 4, (char)128, (char)128, (char)128, 0, (char)128, (char)128);
- __m128i mask_u2 = _mm_set_epi8((char)128, 14, (char)128, (char)128, (char)128, 10, (char)128, (char)128,
- (char)128, 6, (char)128, (char)128, (char)128, 2, (char)128, (char)128);
- __m128i mask_v1 = _mm_set_epi8(12, (char)128, (char)128, (char)128, 8, (char)128, (char)128, (char)128,
- 4, (char)128, (char)128, (char)128, 0, (char)128, (char)128, (char)128);
- __m128i mask_v2 = _mm_set_epi8(14, (char)128, (char)128, (char)128, 10, (char)128, (char)128, (char)128,
- 6, (char)128, (char)128, (char)128, 2, (char)128, (char)128, (char)128);
- /*#pragma omp parallel for*/
- for (i=0; i<convert->width / 8; i++) {
- __m128i a1, a2, r, g, b, y, u, v, res;
- a1 = _mm_loadu_si128((__m128i *)&p[i*32]);
- a2 = _mm_loadu_si128((__m128i *)&p[i*32 + 16]);
- r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 1), v_byte1), _mm_and_si128(_mm_slli_si128(a2, 1), v_byte3));
- g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 2), v_byte1), _mm_and_si128(a2, v_byte3));
- b = _mm_or_si128(_mm_and_si128(_mm_srli_si128(a1, 3), v_byte1), _mm_and_si128(_mm_srli_si128(a2, 1), v_byte3));
- y = _mm_add_epi16(
- _mm_add_epi16(
- _mm_mullo_epi16(r, v_mat_yr),
- _mm_mullo_epi16(g, v_mat_yg)),
- _mm_add_epi16(
- _mm_mullo_epi16(b, v_mat_yb),
- cnst128));
- y = _mm_and_si128(
- _mm_add_epi16(
- _mm_srai_epi16(y, 8),
- cnst16),
- mask2);
- u = _mm_add_epi16(
- _mm_add_epi16(
- _mm_mullo_epi16(r, v_mat_ur),
- _mm_mullo_epi16(g, v_mat_ug)),
- _mm_add_epi16(
- _mm_mullo_epi16(b, v_mat_ub_vr),
- cnst128));
- u = _mm_and_si128(
- _mm_add_epi16(
- _mm_srai_epi16(u, 8),
- cnst128),
- mask2);
- v = _mm_add_epi16(
- _mm_add_epi16(
- _mm_mullo_epi16(r, v_mat_ub_vr),
- _mm_mullo_epi16(g, v_mat_vg)),
- _mm_add_epi16(
- _mm_mullo_epi16(b, v_mat_vb),
- cnst128));
- v = _mm_and_si128(
- _mm_add_epi16(
- _mm_srai_epi16(v, 8),
- cnst128),
- mask2);
- res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y1), _mm_shuffle_epi8(u, mask_u1));
- res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v1));
- _mm_storeu_si128((__m128i *)&p[i*32], res);
- res = _mm_or_si128(_mm_shuffle_epi8(y, mask_y2), _mm_shuffle_epi8(u, mask_u2));
- res = _mm_or_si128(res, _mm_shuffle_epi8(v, mask_v2));
- _mm_storeu_si128((__m128i *)&p[i*32 + 16], res);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement