Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- void ff_hevc_put_hevc_qpel_v4_8_sse (
- int16_t *dst, ptrdiff_t dststride,
- uint8_t *_src, ptrdiff_t _srcstride,
- int width, int height, int mx, int my) {
- int x, y;
- int shift = 8 - 8;
- const int8_t *filter_v = ff_hevc_qpel_filters[my - 1];
- __m128i x1, x2, x3, x4, r1, r2, r3, r4, c1, c2, c3, c4, t1, t2;
- const __m128i c0 = _mm_setzero_si128();
- uint8_t *src = (uint8_t*) _src;
- ptrdiff_t srcstride = _srcstride;
- c1 = _mm_unpacklo_epi8(_mm_set1_epi8(filter_v[0]), _mm_set1_epi8(filter_v[1]));
- c2 = _mm_unpacklo_epi8(_mm_set1_epi8(filter_v[2]), _mm_set1_epi8(filter_v[3]));
- c3 = _mm_unpacklo_epi8(_mm_set1_epi8(filter_v[4]), _mm_set1_epi8(filter_v[5]));
- c4 = _mm_unpacklo_epi8(_mm_set1_epi8(filter_v[6]), _mm_set1_epi8(filter_v[7]));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 4) {
- x1 = _mm_loadl_epi64((__m128i *) &src[x - 3 * srcstride]);
- x2 = _mm_loadl_epi64((__m128i *) &src[x - 2 * srcstride]);
- x3 = _mm_loadl_epi64((__m128i *) &src[x - 1 * srcstride]);
- x4 = _mm_loadl_epi64((__m128i *) &src[x ]);
- x1 = _mm_unpacklo_epi8(x1, x2);
- x2 = _mm_unpacklo_epi8(x3, x4);
- x1 = _mm_maddubs_epi16(x1, c1);
- x2 = _mm_maddubs_epi16(x2, c2);
- r1 = _mm_add_epi16(x1, x2);
- x1 = _mm_loadl_epi64((__m128i *) &src[x + srcstride]);
- x2 = _mm_loadl_epi64((__m128i *) &src[x + 2 * srcstride]);
- x3 = _mm_loadl_epi64((__m128i *) &src[x + 3 * srcstride]);
- x4 = _mm_loadl_epi64((__m128i *) &src[x + 4 * srcstride]);
- x1 = _mm_unpacklo_epi8(x1, x2);
- x2 = _mm_unpacklo_epi8(x3, x4);
- x1 = _mm_maddubs_epi16(x1, c3);
- x2 = _mm_maddubs_epi16(x2, c4);
- r3 = _mm_add_epi16(x1, x2);
- r1 = _mm_add_epi16(r1, r3);
- _mm_storel_epi64((__m128i *) &dst[x], r1);
- }
- src += srcstride;
- dst += dststride;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement