Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff --git a/gst/videoconvert/videoconvert.c b/gst/videoconvert/videoconvert.c
- index 380822d..9fe568e 100644
- --- a/gst/videoconvert/videoconvert.c
- +++ b/gst/videoconvert/videoconvert.c
- @@ -27,6 +27,9 @@
- #include <glib.h>
- #include <string.h>
- #include <math.h>
- +#include <stdio.h>
- +#include <stdlib.h>
- +#include <stdint.h>
- #include "gstvideoconvertorc.h"
- @@ -1195,6 +1198,176 @@ convert_Y444_AYUV (VideoConvert * convert, GstVideoFrame * dest,
- }
- #if G_BYTE_ORDER == G_LITTLE_ENDIAN
- +
- +#include <xmmintrin.h>
- +#include <emmintrin.h>
- +#include <pmmintrin.h>
- +#include <tmmintrin.h>
- +
- +/*
- +==== SSSE3 BGRA-to-YUV420 Converter ====
- +
- +Uses the same principle as the fallback converter, but uses 16-bit integers so it can do 8 operations at once.
- +It processes blocks of 16x2 pixels of the input image and produces 16x2 Y, 8x1 U and 8x1 V values.
- +
- +The code uses interleaving to reduce the number of shuffles. So for example the order for red is [ r0 r4 r1 r5 r2 r6 r3 r7 ].
- +For the averaging of 2x2 blocks, it uses 32-bit horizontal addition instead of 16-bit because of this interleaving.
- +The order of the final result is [ sr0 sr2 sr1 sr3 sr4 sr6 sr5 sr7 ].
- +
- +If the width is not a multiple of 16, the remainder (right edge of the image) is converted without SSSE3.
- +
- +This converter is about 4 times faster than the fallback converter.
- +*/
- +
- +#define Convert3_ReadRGB(ptr1, ptr2, ca, cb, r, g, b) \
- + __m128i ca = _mm_loadu_si128((__m128i*) (ptr1)), cb = _mm_loadu_si128((__m128i*) (ptr2)); \
- + __m128i r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 2), v_byte1), _mm_and_si128( cb , v_byte3)); \
- + __m128i g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 1), v_byte1), _mm_and_si128(_mm_slli_si128(cb, 1), v_byte3)); \
- + __m128i b = _mm_or_si128(_mm_and_si128( ca , v_byte1), _mm_and_si128(_mm_slli_si128(cb, 2), v_byte3));
- +#define Convert3_CalcY(r, g, b, y) \
- + __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_yr), _mm_mullo_epi16(g, v_mat_yg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_yb), v_offset_y));
- +#define Convert3_WriteY(ptr, y1, y2) \
- + _mm_stream_si128((__m128i*) (ptr), _mm_or_si128(_mm_shuffle_epi8(y1, v_shuffle1), _mm_shuffle_epi8(y2, v_shuffle2)));
- +
- +void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
- +// assert(w % 2 == 0 && h % 2 == 0);
- +// assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0);
- +// assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0);
- +// assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0);
- +
- + __m128i v_byte1 = _mm_set1_epi32(0x000000ff);
- + __m128i v_byte3 = _mm_set1_epi32(0x00ff0000);
- + __m128i v_mat_yr = _mm_set1_epi16(66);
- + __m128i v_mat_yg = _mm_set1_epi16(129);
- + __m128i v_mat_yb = _mm_set1_epi16(25);
- + __m128i v_mat_ur = _mm_set1_epi16(-38);
- + __m128i v_mat_ug = _mm_set1_epi16(-74);
- + __m128i v_mat_ub_vr = _mm_set1_epi16(112);
- + __m128i v_mat_vg = _mm_set1_epi16(-94);
- + __m128i v_mat_vb = _mm_set1_epi16(-18);
- + __m128i v_offset_y = _mm_set1_epi16(128 + (16 << 8));
- + __m128i v_offset_uv = _mm_set1_epi16(128 + (128 << 8));
- + __m128i v_2 = _mm_set1_epi16(2);
- + __m128i v_shuffle1 = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255);
- + __m128i v_shuffle2 = _mm_setr_epi8(255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 9, 13, 3, 7, 11, 15);
- + __m128i v_shuffle3 = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255);
- + //__m128i v_shuffle4 = _mm_setr_epi8(255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 3, 7, 9, 13, 11, 15);
- +
- + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;
- +
- + for(unsigned int j = 0; j < h / 2; ++j) {
- + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2));
- + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1));
- + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2);
- + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1);
- + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
- + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
- + for(unsigned int i = 0; i < w / 16; ++i) {
- + __m128i ra, ga, ba;
- + {
- + Convert3_ReadRGB(rgb1, rgb1 + 4, ca0, cb0, r0, g0, b0);
- + Convert3_ReadRGB(rgb1 + 8, rgb1 + 12, ca1, cb1, r1, g1, b1);
- + rgb1 += 16;
- + Convert3_CalcY(r0, g0, b0, y0);
- + Convert3_CalcY(r1, g1, b1, y1);
- + Convert3_WriteY(yuv_y1, y0, y1);
- + yuv_y1 += 16;
- + _mm_prefetch(rgb1 + 16, _MM_HINT_T0);
- + ra = _mm_hadd_epi32(r0, r1);
- + ga = _mm_hadd_epi32(g0, g1);
- + ba = _mm_hadd_epi32(b0, b1);
- + }
- + {
- + Convert3_ReadRGB(rgb2, rgb2 + 4, ca0, cb0, r0, g0, b0);
- + Convert3_ReadRGB(rgb2 + 8, rgb2 + 12, ca1, cb1, r1, g1, b1);
- + rgb2 += 16;
- + Convert3_CalcY(r0, g0, b0, y0);
- + Convert3_CalcY(r1, g1, b1, y1);
- + Convert3_WriteY(yuv_y2, y0, y1);
- + yuv_y2 += 16;
- + _mm_prefetch(rgb2 + 16, _MM_HINT_T0);
- + ra = _mm_add_epi16(ra, _mm_hadd_epi32(r0, r1));
- + ga = _mm_add_epi16(ga, _mm_hadd_epi32(g0, g1));
- + ba = _mm_add_epi16(ba, _mm_hadd_epi32(b0, b1));
- + }
- + {
- + ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2);
- + ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2);
- + ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2);
- + __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv));
- + _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3));
- + yuv_u += 8;
- + __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv));
- + _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3));
- + yuv_v += 8;
- + }
- + }
- + for(unsigned int i = 0; i < (w & 15) / 2; ++i) {
- + uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
- + rgb1 += 2; rgb2 += 2;
- + int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
- + int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
- + int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
- + yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
- + yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
- + yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
- + yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
- + yuv_y1 += 2; yuv_y2 += 2;
- + int sr = r[0] + r[1] + r[2] + r[3];
- + int sg = g[0] + g[1] + g[2] + g[3];
- + int sb = b[0] + b[1] + b[2] + b[3];
- + *yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
- + *yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
- + ++yuv_u; ++yuv_v;
- + }
- + }
- +
- + _mm_sfence();
- +
- +}
- +
- +void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
- +// assert(w % 2 == 0 && h % 2 == 0);
- +
- + const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;
- +
- + for(unsigned int j = 0; j < h / 2; ++j) {
- + const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
- + const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
- + uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
- + uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
- + uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
- + uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
- + for(unsigned int i = 0; i < w / 2; ++i) {
- + uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
- + rgb1 += 2; rgb2 += 2;
- + int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
- + int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
- + int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
- + yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
- + yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
- + yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
- + yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
- + yuv_y1 += 2; yuv_y2 += 2;
- + int sr = r[0] + r[1] + r[2] + r[3];
- + int sg = g[0] + g[1] + g[2] + g[3];
- + int sb = b[0] + b[1] + b[2] + b[3];
- + *yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
- + *yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
- + ++yuv_u; ++yuv_v;
- + }
- + }
- +
- +}
- +
- +static void convert_BGRx_I420(VideoConvert * convert, GstVideoFrame * dest, const GstVideoFrame * src) {
- + uint8_t * out_data[3] = {FRAME_GET_Y_LINE(dest, 0), FRAME_GET_U_LINE(dest, 0), FRAME_GET_V_LINE(dest, 0)};
- + int out_stride[3] = {FRAME_GET_Y_STRIDE(dest), FRAME_GET_U_STRIDE(dest), FRAME_GET_V_STRIDE(dest)};
- + uint8_t * in_data = FRAME_GET_LINE(src, 0);
- + int in_stride = FRAME_GET_STRIDE(src);
- + Convert_BGRA_YUV420_SSSE3(convert->width, convert->height, in_data, in_stride, out_data, out_stride);
- +}
- +
- static void
- convert_AYUV_ARGB (VideoConvert * convert, GstVideoFrame * dest,
- const GstVideoFrame * src)
- @@ -1416,6 +1589,12 @@ static const VideoTransform transforms[] = {
- convert_Y444_Y42B},
- #if G_BYTE_ORDER == G_LITTLE_ENDIAN
- + {GST_VIDEO_FORMAT_BGRx, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_I420,
- + GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, FALSE, 0, 0,
- + convert_BGRx_I420},
- +// {GST_VIDEO_FORMAT_BGRx, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_NV12,
- +// GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, FALSE, 0, 0,
- +// convert_BGRx_NV12},
- {GST_VIDEO_FORMAT_AYUV, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_ARGB,
- GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, TRUE, 0, 0,
- convert_AYUV_ARGB},
Advertisement
Add Comment
Please, Sign In to add comment