diff --git a/gst/videoconvert/videoconvert.c b/gst/videoconvert/videoconvert.c
index 380822d..9fe568e 100644
--- a/gst/videoconvert/videoconvert.c
+++ b/gst/videoconvert/videoconvert.c
@@ -27,6 +27,9 @@
 #include <glib.h>
 #include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
 
 #include "gstvideoconvertorc.h"
 
@@ -1195,6 +1198,176 @@ convert_Y444_AYUV (VideoConvert * convert, GstVideoFrame * dest,
 }
 
 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#include <tmmintrin.h>
+
+/*
+==== SSSE3 BGRA-to-YUV420 Converter ====
+
+Uses the same principle as the fallback converter, but works on 16-bit integers so each instruction handles 8 values at once.
+It processes the input image in blocks of 16x2 pixels and produces 16x2 Y, 8x1 U and 8x1 V values per block.
+
+The code keeps the channels interleaved to reduce the number of shuffles; for example, the red values are ordered [ r0 r4 r1 r5 r2 r6 r3 r7 ].
+Because of this interleaving, the averaging of 2x2 blocks uses 32-bit horizontal addition instead of 16-bit.
+The resulting sums are ordered [ sr0 sr2 sr1 sr3 sr4 sr6 sr5 sr7 ].
+
+If the width is not a multiple of 16, the remainder (the right edge of the image) is converted without SSSE3.
+
+This converter is about 4 times faster than the fallback converter.
+*/
+
+#define Convert3_ReadRGB(ptr1, ptr2, ca, cb, r, g, b) \
+   __m128i ca = _mm_loadu_si128((__m128i*) (ptr1)), cb = _mm_loadu_si128((__m128i*) (ptr2)); \
+   __m128i r = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 2), v_byte1), _mm_and_si128(               cb    , v_byte3)); \
+   __m128i g = _mm_or_si128(_mm_and_si128(_mm_srli_si128(ca, 1), v_byte1), _mm_and_si128(_mm_slli_si128(cb, 1), v_byte3)); \
+   __m128i b = _mm_or_si128(_mm_and_si128(               ca    , v_byte1), _mm_and_si128(_mm_slli_si128(cb, 2), v_byte3));
+#define Convert3_CalcY(r, g, b, y) \
+   __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(r, v_mat_yr), _mm_mullo_epi16(g, v_mat_yg)), _mm_add_epi16(_mm_mullo_epi16(b, v_mat_yb), v_offset_y));
+#define Convert3_WriteY(ptr, y1, y2) \
+   _mm_stream_si128((__m128i*) (ptr), _mm_or_si128(_mm_shuffle_epi8(y1, v_shuffle1), _mm_shuffle_epi8(y2, v_shuffle2)));
+
+void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
+//     assert(w % 2 == 0 && h % 2 == 0);
+//     assert((uintptr_t) out_data[0] % 16 == 0 && out_stride[0] % 16 == 0);
+//     assert((uintptr_t) out_data[1] % 16 == 0 && out_stride[1] % 16 == 0);
+//     assert((uintptr_t) out_data[2] % 16 == 0 && out_stride[2] % 16 == 0);
+
+   __m128i v_byte1     = _mm_set1_epi32(0x000000ff);
+   __m128i v_byte3     = _mm_set1_epi32(0x00ff0000);
+   __m128i v_mat_yr    = _mm_set1_epi16(66);
+   __m128i v_mat_yg    = _mm_set1_epi16(129);
+   __m128i v_mat_yb    = _mm_set1_epi16(25);
+   __m128i v_mat_ur    = _mm_set1_epi16(-38);
+   __m128i v_mat_ug    = _mm_set1_epi16(-74);
+   __m128i v_mat_ub_vr = _mm_set1_epi16(112);
+   __m128i v_mat_vg    = _mm_set1_epi16(-94);
+   __m128i v_mat_vb    = _mm_set1_epi16(-18);
+   __m128i v_offset_y  = _mm_set1_epi16(128 + (16 << 8));   // rounding (+128) plus the +16 luma offset, pre-shifted by 8
+   __m128i v_offset_uv = _mm_set1_epi16(128 + (128 << 8));  // rounding (+128) plus the +128 chroma offset, pre-shifted by 8
+   __m128i v_2         = _mm_set1_epi16(2);                 // rounding term for the 2x2 averages ((sum + 2) >> 2)
+   __m128i v_shuffle1  = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255);
+   __m128i v_shuffle2  = _mm_setr_epi8(255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 9, 13, 3, 7, 11, 15);
+   __m128i v_shuffle3  = _mm_setr_epi8(1, 5, 3, 7, 9, 13, 11, 15, 255, 255, 255, 255, 255, 255, 255, 255);
+   //__m128i v_shuffle4  = _mm_setr_epi8(255, 255, 255, 255, 255, 255, 255, 255, 1, 5, 3, 7, 9, 13, 11, 15);
+
+   const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;   // scalar offsets for the remainder loop; offset_uv is pre-scaled by 4 because it is added to the sum of four pixels
+
+   for(unsigned int j = 0; j < h / 2; ++j) {
+       const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) (j * 2));
+       const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * (int) (j * 2 + 1));
+       uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) (j * 2);
+       uint8_t *yuv_y2 = out_data[0] + out_stride[0] * (int) (j * 2 + 1);
+       uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
+       uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
+       for(unsigned int i = 0; i < w / 16; ++i) {
+           __m128i ra, ga, ba;
+           {
+               Convert3_ReadRGB(rgb1, rgb1 + 4, ca0, cb0, r0, g0, b0);
+               Convert3_ReadRGB(rgb1 + 8, rgb1 + 12, ca1, cb1, r1, g1, b1);
+               rgb1 += 16;
+               Convert3_CalcY(r0, g0, b0, y0);
+               Convert3_CalcY(r1, g1, b1, y1);
+               Convert3_WriteY(yuv_y1, y0, y1);
+               yuv_y1 += 16;
+               _mm_prefetch(rgb1 + 16, _MM_HINT_T0);
+               ra = _mm_hadd_epi32(r0, r1);
+               ga = _mm_hadd_epi32(g0, g1);
+               ba = _mm_hadd_epi32(b0, b1);
+           }
+           {
+               Convert3_ReadRGB(rgb2, rgb2 + 4, ca0, cb0, r0, g0, b0);
+               Convert3_ReadRGB(rgb2 + 8, rgb2 + 12, ca1, cb1, r1, g1, b1);
+               rgb2 += 16;
+               Convert3_CalcY(r0, g0, b0, y0);
+               Convert3_CalcY(r1, g1, b1, y1);
+               Convert3_WriteY(yuv_y2, y0, y1);
+               yuv_y2 += 16;
+               _mm_prefetch(rgb2 + 16, _MM_HINT_T0);
+               ra = _mm_add_epi16(ra, _mm_hadd_epi32(r0, r1));
+               ga = _mm_add_epi16(ga, _mm_hadd_epi32(g0, g1));
+               ba = _mm_add_epi16(ba, _mm_hadd_epi32(b0, b1));
+           }
+           {
+               ra = _mm_srli_epi16(_mm_add_epi16(ra, v_2), 2);
+               ga = _mm_srli_epi16(_mm_add_epi16(ga, v_2), 2);
+               ba = _mm_srli_epi16(_mm_add_epi16(ba, v_2), 2);
+               __m128i u = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ur), _mm_mullo_epi16(ga, v_mat_ug)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_ub_vr), v_offset_uv));
+               _mm_storel_epi64((__m128i*) yuv_u, _mm_shuffle_epi8(u, v_shuffle3));
+               yuv_u += 8;
+               __m128i v = _mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(ra, v_mat_ub_vr), _mm_mullo_epi16(ga, v_mat_vg)), _mm_add_epi16(_mm_mullo_epi16(ba, v_mat_vb), v_offset_uv));
+               _mm_storel_epi64((__m128i*) yuv_v, _mm_shuffle_epi8(v, v_shuffle3));
+               yuv_v += 8;
+           }
+       }
+       for(unsigned int i = 0; i < (w & 15) / 2; ++i) {
+           uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
+           rgb1 += 2; rgb2 += 2;
+           int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
+           int g[4] = {(int) ((c[0] >>  8) & 0xff), (int) ((c[1] >>  8) & 0xff), (int) ((c[2] >>  8) & 0xff), (int) ((c[3] >>  8) & 0xff)};
+           int b[4] = {(int) ((c[0]      ) & 0xff), (int) ((c[1]      ) & 0xff), (int) ((c[2]      ) & 0xff), (int) ((c[3]      ) & 0xff)};
+           yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
+           yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
+           yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
+           yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
+           yuv_y1 += 2; yuv_y2 += 2;
+           int sr = r[0] + r[1] + r[2] + r[3];
+           int sg = g[0] + g[1] + g[2] + g[3];
+           int sb = b[0] + b[1] + b[2] + b[3];
+           *yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
+           *yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
+           ++yuv_u; ++yuv_v;
+       }
+   }
+
+   _mm_sfence();   // order the non-temporal (streaming) Y stores before returning
+
+}
+
+void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
+//     assert(w % 2 == 0 && h % 2 == 0);
+
+   const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;
+
+   for(unsigned int j = 0; j < h / 2; ++j) {
+       const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
+       const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
+       uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
+       uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
+       uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
+       uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
+       for(unsigned int i = 0; i < w / 2; ++i) {
+           uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
+           rgb1 += 2; rgb2 += 2;
+           int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
+           int g[4] = {(int) ((c[0] >>  8) & 0xff), (int) ((c[1] >>  8) & 0xff), (int) ((c[2] >>  8) & 0xff), (int) ((c[3] >>  8) & 0xff)};
+           int b[4] = {(int) ((c[0]      ) & 0xff), (int) ((c[1]      ) & 0xff), (int) ((c[2]      ) & 0xff), (int) ((c[3]      ) & 0xff)};
+           yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
+           yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
+           yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
+           yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
+           yuv_y1 += 2; yuv_y2 += 2;
+           int sr = r[0] + r[1] + r[2] + r[3];
+           int sg = g[0] + g[1] + g[2] + g[3];
+           int sb = b[0] + b[1] + b[2] + b[3];
+           *yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
+           *yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
+           ++yuv_u; ++yuv_v;
+       }
+   }
+
+}
+
+static void convert_BGRx_I420(VideoConvert * convert, GstVideoFrame * dest, const GstVideoFrame * src) {
+  uint8_t * out_data[3] = {FRAME_GET_Y_LINE(dest, 0), FRAME_GET_U_LINE(dest, 0), FRAME_GET_V_LINE(dest, 0)};
+  int out_stride[3] = {FRAME_GET_Y_STRIDE(dest), FRAME_GET_U_STRIDE(dest), FRAME_GET_V_STRIDE(dest)};
+  uint8_t * in_data = FRAME_GET_LINE(src, 0);
+  int in_stride = FRAME_GET_STRIDE(src);
+  Convert_BGRA_YUV420_SSSE3(convert->width, convert->height, in_data, in_stride, out_data, out_stride);   // assumes SSSE3 and the 16-byte plane alignment noted in the commented-out asserts
+}
+
 static void
 convert_AYUV_ARGB (VideoConvert * convert, GstVideoFrame * dest,
     const GstVideoFrame * src)
@@ -1416,6 +1589,12 @@ static const VideoTransform transforms[] = {
       convert_Y444_Y42B},
 
 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
+  {GST_VIDEO_FORMAT_BGRx, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_I420,
+        GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, FALSE, 0, 0,
+      convert_BGRx_I420},
+//   {GST_VIDEO_FORMAT_BGRx, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_NV12,
+//         GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, FALSE, 0, 0,
+//       convert_BGRx_NV12},
   {GST_VIDEO_FORMAT_AYUV, GST_VIDEO_COLOR_MATRIX_UNKNOWN, GST_VIDEO_FORMAT_ARGB,
         GST_VIDEO_COLOR_MATRIX_UNKNOWN, TRUE, TRUE, TRUE, 0, 0,
       convert_AYUV_ARGB},
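
The coefficient sets above (66/129/25 for Y, -38/-74/112 for U, 112/-94/-18 for V, with +16 and +128 level offsets) are the usual 8-bit fixed-point approximation of a BT.601 studio-swing RGB-to-YCbCr conversion. As a sanity check of how offset_y and offset_uv fold the rounding term and the level offsets into a single add, a small standalone program (hypothetical test code, not part of the patch) can evaluate the same expressions:

#include <assert.h>
#include <stdio.h>

/* Scalar check of the fixed-point constants used by the converters (hypothetical
 * test code). offset_y folds the +128 rounding term and the +16 luma level into
 * one add before the >>8; offset_uv does the same for chroma, pre-scaled by 4
 * because the scalar path applies it to the sum of a 2x2 block and shifts by 10. */
int main(void) {
    const int offset_y = 128 + (16 << 8);
    const int offset_uv = (128 + (128 << 8)) << 2;

    /* Black and white should map to the studio-swing limits Y=16 and Y=235. */
    int y_black = (66 * 0 + 129 * 0 + 25 * 0 + offset_y) >> 8;
    int y_white = (66 * 255 + 129 * 255 + 25 * 255 + offset_y) >> 8;

    /* A neutral gray 2x2 block (sum of four equal pixels) should give U = V = 128. */
    int s = 4 * 100;
    int u_gray = (-38 * s + -74 * s + 112 * s + offset_uv) >> 10;
    int v_gray = (112 * s + -94 * s + -18 * s + offset_uv) >> 10;

    printf("Y(black)=%d Y(white)=%d U(gray)=%d V(gray)=%d\n",
           y_black, y_white, u_gray, v_gray);
    assert(y_black == 16 && y_white == 235 && u_gray == 128 && v_gray == 128);
    return 0;
}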
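
convert_BGRx_I420 calls the SSSE3 function unconditionally, so Convert_BGRA_YUV420_Fallback is compiled but never selected. A minimal runtime-dispatch sketch, assuming GCC's __builtin_cpu_supports (available since GCC 4.8) and with a made-up wrapper name, could choose a path based on the CPU and on the alignment requirements spelled out in the commented-out asserts:

#include <stdint.h>

void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data,
        int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data,
        int in_stride, uint8_t* const out_data[3], const int out_stride[3]);

/* Hypothetical wrapper: use the SSSE3 path only when the CPU supports it and the
 * output planes meet the 16-byte pointer/stride requirements of the streaming stores. */
void Convert_BGRA_YUV420_Auto(unsigned int w, unsigned int h, const uint8_t* in_data,
        int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
    int aligned = ((uintptr_t) out_data[0] % 16 == 0) && (out_stride[0] % 16 == 0)
               && ((uintptr_t) out_data[1] % 16 == 0) && (out_stride[1] % 16 == 0)
               && ((uintptr_t) out_data[2] % 16 == 0) && (out_stride[2] % 16 == 0);
    if (aligned && __builtin_cpu_supports("ssse3"))
        Convert_BGRA_YUV420_SSSE3(w, h, in_data, in_stride, out_data, out_stride);
    else
        Convert_BGRA_YUV420_Fallback(w, h, in_data, in_stride, out_data, out_stride);
}

With GCC the file containing the intrinsics generally also needs to be built with -mssse3 (or the function marked with __attribute__((target("ssse3")))), otherwise the SSSE3 intrinsics from tmmintrin.h will not compile.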
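
The two converters are not bit-identical: the SSSE3 path rounds each 2x2 sum to an average ((sum + 2) >> 2) before applying the chroma coefficients, while the fallback applies them to the raw 4-pixel sum and shifts by 10, so Y should match exactly but U and V can differ by one code value. A rough comparison harness (hypothetical; sizes and the RNG are chosen only for illustration, C11 aligned_alloc is assumed, and both converter functions are assumed to be linked in) would look like:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data,
        int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data,
        int in_stride, uint8_t* const out_data[3], const int out_stride[3]);

/* Largest absolute per-sample difference between two planes of the same size. */
static int max_plane_diff(const uint8_t *a, const uint8_t *b,
        unsigned int w, unsigned int h, int stride) {
    int worst = 0;
    for (unsigned int j = 0; j < h; ++j)
        for (unsigned int i = 0; i < w; ++i) {
            size_t idx = (size_t) j * (size_t) stride + i;
            int d = abs((int) a[idx] - (int) b[idx]);
            if (d > worst) worst = d;
        }
    return worst;
}

int main(void) {
    const unsigned int w = 320, h = 240;                              /* even, and w a multiple of 16 */
    const int in_stride = (int) w * 4;
    const int stride[3] = { (int) w, (int) (w / 2), (int) (w / 2) };  /* already multiples of 16 */

    uint8_t *in = malloc((size_t) in_stride * h);
    uint8_t *a[3], *b[3];
    for (int p = 0; p < 3; ++p) {
        size_t size = (size_t) stride[p] * (p == 0 ? h : h / 2);
        a[p] = aligned_alloc(16, size);                               /* streaming stores need 16-byte alignment */
        b[p] = aligned_alloc(16, size);
    }
    for (size_t i = 0; i < (size_t) in_stride * h; ++i)
        in[i] = (uint8_t) rand();

    Convert_BGRA_YUV420_SSSE3(w, h, in, in_stride, a, stride);
    Convert_BGRA_YUV420_Fallback(w, h, in, in_stride, b, stride);

    printf("max diff: Y=%d U=%d V=%d\n",
           max_plane_diff(a[0], b[0], w, h, stride[0]),
           max_plane_diff(a[1], b[1], w / 2, h / 2, stride[1]),
           max_plane_diff(a[2], b[2], w / 2, h / 2, stride[2]));
    return 0;
}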