Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- From 5a463b2ff722915b2f27a8aeb4d1eaaa49de28f3 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Tue, 13 Apr 2010 01:08:29 -0700
- Subject: [PATCH 1/6] Add CP128/M128 macros using SSE, fix some aliasing
- Significantly improve the speed of cache_load and cache_save functions.
- Also fix a ton of pessimization in cache_save and cache_load due to aliasing.
- ---
- common/common.h | 5 +
- common/macroblock.c | 203 +++++++++++++++++++++++++++------------------------
- common/x86/util.h | 8 ++
- 3 files changed, 120 insertions(+), 96 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index b8c6dfd..38e9b74 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -88,12 +88,17 @@ do {\
- typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS x264_union16_t;
- typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS x264_union32_t;
- typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
- +typedef struct { uint64_t i[2]; } x264_uint128_t;
- +typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
- #define M16(src) (((x264_union16_t*)(src))->i)
- #define M32(src) (((x264_union32_t*)(src))->i)
- #define M64(src) (((x264_union64_t*)(src))->i)
- +#define M128(src) (((x264_union128_t*)(src))->i)
- +#define M128_CONST(x) ((x264_uint128_t){{x,x}})
- #define CP16(dst,src) M16(dst) = M16(src)
- #define CP32(dst,src) M32(dst) = M32(src)
- #define CP64(dst,src) M64(dst) = M64(src)
- +#define CP128(dst,src) M128(dst) = M128(src)
- #include "x264.h"
- #include "bs.h"
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 0b9b903..fb4c1a5 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -1026,19 +1026,23 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- int left = h->mb.i_mb_left_xy;
- int top = h->mb.i_mb_top_xy;
- + /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
- + /* By only dereferencing them once, we avoid this issue. */
- + int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
- + uint8_t (*nnz)[24] = h->mb.non_zero_count;
- +
- /* load cache */
- if( h->mb.i_neighbour & MB_TOP )
- {
- h->mb.cache.i_cbp_top = h->mb.cbp[top];
- -
- /* load intra4x4 */
- - CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[top][0] );
- + CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
- /* load non_zero_count */
- - CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[top][12] );
- + CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
- /* shift because x264_scan8[16] is misaligned */
- - M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[top][18] ) << 8;
- - M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[top][22] ) << 8;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
- + M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
- }
- else
- {
- @@ -1058,22 +1062,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- h->mb.cache.i_cbp_left = h->mb.cbp[left];
- /* load intra4x4 */
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[left][4];
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[left][5];
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = h->mb.intra4x4_pred_mode[left][6];
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = h->mb.intra4x4_pred_mode[left][3];
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left][5];
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left][6];
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left][3];
- /* load non_zero_count */
- - h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = h->mb.non_zero_count[left][3];
- - h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = h->mb.non_zero_count[left][7];
- - h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = h->mb.non_zero_count[left][11];
- - h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.non_zero_count[left][15];
- + h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left][3];
- + h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left][7];
- + h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left][11];
- + h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left][15];
- - h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = h->mb.non_zero_count[left][16+1];
- - h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = h->mb.non_zero_count[left][16+3];
- + h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left][16+1];
- + h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left][16+3];
- - h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = h->mb.non_zero_count[left][16+4+1];
- - h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = h->mb.non_zero_count[left][16+4+3];
- + h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
- + h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
- }
- else
- {
- @@ -1146,11 +1150,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
- {
- + int16_t (*mv)[2] = h->mb.mv[l];
- + int8_t *ref = h->mb.ref[l];
- +
- int i8 = x264_scan8[0] - 1 - 1*8;
- if( h->mb.i_neighbour & MB_TOPLEFT )
- {
- - h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 - 1];
- - CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 - 1] );
- + h->mb.cache.ref[l][i8] = ref[top_8x8 - 1];
- + CP32( h->mb.cache.mv[l][i8], mv[top_4x4 - 1] );
- }
- else
- {
- @@ -1162,24 +1169,22 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- if( h->mb.i_neighbour & MB_TOP )
- {
- h->mb.cache.ref[l][i8+0] =
- - h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
- + h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
- h->mb.cache.ref[l][i8+2] =
- - h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
- - CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
- - CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
- + h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
- + CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
- }
- else
- {
- - M64( h->mb.cache.mv[l][i8+0] ) = 0;
- - M64( h->mb.cache.mv[l][i8+2] ) = 0;
- + M128( h->mb.cache.mv[l][i8] ) = M128_CONST( 0 );
- M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
- }
- i8 = x264_scan8[0] + 4 - 1*8;
- if( h->mb.i_neighbour & MB_TOPRIGHT )
- {
- - h->mb.cache.ref[l][i8] = h->mb.ref[l][top_8x8 + 2];
- - CP32( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4 + 4] );
- + h->mb.cache.ref[l][i8] = ref[top_8x8 + 2];
- + CP32( h->mb.cache.mv[l][i8], mv[top_4x4 + 4] );
- }
- else
- h->mb.cache.ref[l][i8] = -2;
- @@ -1190,14 +1195,14 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- const int ir = h->mb.i_b8_xy - 1;
- const int iv = h->mb.i_b4_xy - 1;
- h->mb.cache.ref[l][i8+0*8] =
- - h->mb.cache.ref[l][i8+1*8] = h->mb.ref[l][ir + 0*s8x8];
- + h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8];
- h->mb.cache.ref[l][i8+2*8] =
- - h->mb.cache.ref[l][i8+3*8] = h->mb.ref[l][ir + 1*s8x8];
- + h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8];
- - CP32( h->mb.cache.mv[l][i8+0*8], h->mb.mv[l][iv + 0*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+1*8], h->mb.mv[l][iv + 1*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+2*8], h->mb.mv[l][iv + 2*s4x4] );
- - CP32( h->mb.cache.mv[l][i8+3*8], h->mb.mv[l][iv + 3*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] );
- + CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] );
- }
- else
- {
- @@ -1210,17 +1215,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- if( h->param.b_cabac )
- {
- + uint8_t (*mvd)[8][2] = h->mb.mvd[l];
- if( h->mb.i_neighbour & MB_TOP )
- - CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], h->mb.mvd[l][top][0] );
- + CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] );
- else
- M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0;
- if( h->mb.i_neighbour & MB_LEFT )
- {
- - CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], h->mb.mvd[l][left][4] );
- - CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], h->mb.mvd[l][left][5] );
- - CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], h->mb.mvd[l][left][6] );
- - CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], h->mb.mvd[l][left][3] );
- + CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left][4] );
- + CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left][5] );
- + CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left][6] );
- + CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left][3] );
- }
- else
- for( int i = 0; i < 4; i++ )
- @@ -1285,10 +1291,10 @@ void x264_macroblock_cache_save( x264_t *h )
- const int i_mb_4x4 = h->mb.i_b4_xy;
- const int i_mb_8x8 = h->mb.i_b8_xy;
- - /* GCC pessimizes direct stores to heap-allocated 8-bit arrays due to aliasing.*/
- + /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing.*/
- /* By only dereferencing them once, we avoid this issue. */
- - int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
- - uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
- + int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
- + uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
- x264_macroblock_store_pic( h, 0 );
- x264_macroblock_store_pic( h, 1 );
- @@ -1303,15 +1309,15 @@ void x264_macroblock_cache_save( x264_t *h )
- /* save intra4x4 */
- if( i_mb_type == I_4x4 )
- {
- - CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
- - M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
- - h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
- + CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
- + M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
- + h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
- }
- else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
- - M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
- + M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
- else
- - M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
- + M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
- if( i_mb_type == I_PCM )
- @@ -1322,19 +1328,19 @@ void x264_macroblock_cache_save( x264_t *h )
- h->mb.i_cbp_luma = 0xf;
- h->mb.cbp[i_mb_xy] = 0x72f; /* all set */
- h->mb.b_transform_8x8 = 0;
- - memset( non_zero_count, 16, sizeof( *h->mb.non_zero_count ) );
- + memset( nnz, 16, sizeof( *h->mb.non_zero_count ) );
- }
- else
- {
- /* save non zero count */
- - CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
- - CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
- - CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
- - CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
- - M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
- - M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
- - M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
- - M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
- + CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
- + CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
- + CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
- + CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
- + M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
- + M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
- + M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
- + M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
- if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
- h->mb.i_qp = h->mb.i_last_qp;
- @@ -1349,47 +1355,56 @@ void x264_macroblock_cache_save( x264_t *h )
- if( h->sh.i_type != SLICE_TYPE_I )
- {
- + int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
- + int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
- + int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
- + int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
- if( !IS_INTRA( i_mb_type ) )
- {
- - h->mb.ref[0][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
- - h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
- - h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
- - h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
- - for( int y = 0; y < 4; y++ )
- - {
- - CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
- - CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
- - }
- + ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
- + ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
- + ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
- + ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
- + CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
- + CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
- + CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
- + CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
- if( h->sh.i_type == SLICE_TYPE_B )
- {
- - h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
- - h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
- - h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
- - h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
- - for( int y = 0; y < 4; y++ )
- - {
- - CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
- - CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
- - }
- + ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
- + ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
- + ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
- + ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
- + CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
- + CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
- + CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
- + CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
- }
- }
- else
- {
- - for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
- + M16( ref0+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
- + M16( ref0+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
- + M128( &mv0[0*s4x4] ) = M128_CONST( 0 );
- + M128( &mv0[1*s4x4] ) = M128_CONST( 0 );
- + M128( &mv0[2*s4x4] ) = M128_CONST( 0 );
- + M128( &mv0[3*s4x4] ) = M128_CONST( 0 );
- + if( h->sh.i_type == SLICE_TYPE_B )
- {
- - M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
- - M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
- - for( int y = 0; y < 4; y++ )
- - {
- - M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
- - M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
- - }
- + M16( ref1+0*s8x8 ) = (uint8_t)(-1) * 0x0101;
- + M16( ref1+1*s8x8 ) = (uint8_t)(-1) * 0x0101;
- + M128( &mv1[0*s4x4] ) = M128_CONST( 0 );
- + M128( &mv1[1*s4x4] ) = M128_CONST( 0 );
- + M128( &mv1[2*s4x4] ) = M128_CONST( 0 );
- + M128( &mv1[3*s4x4] ) = M128_CONST( 0 );
- }
- }
- }
- if( h->param.b_cabac )
- {
- + uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
- + uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
- if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
- h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
- else
- @@ -1397,27 +1412,23 @@ void x264_macroblock_cache_save( x264_t *h )
- if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
- {
- - CP64( h->mb.mvd[0][i_mb_xy][0], h->mb.cache.mvd[0][x264_scan8[10]] );
- - CP16( h->mb.mvd[0][i_mb_xy][4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
- - CP16( h->mb.mvd[0][i_mb_xy][5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
- - CP16( h->mb.mvd[0][i_mb_xy][6], h->mb.cache.mvd[0][x264_scan8[13]] );
- + CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
- + CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
- + CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
- + CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
- if( h->sh.i_type == SLICE_TYPE_B )
- {
- - CP64( h->mb.mvd[1][i_mb_xy][0], h->mb.cache.mvd[1][x264_scan8[10]] );
- - CP16( h->mb.mvd[1][i_mb_xy][4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
- - CP16( h->mb.mvd[1][i_mb_xy][5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
- - CP16( h->mb.mvd[1][i_mb_xy][6], h->mb.cache.mvd[1][x264_scan8[13]] );
- + CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
- + CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
- + CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
- + CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
- }
- }
- else
- {
- - M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
- - M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
- + M128( mvd0[0] ) = M128_CONST( 0 );
- if( h->sh.i_type == SLICE_TYPE_B )
- - {
- - M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
- - M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
- - }
- + M128( mvd1[0] ) = M128_CONST( 0 );
- }
- if( h->sh.i_type == SLICE_TYPE_B )
- diff --git a/common/x86/util.h b/common/x86/util.h
- index ccc0733..e094309 100644
- --- a/common/x86/util.h
- +++ b/common/x86/util.h
- @@ -25,6 +25,9 @@
- #define X264_X86_UTIL_H
- #ifdef __GNUC__
- +
- +#include <xmmintrin.h>
- +
- #define x264_median_mv x264_median_mv_mmxext
- static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
- {
- @@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
- );
- return amvd;
- }
- +#undef M128_CONST
- +#define M128_CONST(x) ((__m128){x,x,x,x})
- +#define x264_union128_t x264_union128_sse_t
- +typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
- +
- #endif
- #endif
- --
- 1.7.0.4
- From 064db2907f52c95a7254f313edba9788dc6d9c03 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Wed, 14 Apr 2010 14:43:25 -0700
- Subject: [PATCH 2/6] Prefetch MB data in cache_load
- Dramatically reduces L1 cache misses.
- ~10% faster cache_load.
- ---
- common/macroblock.c | 38 +++++++++++++++++++++++++++++++-------
- common/osdep.h | 13 +++++++++++++
- 2 files changed, 44 insertions(+), 7 deletions(-)
- diff --git a/common/macroblock.c b/common/macroblock.c
- index fb4c1a5..5c9734f 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
- static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
- {
- int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
- +
- h->mb.i_mb_x = mb_x;
- h->mb.i_mb_y = mb_y;
- h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
- @@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
- if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
- h->mb.i_neighbour_intra |= MB_TOP;
- +
- + /* We only need to prefetch the top blocks because the left was just written
- + * to as part of the previous cache_save. Since most target CPUs use write-allocate
- + * caches, left blocks are near-guaranteed to be in L1 cache. Top--not so much. */
- + x264_prefetch( &h->mb.cbp[top] );
- + x264_prefetch( h->mb.intra4x4_pred_mode[top] );
- + x264_prefetch( &h->mb.non_zero_count[top][12] );
- + /* These aren't always allocated, but prefetching an invalid address can't hurt. */
- + x264_prefetch( &h->mb.mb_transform_size[top] );
- + x264_prefetch( &h->mb.skipbp[top] );
- }
- }
- @@ -1025,16 +1036,20 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- int left = h->mb.i_mb_left_xy;
- int top = h->mb.i_mb_top_xy;
- + int top_y = mb_y - (1 << h->mb.b_interlaced);
- + int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- + int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
- /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing.*/
- /* By only dereferencing them once, we avoid this issue. */
- int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
- uint8_t (*nnz)[24] = h->mb.non_zero_count;
- + int16_t *cbp = h->mb.cbp;
- /* load cache */
- if( h->mb.i_neighbour & MB_TOP )
- {
- - h->mb.cache.i_cbp_top = h->mb.cbp[top];
- + h->mb.cache.i_cbp_top = cbp[top];
- /* load intra4x4 */
- CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
- @@ -1059,7 +1074,7 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- if( h->mb.i_neighbour & MB_LEFT )
- {
- - h->mb.cache.i_cbp_left = h->mb.cbp[left];
- + h->mb.cache.i_cbp_left = cbp[left];
- /* load intra4x4 */
- h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left][4];
- @@ -1078,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
- h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
- +
- + /* Finish the prefetching */
- + if( h->sh.i_type != SLICE_TYPE_I )
- + for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
- + {
- + x264_prefetch( &h->mb.mv[l][top_4x4-1] );
- + /* Top right being not in the same cacheline as top left will happen
- + * once every 4 MBs, so one extra prefetch is worthwhile */
- + x264_prefetch( &h->mb.mv[l][top_4x4+4] );
- + x264_prefetch( &h->mb.ref[l][top_8x8-1] );
- + x264_prefetch( &h->mb.mvd[l][top] );
- + }
- }
- else
- {
- @@ -1142,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
- /* load ref/mv/mvd */
- if( h->sh.i_type != SLICE_TYPE_I )
- {
- - const int s8x8 = h->mb.i_b8_stride;
- - const int s4x4 = h->mb.i_b4_stride;
- - const int top_y = mb_y - (1 << h->mb.b_interlaced);
- - const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- - const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
- + int s8x8 = h->mb.i_b8_stride;
- + int s4x4 = h->mb.i_b4_stride;
- for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
- {
- diff --git a/common/osdep.h b/common/osdep.h
- index f97547f..35772f7 100644
- --- a/common/osdep.h
- +++ b/common/osdep.h
- @@ -251,6 +251,19 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
- }
- #endif
- +#if defined(__GNUC__) && defined(HAVE_MMX)
- +/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
- + * using complex address modes properly unless we use inline asm. */
- +static ALWAYS_INLINE void x264_prefetch( void *p )
- +{
- + asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
- +}
- +#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1)
- +#define x264_prefetch(x) __builtin_prefetch(x)
- +#else
- +#define x264_prefetch(x)
- +#endif
- +
- #ifdef USE_REAL_PTHREAD
- #ifdef SYS_MINGW
- #define x264_lower_thread_priority(p)\
- --
- 1.7.0.4
- From 8891a9dc2c2602e09c1fc1636b3e3da584cadee2 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Thu, 15 Apr 2010 16:32:31 -0700
- Subject: [PATCH 3/6] Move deblocking/hpel into sliced threads
- Instead of doing both as a separate pass, do them during the main encode.
- This requires disabling deblocking between slices (disable_deblock_idc == 2).
- Overall performance gain is about 11% on --preset superfast with sliced threads.
- Doesn't reduce the amount of actual computation done: only better parallelizes it.
- ---
- common/common.h | 5 ++-
- common/frame.c | 12 ++++-
- common/macroblock.c | 68 ++++++++++++++++++-------
- common/macroblock.h | 9 +++-
- encoder/encoder.c | 136 ++++++++++++++++++++++++++-------------------------
- encoder/lookahead.c | 9 ++-
- 6 files changed, 146 insertions(+), 93 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index 38e9b74..37f309d 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -566,7 +566,8 @@ struct x264_t
- int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
- int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
- int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
- - uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
- + uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
- + * NOTE: this will fail on frames with more than 2^16 macroblocks (the index is stored in 16 bits)... */
- /* buffer for weighted versions of the reference frames */
- uint8_t *p_weight_buf[16];
- @@ -763,7 +764,9 @@ struct x264_t
- ALIGNED_16( uint16_t nr_offset[2][64] );
- uint32_t nr_count[2];
- + /* Buffers that are allocated per-thread even in sliced threads. */
- void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
- + uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
- /* CPU functions dependents */
- x264_predict_t predict_16x16[4+3];
- diff --git a/common/frame.c b/common/frame.c
- index abcfd14..872e067 100644
- --- a/common/frame.c
- +++ b/common/frame.c
- @@ -658,6 +658,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- int stride2y = stridey << b_interlaced;
- int strideuv = h->fdec->i_stride[1];
- int stride2uv = strideuv << b_interlaced;
- + int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
- uint8_t (*nnz_backup)[16] = h->scratch_buffer;
- if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
- @@ -778,9 +779,18 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
- * i_dir == 1 -> horizontal edge */
- #define DEBLOCK_DIR(i_dir)\
- {\
- - int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
- + int i_edge = 0;\
- int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
- ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
- + /* We don't have to consider the MBAFF case of a slice breaking in the middle\
- + * of a row because x264 doesn't support that case. If we add support for that,\
- + * this will have to become significantly more complex. */\
- + if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
- + h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
- + i_edge++;\
- + if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
- + h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
- + i_edge++;\
- if( i_edge )\
- i_edge+= b_8x8_transform;\
- else\
- diff --git a/common/macroblock.c b/common/macroblock.c
- index 5c9734f..4ef959f 100644
- --- a/common/macroblock.c
- +++ b/common/macroblock.c
- @@ -675,7 +675,7 @@ void x264_mb_mc( x264_t *h )
- }
- }
- -int x264_macroblock_cache_init( x264_t *h )
- +int x264_macroblock_cache_allocate( x264_t *h )
- {
- int i_mb_count = h->mb.i_mb_count;
- @@ -689,6 +689,8 @@ int x264_macroblock_cache_init( x264_t *h )
- CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
- CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
- CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
- + CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
- + memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
- /* 0 -> 3 top(4), 4 -> 6 : left(3) */
- CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
- @@ -755,22 +757,11 @@ int x264_macroblock_cache_init( x264_t *h )
- #undef ALIGN
- }
- - for( int i = 0; i <= h->param.b_interlaced; i++ )
- - for( int j = 0; j < 3; j++ )
- - {
- - /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
- - CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
- - h->mb.intra_border_backup[i][j] += 8;
- - }
- -
- return 0;
- fail: return -1;
- }
- -void x264_macroblock_cache_end( x264_t *h )
- +void x264_macroblock_cache_free( x264_t *h )
- {
- - for( int i = 0; i <= h->param.b_interlaced; i++ )
- - for( int j = 0; j < 3; j++ )
- - x264_free( h->mb.intra_border_backup[i][j] - 8 );
- for( int i = 0; i < 2; i++ )
- for( int j = 0; j < 32; j++ )
- x264_free( h->mb.mvr[i][j] );
- @@ -783,6 +774,7 @@ void x264_macroblock_cache_end( x264_t *h )
- x264_free( h->mb.mvd[0] );
- x264_free( h->mb.mvd[1] );
- }
- + x264_free( h->mb.slice_table );
- x264_free( h->mb.intra4x4_pred_mode );
- x264_free( h->mb.non_zero_count );
- x264_free( h->mb.mb_transform_size );
- @@ -790,6 +782,47 @@ void x264_macroblock_cache_end( x264_t *h )
- x264_free( h->mb.cbp );
- x264_free( h->mb.qp );
- }
- +
- +int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
- +{
- + if( !b_lookahead )
- + for( int i = 0; i <= h->param.b_interlaced; i++ )
- + for( int j = 0; j < 3; j++ )
- + {
- + /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
- + CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
- + h->intra_border_backup[i][j] += 8;
- + }
- +
- + /* Allocate scratch buffer */
- + int scratch_size = 0;
- + if( !b_lookahead )
- + {
- + int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
- + int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
- + int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
- + int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
- + ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- + int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
- + scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
- + }
- + int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
- + scratch_size = X264_MAX( scratch_size, buf_mbtree );
- + CHECKED_MALLOC( h->scratch_buffer, scratch_size );
- +
- + return 0;
- +fail: return -1;
- +}
- +
- +void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
- +{
- + if( !b_lookahead )
- + for( int i = 0; i <= h->param.b_interlaced; i++ )
- + for( int j = 0; j < 3; j++ )
- + x264_free( h->intra_border_backup[i][j] - 8 );
- + x264_free( h->scratch_buffer );
- +}
- +
- void x264_macroblock_slice_init( x264_t *h )
- {
- h->mb.mv[0] = h->fdec->mv[0];
- @@ -898,8 +931,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
- ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
- : w * (mb_x + mb_y * i_stride);
- const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
- - const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
- - &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
- + const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
- int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
- x264_frame_t **fref[2] = { h->fref0, h->fref1 };
- if( h->mb.b_interlaced )
- @@ -908,10 +940,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
- h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
- h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
- h->mb.pic.p_fenc_plane[i], i_stride2, w );
- - if( mb_y > 0 )
- - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
- - else
- - memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
- + memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
- if( h->mb.b_interlaced )
- for( int j = 0; j < w; j++ )
- h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
- @@ -1327,6 +1356,7 @@ void x264_macroblock_cache_save( x264_t *h )
- x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
- h->mb.type[i_mb_xy] = i_mb_type;
- + h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
- h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
- h->mb.i_mb_prev_xy = i_mb_xy;
- diff --git a/common/macroblock.h b/common/macroblock.h
- index 5ef1498..ee8c113 100644
- --- a/common/macroblock.h
- +++ b/common/macroblock.h
- @@ -260,13 +260,18 @@ enum cabac_ctx_block_cat_e
- DCT_LUMA_8x8 = 5,
- };
- +/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
- +int x264_macroblock_cache_allocate( x264_t *h );
- +void x264_macroblock_cache_free( x264_t *h );
- +
- +/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
- +int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
- +void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
- -int x264_macroblock_cache_init( x264_t *h );
- void x264_macroblock_slice_init( x264_t *h );
- void x264_macroblock_thread_init( x264_t *h );
- void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
- void x264_macroblock_cache_save( x264_t *h );
- -void x264_macroblock_cache_end( x264_t *h );
- void x264_macroblock_bipred_init( x264_t *h );
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 300041e..a07f0ea 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -158,7 +158,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
- int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
- /* If effective qp <= 15, deblocking would have no effect anyway */
- if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
- - sh->i_disable_deblocking_filter_idc = 0;
- + sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
- else
- sh->i_disable_deblocking_filter_idc = 1;
- sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
- @@ -519,6 +519,16 @@ static int x264_validate_parameters( x264_t *h )
- h->param.rc.i_vbv_max_bitrate = 0;
- }
- + if( h->param.b_interlaced && h->param.i_slice_max_size )
- + {
- + x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
- + h->param.i_slice_max_size = 0;
- + }
- + if( h->param.b_interlaced && h->param.i_slice_max_mbs )
- + {
- + x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
- + h->param.i_slice_max_mbs = 0;
- + }
- int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
- if( h->param.b_sliced_threads )
- h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
- @@ -527,16 +537,6 @@ static int x264_validate_parameters( x264_t *h )
- h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
- h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
- h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
- - if( h->param.b_interlaced && h->param.i_slice_max_size )
- - {
- - x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
- - h->param.i_slice_max_size = 0;
- - }
- - if( h->param.b_interlaced && h->param.i_slice_max_mbs )
- - {
- - x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
- - h->param.i_slice_max_mbs = 0;
- - }
- if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
- h->param.i_slice_count = 0;
- }
- @@ -1059,23 +1059,13 @@ x264_t *x264_encoder_open( x264_param_t *param )
- CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
- h->thread[i]->out.i_nals_allocated = init_nal_count;
- - if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
- + if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
- goto fail;
- }
- - /* Allocate scratch buffer */
- - for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
- - {
- - int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
- - int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
- - int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
- - int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
- - ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
- - int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
- - int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
- - int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
- - CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
- - }
- + for( int i = 0; i < h->param.i_threads; i++ )
- + if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
- + goto fail;
- if( x264_ratecontrol_new( h ) < 0 )
- goto fail;
- @@ -1552,25 +1542,32 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
- h->mb.pic.i_fref[1] = h->i_ref1;
- }
- -static void x264_fdec_filter_row( x264_t *h, int mb_y )
- +static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
- {
- /* mb_y is the mb to be encoded next, not the mb to be filtered here */
- int b_hpel = h->fdec->b_kept_as_ref;
- - int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
- - int b_end = mb_y == h->sps->i_mb_height;
- + int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
- + int b_end = mb_y == h->i_threadslice_end;
- + int b_measure_quality = 1;
- int min_y = mb_y - (1 << h->sh.b_mbaff);
- - int max_y = b_end ? h->sps->i_mb_height : mb_y;
- + int b_start = min_y == h->i_threadslice_start;
- + int max_y = b_end ? h->i_threadslice_end : mb_y;
- b_deblock &= b_hpel || h->param.psz_dump_yuv;
- + if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
- + {
- + b_deblock = 0; /* We already deblocked on the inloop pass. */
- + b_measure_quality = 0; /* We already measured quality on the inloop pass. */
- + }
- if( mb_y & h->sh.b_mbaff )
- return;
- - if( min_y < 0 )
- + if( min_y < h->i_threadslice_start )
- return;
- - if( !b_end && !h->param.b_sliced_threads )
- + if( !b_end && b_inloop )
- for( int j = 0; j <= h->sh.b_mbaff; j++ )
- for( int i = 0; i < 3; i++ )
- {
- - memcpy( h->mb.intra_border_backup[j][i],
- + memcpy( h->intra_border_backup[j][i],
- h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
- h->sps->i_mb_width*16 >> !!i );
- }
- @@ -1581,39 +1578,43 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
- if( b_hpel )
- {
- - x264_frame_expand_border( h, h->fdec, min_y, b_end );
- + int end = mb_y == h->sps->i_mb_height;
- + x264_frame_expand_border( h, h->fdec, min_y, end );
- if( h->param.analyse.i_subpel_refine )
- {
- - x264_frame_filter( h, h->fdec, min_y, b_end );
- - x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
- + x264_frame_filter( h, h->fdec, min_y, end );
- + x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
- }
- }
- if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
- x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
- - min_y = X264_MAX( min_y*16-8, 0 );
- - max_y = b_end ? h->param.i_height : mb_y*16-8;
- -
- - if( h->param.analyse.b_psnr )
- - for( int i = 0; i < 3; i++ )
- - h->stat.frame.i_ssd[i] +=
- - x264_pixel_ssd_wxh( &h->pixf,
- - h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
- - h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
- - h->param.i_width >> !!i, (max_y-min_y) >> !!i );
- + min_y = min_y*16 - 8 * !b_start;
- + max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;
- - if( h->param.analyse.b_ssim )
- + if( b_measure_quality )
- {
- - x264_emms();
- - /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
- - * and overlap by 4 */
- - min_y += min_y == 0 ? 2 : -6;
- - h->stat.frame.f_ssim +=
- - x264_pixel_ssim_wxh( &h->pixf,
- - h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
- - h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
- - h->param.i_width-2, max_y-min_y, h->scratch_buffer );
- + if( h->param.analyse.b_psnr )
- + for( int i = 0; i < 3; i++ )
- + h->stat.frame.i_ssd[i] +=
- + x264_pixel_ssd_wxh( &h->pixf,
- + h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
- + h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
- + h->param.i_width >> !!i, (max_y-min_y) >> !!i );
- +
- + if( h->param.analyse.b_ssim )
- + {
- + x264_emms();
- + /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
- + * and overlap by 4 */
- + min_y += b_start ? 2 : -6;
- + h->stat.frame.f_ssim +=
- + x264_pixel_ssim_wxh( &h->pixf,
- + h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
- + h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
- + h->param.i_width-2, max_y-min_y, h->scratch_buffer );
- + }
- }
- }
- @@ -1808,8 +1809,8 @@ static int x264_slice_write( x264_t *h )
- }
- }
- - if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
- - x264_fdec_filter_row( h, i_mb_y );
- + if( i_mb_x == 0 && !h->mb.b_reencode_mb )
- + x264_fdec_filter_row( h, i_mb_y, 1 );
- /* load cache */
- x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
- @@ -1971,14 +1972,13 @@ static int x264_slice_write( x264_t *h )
- if( x264_nal_end( h ) )
- return -1;
- - if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
- + if( h->sh.i_last_mb == (h->i_threadslice_end * h->sps->i_mb_width - 1) )
- {
- h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
- + (h->out.i_nal*NALU_OVERHEAD * 8)
- - h->stat.frame.i_tex_bits
- - h->stat.frame.i_mv_bits;
- - if( !h->param.b_sliced_threads )
- - x264_fdec_filter_row( h, h->sps->i_mb_height );
- + x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
- }
- return 0;
- @@ -2099,9 +2099,9 @@ static int x264_threaded_slices_write( x264_t *h )
- return (intptr_t)ret;
- }
- - /* deblocking and hpel filtering */
- - for( int i = 0; i <= h->sps->i_mb_height; i++ )
- - x264_stack_align( x264_fdec_filter_row, h, i );
- + /* Go back and fix up the hpel on the borders between slices. */
- + for( int i = 1; i < h->param.i_threads; i++ )
- + x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
- x264_threads_merge_ratecontrol( h );
- @@ -2114,10 +2114,12 @@ static int x264_threaded_slices_write( x264_t *h )
- h->out.i_nal++;
- x264_nal_check_buffer( h );
- }
- - /* All entries in stat.frame are ints except for ssd/ssim,
- - * which are only calculated in the main thread. */
- + /* All entries in stat.frame are ints except for ssd/ssim. */
- for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
- ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
- + for( int j = 0; j < 3; j++ )
- + h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
- + h->stat.frame.f_ssim += t->stat.frame.f_ssim;
- }
- return 0;
- @@ -3072,9 +3074,9 @@ void x264_encoder_close ( x264_t *h )
- (*frame)->i_reference_count--;
- if( (*frame)->i_reference_count == 0 )
- x264_frame_delete( *frame );
- - x264_macroblock_cache_end( h->thread[i] );
- + x264_macroblock_cache_free( h->thread[i] );
- }
- - x264_free( h->thread[i]->scratch_buffer );
- + x264_macroblock_thread_free( h->thread[i], 0 );
- x264_free( h->thread[i]->out.p_bitstream );
- x264_free( h->thread[i]->out.nal);
- x264_free( h->thread[i] );
- diff --git a/encoder/lookahead.c b/encoder/lookahead.c
- index 7a0c6d3..5e29fb5 100644
- --- a/encoder/lookahead.c
- +++ b/encoder/lookahead.c
- @@ -148,7 +148,10 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
- x264_t *look_h = h->thread[h->param.i_threads];
- *look_h = *h;
- - if( x264_macroblock_cache_init( look_h ) )
- + if( x264_macroblock_cache_allocate( look_h ) )
- + goto fail;
- +
- + if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
- goto fail;
- if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
- @@ -170,8 +173,8 @@ void x264_lookahead_delete( x264_t *h )
- x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
- x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
- x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
- - x264_macroblock_cache_end( h->thread[h->param.i_threads] );
- - x264_free( h->thread[h->param.i_threads]->scratch_buffer );
- + x264_macroblock_cache_free( h->thread[h->param.i_threads] );
- + x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
- x264_free( h->thread[h->param.i_threads] );
- }
- x264_synch_frame_list_delete( &h->lookahead->ifbuf );
- --
- 1.7.0.4
- From cd9762c72e81e036b8eda7d6559d0a867f187c9e Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 16 Apr 2010 03:06:46 -0700
- Subject: [PATCH 4/6] Fix four minor bugs found by Clang
- ---
- encoder/analyse.c | 2 +-
- encoder/encoder.c | 2 +-
- input/timecode.c | 17 ++++++++++-------
- output/matroska.c | 2 ++
- 4 files changed, 14 insertions(+), 9 deletions(-)
- diff --git a/encoder/analyse.c b/encoder/analyse.c
- index 2ece9dc..74672d1 100644
- --- a/encoder/analyse.c
- +++ b/encoder/analyse.c
- @@ -1480,7 +1480,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
- weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
- h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
- if( weight[2].weightfn ) \
- - weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
- + weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
- if( pixel == PIXEL_4x4 )
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index a07f0ea..1438ec0 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -1338,7 +1338,7 @@ int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t
- if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
- {
- /* found a place, after j, make sure there is not already a duplicate there */
- - if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
- + if( j == i-1 || ( h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
- break;
- }
- diff --git a/input/timecode.c b/input/timecode.c
- index 4a369ee..5fabe61 100644
- --- a/input/timecode.c
- +++ b/input/timecode.c
- @@ -194,15 +194,18 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
- ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
- if( ret != 3 )
- start = end = timecodes_num - 1;
- - if( h->auto_timebase_den || h->auto_timebase_num )
- - fpss[seq_num++] = seq_fps;
- - seq_fps = correct_fps( seq_fps, h );
- - if( seq_fps < 0 )
- - goto fail;
- for( ; num < start && num < timecodes_num - 1; num++ )
- timecodes[num + 1] = timecodes[num] + 1 / assume_fps;
- - for( num = start; num <= end && num < timecodes_num - 1; num++ )
- - timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
- + if( num < timecodes_num - 1 )
- + {
- + if( h->auto_timebase_den || h->auto_timebase_num )
- + fpss[seq_num++] = seq_fps;
- + seq_fps = correct_fps( seq_fps, h );
- + if( seq_fps < 0 )
- + goto fail;
- + for( num = start; num <= end && num < timecodes_num - 1; num++ )
- + timecodes[num + 1] = timecodes[num] + 1 / seq_fps;
- + }
- }
- if( h->auto_timebase_den || h->auto_timebase_num )
- fpss[seq_num] = h->assume_fps;
- diff --git a/output/matroska.c b/output/matroska.c
- index 25e91d5..47753d7 100644
- --- a/output/matroska.c
- +++ b/output/matroska.c
- @@ -150,6 +150,8 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
- avcC, avcC_len, p_mkv->frame_duration, 50000,
- p_mkv->width, p_mkv->height,
- p_mkv->d_width, p_mkv->d_height );
- + if( ret < 0 )
- + return ret;
- free( avcC );
- --
- 1.7.0.4
- From 217f4f314a13ae21b4ef559ddfa7cb1ce6b740f8 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 16 Apr 2010 12:06:07 -0700
- Subject: [PATCH 5/6] MMX code for predictor rounding/clipping
- Faster predictor checking at subme < 3.
- ---
- common/common.h | 11 +++++++++++
- common/x86/util.h | 41 +++++++++++++++++++++++++++++++++++++++++
- encoder/me.c | 11 ++++++-----
- 3 files changed, 58 insertions(+), 5 deletions(-)
- diff --git a/common/common.h b/common/common.h
- index 37f309d..ce2e7af 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -188,6 +188,17 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
- return amvd0 + (amvd1<<8);
- }
- +static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
- +{ /* Rounds each of the i_mvc vectors in mvc via (v+2)>>2 and clamps it in place: x to [mv_x_min,mv_x_max], y to [mv_y_min,mv_y_max]. */
- + for( int i = 0; i < i_mvc; i++ )
- + {
- + int mx = (mvc[i][0] + 2) >> 2;
- + int my = (mvc[i][1] + 2) >> 2;
- + mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
- + mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); /* y is stored in element [1]; element [0] holds x */
- + }
- +}
- +
- extern const uint8_t x264_exp2_lut[64];
- extern const float x264_log2_lut[128];
- extern const float x264_log2_lz_lut[32];
- diff --git a/common/x86/util.h b/common/x86/util.h
- index e094309..1a5ed32 100644
- --- a/common/x86/util.h
- +++ b/common/x86/util.h
- @@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
- :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
- );
- }
- +
- #define x264_predictor_difference x264_predictor_difference_mmxext
- static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
- {
- @@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
- );
- return sum;
- }
- +
- #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
- static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
- {
- @@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
- );
- return amvd;
- }
- +
- +#define x264_predictor_roundclip x264_predictor_roundclip_mmxext /* callers of x264_predictor_roundclip get the MMXEXT version below */
- +static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
- +{ /* In-place (v+2)>>2 rounding and clamping of the i_mvc (x,y) int16 pairs in mvc. The asm requires i_mvc > 0: with a zero count it would access memory before mvc[0]. */
- + uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); /* presumably packs x/y into one 32-bit word with the same layout as mvc[i] -- helper defined elsewhere, confirm */
- + uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
- + static const uint64_t pw_2 = 0x0002000200020002ULL; /* rounding bias: the value 2 in each of the four 16-bit lanes */
- + intptr_t i = i_mvc; /* number of vectors still to process; counts down to 0 */
- + asm(
- + "movd %2, %%mm5 \n" /* mm5 = mv_min */
- + "movd %3, %%mm6 \n" /* mm6 = mv_max */
- + "movq %4, %%mm7 \n" /* mm7 = pw_2 */
- + "punpckldq %%mm5, %%mm5 \n" /* copy the bounds into both 32-bit halves so two vectors can be clamped at once */
- + "punpckldq %%mm6, %%mm6 \n"
- + "test $1, %0 \n" /* is the count odd? */
- + "jz 1f \n"
- + "movd -4(%5,%0,4), %%mm0 \n" /* odd count: process the last vector, mvc[i-1], on its own */
- + "paddw %%mm7, %%mm0 \n"
- + "psraw $2, %%mm0 \n"
- + "pmaxsw %%mm5, %%mm0 \n"
- + "pminsw %%mm6, %%mm0 \n"
- + "movd %%mm0, -4(%5,%0,4) \n"
- + "dec %0 \n"
- + "jz 2f \n" /* it was the only vector: done */
- + "1: \n" /* main loop: two vectors per iteration, walking from the end towards mvc[0] */
- + "movq -8(%5,%0,4), %%mm0 \n" /* load mvc[i-2] and mvc[i-1] */
- + "paddw %%mm7, %%mm0 \n" /* add 2 to every 16-bit component */
- + "psraw $2, %%mm0 \n" /* arithmetic shift right by 2 */
- + "pmaxsw %%mm5, %%mm0 \n" /* clamp from below to mv_min */
- + "pminsw %%mm6, %%mm0 \n" /* clamp from above to mv_max */
- + "movq %%mm0, -8(%5,%0,4) \n" /* store the pair back in place */
- + "sub $2, %0 \n"
- + "jnz 1b \n"
- + "2: \n"
- + :"+r"(i), "+m"(M64( mvc ))
- + :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
- + );
- +}
- +
- #undef M128_CONST
- #define M128_CONST(x) ((__m128){x,x,x,x})
- #define x264_union128_t x264_union128_sse_t
- diff --git a/encoder/me.c b/encoder/me.c
- index 6788022..0b519ea 100644
- --- a/encoder/me.c
- +++ b/encoder/me.c
- @@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
- * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
- * biasing against use of the predicted motion vector. */
- bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
- + uint32_t bmv = pack16to32_mask( bmx, bmy );
- + if( i_mvc )
- + x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
- for( int i = 0; i < i_mvc; i++ )
- {
- - int mx = (mvc[i][0] + 2) >> 2;
- - int my = (mvc[i][1] + 2) >> 2;
- - if( (mx | my) && ((mx-bmx) | (my-bmy)) )
- + if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
- {
- - mx = x264_clip3( mx, mv_x_min, mv_x_max );
- - my = x264_clip3( my, mv_y_min, mv_y_max );
- + int mx = mvc[i][0];
- + int my = mvc[i][1];
- COST_MV( mx, my );
- }
- }
- --
- 1.7.0.4
- From 292fc5e6a7c842e70e752eea9d758ad857ac7873 Mon Sep 17 00:00:00 2001
- From: Jason Garrett-Glaser <darkshikari@gmail.com>
- Date: Fri, 16 Apr 2010 11:36:43 -0700
- Subject: [PATCH 6/6] Fix issues with extremely large timebases
- With timebase denominators >= 2^30, x264 would silently overflow and cause odd issues.
- Now x264 will explicitly fail with timebase denominators >= 2^31 and work with timebase denominators 2^31 > x >= 2^30.
- ---
- common/common.c | 14 +++++++-------
- common/common.h | 2 +-
- common/set.h | 4 ++--
- encoder/encoder.c | 22 +++++++++++++++-------
- encoder/ratecontrol.c | 4 ++--
- input/input.h | 12 ++++++------
- input/timecode.c | 32 +++++++++++++++++---------------
- input/y4m.c | 3 ++-
- output/flv.c | 4 ++--
- output/matroska.c | 4 ++--
- output/mp4.c | 2 +-
- x264.c | 8 ++++----
- x264.h | 11 +++++------
- 13 files changed, 66 insertions(+), 56 deletions(-)
- diff --git a/common/common.c b/common/common.c
- index 924323a..6471c07 100644
- --- a/common/common.c
- +++ b/common/common.c
- @@ -614,7 +614,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
- }
- OPT("fps")
- {
- - if( sscanf( value, "%d/%d", &p->i_fps_num, &p->i_fps_den ) == 2 )
- + if( sscanf( value, "%u/%u", &p->i_fps_num, &p->i_fps_den ) == 2 )
- ;
- else
- {
- @@ -1119,11 +1119,11 @@ void x264_free( void *p )
- /****************************************************************************
- * x264_reduce_fraction:
- ****************************************************************************/
- -void x264_reduce_fraction( int *n, int *d )
- +void x264_reduce_fraction( uint32_t *n, uint32_t *d )
- {
- - int a = *n;
- - int b = *d;
- - int c;
- + uint32_t a = *n;
- + uint32_t b = *d;
- + uint32_t c;
- if( !a || !b )
- return;
- c = a % b;
- @@ -1185,8 +1185,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
- if( b_res )
- {
- s += sprintf( s, "%dx%d ", p->i_width, p->i_height );
- - s += sprintf( s, "fps=%d/%d ", p->i_fps_num, p->i_fps_den );
- - s += sprintf( s, "timebase=%d/%d ", p->i_timebase_num, p->i_timebase_den );
- + s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den );
- + s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den );
- }
- s += sprintf( s, "cabac=%d", p->b_cabac );
- diff --git a/common/common.h b/common/common.h
- index ce2e7af..f4bd5dc 100644
- --- a/common/common.h
- +++ b/common/common.h
- @@ -134,7 +134,7 @@ int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_sta
- /* log */
- void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
- -void x264_reduce_fraction( int *n, int *d );
- +void x264_reduce_fraction( uint32_t *n, uint32_t *d );
- void x264_init_vlc_tables();
- static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
- diff --git a/common/set.h b/common/set.h
- index 9783118..ee27d74 100644
- --- a/common/set.h
- +++ b/common/set.h
- @@ -112,8 +112,8 @@ typedef struct
- int i_chroma_loc_bottom;
- int b_timing_info_present;
- - int i_num_units_in_tick;
- - int i_time_scale;
- + uint32_t i_num_units_in_tick;
- + uint32_t i_time_scale;
- int b_fixed_frame_rate;
- int b_nal_hrd_parameters_present;
- diff --git a/encoder/encoder.c b/encoder/encoder.c
- index 1438ec0..9b21d92 100644
- --- a/encoder/encoder.c
- +++ b/encoder/encoder.c
- @@ -817,10 +817,10 @@ static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
- /* VUI */
- if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
- {
- - int i_w = param->vui.i_sar_width;
- - int i_h = param->vui.i_sar_height;
- - int old_w = h->param.vui.i_sar_width;
- - int old_h = h->param.vui.i_sar_height;
- + uint32_t i_w = param->vui.i_sar_width;
- + uint32_t i_h = param->vui.i_sar_height;
- + uint32_t old_w = h->param.vui.i_sar_width;
- + uint32_t old_h = h->param.vui.i_sar_height;
- x264_reduce_fraction( &i_w, &i_h );
- @@ -886,21 +886,29 @@ x264_t *x264_encoder_open( x264_param_t *param )
- h->i_frame = -1;
- h->i_frame_num = 0;
- h->i_idr_pic_id = 0;
- + uint64_t new_timebase_den = h->param.i_timebase_den;
- if( h->param.b_dts_compress )
- {
- /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
- h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
- if( h->i_dts_compress_multiplier != 1 )
- {
- - x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
- + new_timebase_den = h->param.i_timebase_den * h->i_dts_compress_multiplier;
- + x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %u/%u -> %u/ %"PRIu64"\n",
- h->param.i_timebase_num, h->param.i_timebase_den,
- - h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
- - h->param.i_timebase_den *= h->i_dts_compress_multiplier;
- + h->param.i_timebase_num, new_timebase_den );
- }
- }
- else
- h->i_dts_compress_multiplier = 1;
- + if( new_timebase_den * 2 >= (1ULL << 32) )
- + {
- + x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %"PRIu64" exceeds H.264 maximum\n", new_timebase_den );
- + goto fail;
- + }
- + h->param.i_timebase_den = new_timebase_den;
- +
- h->sps = &h->sps_array[0];
- x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
- diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
- index b51dbf7..8dd38f1 100644
- --- a/encoder/ratecontrol.c
- +++ b/encoder/ratecontrol.c
- @@ -657,14 +657,14 @@ int x264_ratecontrol_new( x264_t *h )
- return -1;
- }
- - if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%d/%d", &i, &j ) != 2 )
- + if( ( p = strstr( opts, "timebase=" ) ) && sscanf( p, "timebase=%u/%u", &i, &j ) != 2 )
- {
- x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" );
- return -1;
- }
- if( i != h->param.i_timebase_num || j != h->param.i_timebase_den )
- {
- - x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%d/%d vs %d/%d)\n",
- + x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n",
- h->param.i_timebase_num, h->param.i_timebase_den, i, j );
- return -1;
- }
- diff --git a/input/input.h b/input/input.h
- index b6cd218..eb62fdd 100644
- --- a/input/input.h
- +++ b/input/input.h
- @@ -38,15 +38,15 @@ typedef struct
- typedef struct
- {
- int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
- - int fps_num;
- - int fps_den;
- + uint32_t fps_num;
- + uint32_t fps_den;
- int height;
- int interlaced;
- - int sar_width;
- - int sar_height;
- + uint32_t sar_width;
- + uint32_t sar_height;
- int tff;
- - int timebase_num;
- - int timebase_den;
- + uint32_t timebase_num;
- + uint32_t timebase_den;
- int vfr;
- int width;
- } video_info_t;
- diff --git a/input/timecode.c b/input/timecode.c
- index 5fabe61..008cb19 100644
- --- a/input/timecode.c
- +++ b/input/timecode.c
- @@ -32,8 +32,8 @@ typedef struct
- int frame_total;
- int auto_timebase_num;
- int auto_timebase_den;
- - int timebase_num;
- - int timebase_den;
- + int64_t timebase_num;
- + int64_t timebase_den;
- int seek;
- int stored_pts_num;
- int64_t *pts;
- @@ -53,7 +53,7 @@ static inline double sigexp10( double value, double *exponent )
- static double correct_fps( double fps, timecode_hnd_t *h )
- {
- - int64_t i = 1;
- + int i = 1;
- int64_t fps_num, fps_den;
- double exponent;
- double fps_sig = sigexp10( fps, &exponent );
- @@ -61,7 +61,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
- {
- fps_den = i * h->timebase_num;
- fps_num = round( fps_den * fps_sig ) * exponent;
- - if( fps_num < 0 )
- + if( fps_num > UINT_MAX )
- {
- fprintf( stderr, "timecode [error]: tcfile fps correction failed.\n"
- " Specify an appropriate timebase manually or remake tcfile.\n" );
- @@ -74,7 +74,7 @@ static double correct_fps( double fps, timecode_hnd_t *h )
- if( h->auto_timebase_den )
- {
- h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
- - if( h->timebase_den < 0 )
- + if( h->timebase_den > UINT_MAX )
- h->auto_timebase_den = 0;
- }
- return (double)fps_num / fps_den;
- @@ -86,12 +86,12 @@ static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
- h->timebase_den = MKV_TIMEBASE_DEN;
- for( int num = 0; num < loop_num; num++ )
- {
- - int fps_den;
- + int64_t fps_den;
- double exponent;
- double fps_sig = sigexp10( fpss[num], &exponent );
- fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
- h->timebase_num = fps_den > 0 && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
- - if( h->timebase_num <= 0 )
- + if( h->timebase_num > UINT_MAX || !h->timebase_num )
- {
- fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
- " Specify timebase manually.\n" );
- @@ -305,19 +305,19 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
- if( h->timebase_den >= 0 )
- {
- int i = 1;
- - int fps_num, fps_den;
- + int64_t fps_num, fps_den;
- double exponent;
- double fps_sig = sigexp10( fpss[num], &exponent );
- while( 1 )
- {
- fps_den = i * h->timebase_num;
- fps_num = round( fps_den * fps_sig ) * exponent;
- - if( fps_num < 0 || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
- + if( fps_num > UINT_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
- break;
- ++i;
- }
- h->timebase_den = fps_num > 0 && h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num;
- - if( h->timebase_den < 0 )
- + if( h->timebase_den > UINT_MAX )
- {
- h->auto_timebase_den = 0;
- continue;
- @@ -339,10 +339,12 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
- if( h->auto_timebase_den || h->auto_timebase_num )
- {
- - x264_reduce_fraction( &h->timebase_num, &h->timebase_den );
- - fprintf( stderr, "timecode [info]: automatic timebase generation %d/%d\n", h->timebase_num, h->timebase_den );
- + int64_t i = gcd( h->timebase_num, h->timebase_den );
- + h->timebase_num /= i;
- + h->timebase_den /= i;
- + fprintf( stderr, "timecode [info]: automatic timebase generation %"PRId64"/%"PRId64"\n", h->timebase_num, h->timebase_den );
- }
- - else if( h->timebase_den <= 0 )
- + else if( h->timebase_den > UINT_MAX || !h->timebase_den )
- {
- fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
- " Specify an appropriate timebase manually.\n" );
- @@ -394,9 +396,9 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
- h->frame_total = input.get_frame_total( h->p_handle );
- h->seek = opt->seek;
- if( opt->timebase )
- - ret = sscanf( opt->timebase, "%d/%d", &h->timebase_num, &h->timebase_den );
- + ret = sscanf( opt->timebase, "%"PRId64"/%"PRId64"", &h->timebase_num, &h->timebase_den );
- if( ret == 1 )
- - h->timebase_num = atoi( opt->timebase );
- + h->timebase_num = strtoul( opt->timebase, NULL, 10 );
- h->auto_timebase_num = !ret;
- h->auto_timebase_den = ret < 2;
- if( h->auto_timebase_num )
- diff --git a/input/y4m.c b/input/y4m.c
- index c34f264..842b986 100644
- --- a/input/y4m.c
- +++ b/input/y4m.c
- @@ -40,7 +40,8 @@ typedef struct
- static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
- {
- y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
- - int i, n, d;
- + int i;
- + uint32_t n, d;
- char header[MAX_YUV4_HEADER+10];
- char *tokend, *header_end;
- int colorspace = X264_CSP_NONE;
- diff --git a/output/flv.c b/output/flv.c
- index 04f4428..e441b6d 100644
- --- a/output/flv.c
- +++ b/output/flv.c
- @@ -47,8 +47,8 @@ typedef struct
- int64_t i_prev_dts;
- int64_t i_prev_pts;
- - int i_timebase_num;
- - int i_timebase_den;
- + uint32_t i_timebase_num;
- + uint32_t i_timebase_den;
- int b_vfr_input;
- unsigned start;
- diff --git a/output/matroska.c b/output/matroska.c
- index 47753d7..0304c84 100644
- --- a/output/matroska.c
- +++ b/output/matroska.c
- @@ -30,8 +30,8 @@ typedef struct
- int64_t frame_duration;
- char b_writing_frame;
- - int i_timebase_num;
- - int i_timebase_den;
- + uint32_t i_timebase_num;
- + uint32_t i_timebase_den;
- } mkv_hnd_t;
- diff --git a/output/mp4.c b/output/mp4.c
- index cbe9f5c..f76541e 100644
- --- a/output/mp4.c
- +++ b/output/mp4.c
- @@ -38,7 +38,7 @@ typedef struct
- GF_ISOSample *p_sample;
- int i_track;
- uint32_t i_descidx;
- - int i_time_res;
- + uint32_t i_time_res;
- int64_t i_time_inc;
- int i_numframe;
- int i_delay_time;
- diff --git a/x264.c b/x264.c
- index 3f46fd9..cabdb1d 100644
- --- a/x264.c
- +++ b/x264.c
- @@ -1205,9 +1205,9 @@ generic_option:
- }
- if( !tcfile_name && input_opt.timebase )
- {
- - int i_user_timebase_num;
- - int i_user_timebase_den;
- - int ret = sscanf( input_opt.timebase, "%d/%d", &i_user_timebase_num, &i_user_timebase_den );
- + uint32_t i_user_timebase_num;
- + uint32_t i_user_timebase_den;
- + int ret = sscanf( input_opt.timebase, "%u/%u", &i_user_timebase_num, &i_user_timebase_den );
- if( !ret )
- {
- fprintf( stderr, "x264 [error]: invalid argument: timebase = %s\n", input_opt.timebase );
- @@ -1216,7 +1216,7 @@ generic_option:
- else if( ret == 1 )
- {
- i_user_timebase_num = param->i_timebase_num;
- - i_user_timebase_den = atoi( input_opt.timebase );
- + i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
- }
- opt->timebase_convert_multiplier = ((double)i_user_timebase_den / param->i_timebase_den)
- * ((double)param->i_timebase_num / i_user_timebase_num);
- diff --git a/x264.h b/x264.h
- index d30effe..83f087e 100644
- --- a/x264.h
- +++ b/x264.h
- @@ -35,7 +35,7 @@
- #include <stdarg.h>
- -#define X264_BUILD 93
- +#define X264_BUILD 94
- /* x264_t:
- * opaque handler for encoder */
- @@ -208,9 +208,6 @@ typedef struct x264_param_t
- int i_chroma_loc; /* both top & bottom */
- } vui;
- - int i_fps_num;
- - int i_fps_den;
- -
- /* Bitstream parameters */
- int i_frame_reference; /* Maximum number of reference frames */
- int i_keyint_max; /* Force an IDR keyframe at this interval */
- @@ -330,8 +327,10 @@ typedef struct x264_param_t
- * otherwise place size (4 bytes) before NAL units. */
- int i_sps_id; /* SPS and PPS id number */
- int b_vfr_input; /* VFR input */
- - int i_timebase_num; /* Timebase numerator */
- - int i_timebase_den; /* Timebase denominator */
- + uint32_t i_fps_num;
- + uint32_t i_fps_den;
- + uint32_t i_timebase_num; /* Timebase numerator */
- + uint32_t i_timebase_den; /* Timebase denominator */
- int b_dts_compress; /* DTS compression: this algorithm eliminates negative DTS
- * by compressing them to be less than the second PTS.
- * Warning: this will change the timebase! */
- --
- 1.7.0.4
Advertisement
Add Comment
Please sign in to add a comment
Advertisement