diff --git a/common/common.h b/common/common.h
index 1406255..994be69 100644
--- a/common/common.h
+++ b/common/common.h
@@ -424,6 +424,8 @@ struct x264_t
     struct
     {
         ALIGNED_64( dctcoef luma16x16_dc[3][16] );
+        ALIGNED_64( int misalign_pad_align_1 );
+        int misalign_pad_1;
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
         ALIGNED_64( dctcoef luma8x8[12][64] );
@@ -569,7 +571,11 @@ struct x264_t
             ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
 
             /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
+            ALIGNED_64( int misalign_pad_align_1 );
+            int misalign_pad_1;
             ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
+            ALIGNED_64( int misalign_pad_align_2 );
+            int misalign_pad_2;
             ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
             ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
             ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
@@ -577,11 +583,17 @@ struct x264_t
             uint32_t i8x8_nnz_buf[4];
 
             /* Psy trellis DCT data */
+            ALIGNED_64( int misalign_pad_align_3 );
+            int misalign_pad_3;
             ALIGNED_64( dctcoef fenc_dct8[4][64] );
+            ALIGNED_64( int misalign_pad_align_4 );
+            int misalign_pad_4;
             ALIGNED_64( dctcoef fenc_dct4[16][16] );
 
             /* Psy RD SATD/SA8D scores cache */
             ALIGNED_64( uint32_t fenc_satd_cache[32] );
+            ALIGNED_64( int misalign_pad_align_5 );
+            int misalign_pad_5;
             ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
 
             int i4x4_cbp;
@@ -610,6 +622,8 @@ struct x264_t
         struct
         {
             /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
+            ALIGNED_64( int misalign_pad_align_1 );
+            int misalign_pad_1;
             ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
 
             /* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
@@ -619,6 +633,8 @@ struct x264_t
             ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
 
             /* 0 if not available */
+            ALIGNED_64( int misalign_pad_align_2 );
+            int misalign_pad_2;
             ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
             ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );
 
@@ -718,7 +734,11 @@ struct x264_t
     uint32_t (*nr_residual_sum)[64];
     uint32_t *nr_count;
 
+    ALIGNED_64( int misalign_pad_align_1 );
+    int misalign_pad_1;
     ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_64( int misalign_pad_align_2 );
+    int misalign_pad_2;
     ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
     uint32_t nr_count_buf[2][4];
 
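What the misalign_pad pairs above do: each ALIGNED_64 int opens a fresh 64-byte slot, and the plain int after it occupies the first bytes of that slot, so the next ALIGNED_16/ALIGNED_32 member is rounded up only to its declared alignment and lands 16- or 32-byte aligned but deliberately not 64-byte aligned. Any asm that silently assumed more alignment than was declared should then fault or fail checkasm. A minimal standalone sketch of the trick (hypothetical struct and member names, C11 _Alignas standing in for x264's DECLARE_ALIGNED):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the misalign_pad pattern used in the diff. */
struct pad_demo
{
    _Alignas(64) int misalign_pad_align; /* opens a fresh 64-byte slot    */
    int misalign_pad;                    /* fills bytes 4..7 of that slot */
    _Alignas(16) int16_t mv[64];         /* rounds up to offset 16: 16-byte
                                          * aligned, never 32- or 64-byte */
};

int main(void)
{
    /* prints 16: the member gets its declared alignment and nothing more */
    printf( "offsetof(mv) mod 64 = %zu\n", offsetof( struct pad_demo, mv ) % 64 );
    return 0;
}
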
diff --git a/common/osdep.h b/common/osdep.h
index fbc4801..31afba6 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -123,17 +123,29 @@ int x264_is_pipe( const char *path );
 // - Apple gcc only maintains 4 byte alignment
 // - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...
 
+#if 0
 #define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
     uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \
     type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
+#else
+#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
+    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + (mask*3+2)]; \
+    type (*name) __VA_ARGS__ = (void*)(((intptr_t)(name##_u+(mask*2+1)) & ~(mask*2+1)) + (mask+1))
+#define ALIGNED_ARRAY_EMU_ZERO( mask, attr, type, name, sub1, ... )\
+    attr uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + (mask*3+2)] = {0}; \
+    type (*name) __VA_ARGS__ = (void*)(((intptr_t)(name##_u+(mask*2+1)) & ~(mask*2+1)) + (mask+1))
+#define ALIGNED_ZERO_16( ... ) EXPAND( ALIGNED_ARRAY_EMU_ZERO( 15, __VA_ARGS__ ) )
+#define ALIGNED_ZERO_32( ... ) EXPAND( ALIGNED_ARRAY_EMU_ZERO( 31, __VA_ARGS__ ) )
+#define ALIGNED_ZERO_64( ... ) EXPAND( ALIGNED_ARRAY_EMU_ZERO( 63, __VA_ARGS__ ) )
+#endif
 
-#if ARCH_ARM && SYS_MACOSX
+#if 1 || ARCH_ARM && SYS_MACOSX
 #define ALIGNED_ARRAY_8( ... ) EXPAND( ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) )
 #else
 #define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
 #endif
 
-#if ARCH_ARM
+#if 1 || ARCH_ARM
 #define ALIGNED_ARRAY_16( ... ) EXPAND( ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) )
 #else
 #define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
@@ -145,12 +157,12 @@ int x264_is_pipe( const char *path );
 #define NATIVE_ALIGN 64
 #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
 #define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
-#if STACK_ALIGNMENT >= 32
+#if 0 && STACK_ALIGNMENT >= 32
 #define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
 #else
 #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
 #endif
-#if STACK_ALIGNMENT >= 64
+#if 0 && STACK_ALIGNMENT >= 64
 #define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
 #else
 #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
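The rewritten ALIGNED_ARRAY_EMU pins emulated stack arrays to exactly (mask+1)-byte alignment: it rounds the buffer address up to a 2*(mask+1) boundary, then steps (mask+1) past it, so the pointer is always (mask+1)-aligned and never 2*(mask+1)-aligned; the extra (mask*3+2) bytes of slack cover the worst-case offset of (mask*2+1)+(mask+1). A self-contained sketch of that arithmetic for mask = 15, i.e. the 16-byte case (hypothetical buffer, not the macro itself):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* same over-allocation as the macro: sizeof(array) + (mask*3+2) */
    uint8_t buf[256 + (15*3+2)];
    intptr_t base = (intptr_t)buf;
    /* round up to a 32-byte boundary, then step 16 bytes past it */
    intptr_t p = ((base + (15*2+1)) & ~(intptr_t)(15*2+1)) + (15+1);
    /* prints "0 16": 16-byte aligned, guaranteed not 32-byte aligned */
    printf( "%d %d\n", (int)(p % 16), (int)(p % 32) );
    return 0;
}

With the `#if 1 ||` and `#if 0 &&` toggles above forcing every ALIGNED_ARRAY_* through this emulation, all stack arrays get the same "declared alignment and nothing more" treatment as the padded struct members.
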
diff --git a/encoder/analyse.c b/encoder/analyse.c
index a9d7dc2..bcb6911 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -78,6 +78,8 @@ typedef struct
 
     int i_satd_i8x8;
     int i_cbp_i8x8_luma;
+    ALIGNED_64( int misalign_pad_align_1 );
+    int misalign_pad_1;
     ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
     int i_predict8x8[4];
 
@@ -558,7 +560,7 @@ static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra,
 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 static void inline psy_trellis_init( x264_t *h, int do_both_dct )
 {
-    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
+    ALIGNED_ZERO_16( static, pixel, zero,[16*FDEC_STRIDE] );
 
     if( do_both_dct || h->mb.b_transform_8x8 )
         h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
diff --git a/encoder/me.c b/encoder/me.c
index 9cb5a84..7babd73 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -633,7 +633,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             /* successive elimination by comparing DC before a full SAD,
              * because sum(abs(diff)) >= abs(diff(sum)). */
             uint16_t *sums_base = m->integral;
-            ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
+            ALIGNED_ZERO_16( static, pixel, zero,[8*FENC_STRIDE] );
             ALIGNED_ARRAY_16( int, enc_dc,[4] );
             int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
             int delta = x264_pixel_size[sad_size].w;
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 68ba0a2..a83d719 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -96,7 +96,7 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
     static const uint8_t satd_shift_x[3] = {3,   2,   2};
     static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
     static const uint8_t  satd_offset[3] = {0,   8,   16};
-    ALIGNED_16( static pixel zero[16] ) = {0};
+    ALIGNED_ZERO_16( static, pixel, zero,[16] );
     int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
                     + satd_offset[size - PIXEL_8x4];
     int res = h->mb.pic.fenc_satd_cache[cache_index];
@@ -123,7 +123,7 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
 
 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 {
-    ALIGNED_16( static pixel zero[16] ) = {0};
+    ALIGNED_ZERO_16( static, pixel, zero,[16] );
     int satd = 0;
     pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
     pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@@ -912,8 +912,8 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
                          const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
                          int b_chroma, int dc, int num_coefs, int idx, int b_8x8 )
 {
-    ALIGNED_16( dctcoef quant_coefs[2][16] );
-    ALIGNED_16( dctcoef coefs[16] ) = {0};
+    ALIGNED_ARRAY_16( dctcoef, quant_coefs,[2],[16] );
+    ALIGNED_ZERO_16( ,dctcoef, coefs,[16] );
     const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
     const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
     int delta_distortion[16];
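For reference, the new call sites expand per ALIGNED_ARRAY_EMU_ZERO above; a sketch of what ALIGNED_ZERO_16( static, pixel, zero,[16] ) becomes, whitespace aside. The zero-filled backing store stays static (so it remains zeroed across calls, like the old static array), while the misaligned pointer into it is recomputed on each call:

/* sketch of the expansion of ALIGNED_ZERO_16( static, pixel, zero,[16] ) */
static uint8_t zero_u [sizeof(pixel [16]) + (15*3+2)] = {0};
pixel (*zero) = (void*)(((intptr_t)(zero_u+(15*2+1)) & ~(15*2+1)) + (15+1));

The empty first argument in ALIGNED_ZERO_16( ,dctcoef, coefs,[16] ) is the attr slot left blank, yielding a plain (non-static) buffer; multi-dimensional call sites like ALIGNED_ARRAY_16( dctcoef, quant_coefs,[2],[16] ) produce a pointer-to-array, dctcoef (*quant_coefs)[16], so quant_coefs[i][j] indexing still works.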