Advertisement
Guest User

Untitled

a guest
Jul 1st, 2017
531
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 38.36 KB | None | 0 0
  1. From 7fde0735a8676fc36efc054abb55b4a3c9580773 Mon Sep 17 00:00:00 2001
  2. From: Daniel Kang <daniel.d.kang@gmail.com>
  3. Date: Thu, 25 Nov 2010 19:44:56 -0500
  4. Subject: [PATCH 1/8] predict_4x4_ddl
  5.  
  6. ---
  7.  common/x86/predict-a.asm |  108 +++++++++++++++++++++++++--------------------
  8.  common/x86/predict-c.c   |    9 +++-
  9.  2 files changed, 67 insertions(+), 50 deletions(-)
  10.  
  11. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  12. index a05e91b..739ce35 100644
  13. --- a/common/x86/predict-a.asm
  14. +++ b/common/x86/predict-a.asm
  15. @@ -40,6 +40,7 @@ SECTION .text
  16.  
  17.  cextern pb_1
  18.  cextern pb_3
  19. +cextern pw_1
  20.  cextern pw_2
  21.  cextern pw_4
  22.  cextern pw_8
  23. @@ -98,19 +99,16 @@ cextern pb_reverse
  24.  ; dest, left, right, src, tmp
  25.  ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  26.  %macro PRED8x8_LOWPASS0 6
  27. -    mov%6       %5, %2
  28. -    pavgb       %2, %3
  29. -    pxor        %3, %5
  30. -    mov%6       %1, %4
  31. -    pand        %3, [pb_1]
  32. -    psubusb     %2, %3
  33. -    pavgb       %1, %2
  34. +    mova        %6, %3
  35. +    pavg%1      %3, %4
  36. +    pxor        %4, %6
  37. +    mova        %2, %5
  38. +    pand        %4, [p%1_1]
  39. +    psubus%1    %3, %4
  40. +    pavg%1      %2, %3
  41.  %endmacro
  42. -%macro PRED8x8_LOWPASS 5
  43. -    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
  44. -%endmacro
  45. -%macro PRED8x8_LOWPASS_XMM 5
  46. -    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
  47. +%macro PRED8x8_LOWPASS 6
  48. +    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, %6
  49.  %endmacro
  50.  
  51.  %macro LOAD_PLANE_ARGS 0
  52. @@ -129,27 +127,37 @@ cextern pb_reverse
  53.  %endmacro
  54.  
  55.  ;-----------------------------------------------------------------------------
  56. -; void predict_4x4_ddl( uint8_t *src )
  57. +; void predict_4x4_ddl( pixel *src )
  58.  ;-----------------------------------------------------------------------------
  59. -cglobal predict_4x4_ddl_mmxext, 1,1
  60. -    movq    mm1, [r0-FDEC_STRIDE]
  61. -    movq    mm2, mm1
  62. -    movq    mm3, mm1
  63. -    movq    mm4, mm1
  64. -    psllq   mm1, 8
  65. -    pxor    mm2, mm1
  66. -    psrlq   mm2, 8
  67. -    pxor    mm3, mm2
  68. -    PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5
  69. +%macro PREDICT_4x4_DDL 4
  70. +cglobal predict_4x4_ddl_%1, 1,1
  71. +    mova    m1, [r0-SIZEOF_PIXEL*FDEC_STRIDE]
  72. +    mova    m2, m1
  73. +    mova    m3, m1
  74. +    mova    m4, m1
  75. +    psll%2  m1, %3
  76. +    pxor    m2, m1
  77. +    psrl%2  m2, %3
  78. +    pxor    m3, m2
  79. +    PRED8x8_LOWPASS %4, m0, m1, m3, m4, m5
  80.  
  81.  %assign Y 0
  82.  %rep 4
  83. -    psrlq       mm0, 8
  84. -    movd        [r0+Y*FDEC_STRIDE], mm0
  85. +    psrl%2      m0, %3
  86. +    movh        [r0+SIZEOF_PIXEL*Y*FDEC_STRIDE], m0
  87.  %assign Y (Y+1)
  88.  %endrep
  89.  
  90.      RET
  91. +%endmacro
  92. +
  93. +%ifdef HIGH_BIT_DEPTH
  94. +INIT_XMM
  95. +PREDICT_4x4_DDL sse2  , dq, 2, w
  96. +%else
  97. +INIT_MMX
  98. +PREDICT_4x4_DDL mmxext, q , 8, b
  99. +%endif
  100.  
  101.  ;-----------------------------------------------------------------------------
  102.  ; void predict_4x4_ddr( uint8_t *src )
  103. @@ -166,7 +174,7 @@ cglobal predict_4x4_ddr_%1, 1,1
  104.      PALIGNR   mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
  105.      movq      mm2, mm3
  106.      PALIGNR   mm3, [r0+3*FDEC_STRIDE-8], 7, mm4
  107. -    PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4
  108. +    PRED8x8_LOWPASS b, mm0, mm3, mm1, mm2, mm4
  109.  %assign Y 3
  110.      movd    [r0+Y*FDEC_STRIDE], mm0
  111.  %rep 3
  112. @@ -186,7 +194,7 @@ cglobal predict_4x4_vr_%1, 1,1
  113.      PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2    ; ..t3t2t1t0ltl0l1
  114.      movq    mm2, mm0
  115.      PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3    ; t3t2t1t0ltl0l1l2
  116. -    PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
  117. +    PRED8x8_LOWPASS b, mm3, mm1, mm0, mm2, mm4
  118.      movq    mm1, mm3
  119.      psrlq   mm3, 16
  120.      psllq   mm1, 48
  121. @@ -215,7 +223,7 @@ cglobal predict_4x4_hd_%1, 1,1
  122.      psrlq     mm0, 16                   ; .. .. t2 t1 t0 lt l0 l1
  123.      psrlq     mm2, 8                    ; .. t2 t1 t0 lt l0 l1 l2
  124.      pavgb     mm7, mm2
  125. -    PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
  126. +    PRED8x8_LOWPASS b, mm3, mm1, mm0, mm2, mm4
  127.      punpcklbw mm7, mm3
  128.      psrlq     mm3, 32
  129.      PALIGNR   mm3, mm7, 6, mm6
  130. @@ -230,6 +238,7 @@ cglobal predict_4x4_hd_%1, 1,1
  131.      RET
  132.  %endmacro
  133.  
  134. +INIT_MMX
  135.  %define PALIGNR PALIGNR_MMX
  136.  PREDICT_4x4 mmxext
  137.  %define PALIGNR PALIGNR_SSSE3
  138. @@ -254,7 +263,7 @@ cglobal predict_4x4_hu_mmxext, 1,1
  139.      psrlq     mm2, 16
  140.      psrlq     mm3, 8
  141.      pavgb     mm7, mm3
  142. -    PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5
  143. +    PRED8x8_LOWPASS b, mm4, mm0, mm2, mm3, mm5
  144.      punpcklbw mm7, mm4
  145.  %assign Y 0
  146.      movd    [r0+Y*FDEC_STRIDE], mm7
  147. @@ -278,7 +287,7 @@ cglobal predict_4x4_vl_mmxext, 1,1
  148.      movq        mm4, mm3
  149.      pavgb       mm4, mm1
  150.  
  151. -    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
  152. +    PRED8x8_LOWPASS b, mm0, mm1, mm2, mm3, mm5
  153.  
  154.      movd        [r0+0*FDEC_STRIDE], mm4
  155.      movd        [r0+1*FDEC_STRIDE], mm0
  156. @@ -353,10 +362,10 @@ cglobal predict_8x8_filter_%1, 4,5
  157.      je .fix_lt_1
  158.  .do_left:
  159.      movq        mm0, mm4
  160. -    PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5
  161. +    PRED8x8_LOWPASS b, mm2, mm1, mm4, mm3, mm5
  162.      movq     [t1+8], mm2
  163.      movq        mm4, mm0
  164. -    PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5
  165. +    PRED8x8_LOWPASS b, mm1, mm3, mm0, mm4, mm5
  166.      movd         t4, mm1
  167.      mov      [t1+7], t4b
  168.  .check_top:
  169. @@ -374,7 +383,7 @@ cglobal predict_8x8_filter_%1, 4,5
  170.      test        r2b, 0x04
  171.      je .fix_tr_1
  172.  .do_top:
  173. -    PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5
  174. +    PRED8x8_LOWPASS b, mm4, mm2, mm1, mm3, mm5
  175.      movq    [t1+16], mm4
  176.      test        r3b, 0x04
  177.      je .done
  178. @@ -387,7 +396,7 @@ cglobal predict_8x8_filter_%1, 4,5
  179.      psrlq       mm5, 56
  180.      PALIGNR     mm2, mm3, 7, mm3
  181.      PALIGNR     mm5, mm4, 1, mm4
  182. -    PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4
  183. +    PRED8x8_LOWPASS b, mm1, mm2, mm5, mm0, mm4
  184.      jmp .do_topright
  185.  .fix_tr_2:
  186.      punpckhbw   mm3, mm3
  187. @@ -424,6 +433,7 @@ cglobal predict_8x8_filter_%1, 4,5
  188.  %endmacro
  189.  
  190.  %define PALIGNR PALIGNR_MMX
  191. +INIT_MMX
  192.  PREDICT_FILTER mmxext
  193.  %define PALIGNR PALIGNR_SSSE3
  194.  PREDICT_FILTER ssse3
  195. @@ -511,8 +521,8 @@ cglobal predict_8x8_ddl_mmxext, 2,2
  196.      movq        mm1, mm5
  197.      psllq       mm1, 8
  198.      add          r0, FDEC_STRIDE*4
  199. -    PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
  200. -    PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
  201. +    PRED8x8_LOWPASS b, mm0, mm1, mm2, mm5, mm7
  202. +    PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+24], mm6
  203.  %assign Y 3
  204.  %rep 6
  205.      movq        [r0+Y*FDEC_STRIDE], mm1
  206. @@ -540,8 +550,8 @@ cglobal predict_8x8_ddr_mmxext, 2,2
  207.      movq        mm3, [r1+15]
  208.      movq        mm4, [r1+17]
  209.      add          r0, FDEC_STRIDE*4
  210. -    PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
  211. -    PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
  212. +    PRED8x8_LOWPASS b, mm0, mm1, mm2, [r1+8], mm7
  213. +    PRED8x8_LOWPASS b, mm1, mm3, mm4, [r1+16], mm6
  214.  %assign Y 3
  215.  %rep 6
  216.      movq        [r0+Y*FDEC_STRIDE], mm0
  217. @@ -582,7 +592,7 @@ cglobal predict_8x8_hu_mmxext, 2,2
  218.      punpckhbw mm1, mm1
  219.      por     mm3, mm1            ; l7 l7 l7 l6 l5 l4 l3 l2
  220.      pavgb   mm4, mm2
  221. -    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
  222. +    PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
  223.      movq    mm5, mm4
  224.      punpcklbw mm4, mm1          ; p4 p3 p2 p1
  225.      punpckhbw mm5, mm1          ; p8 p7 p6 p5
  226. @@ -627,7 +637,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
  227.      movq        mm4, mm3
  228.      pavgb       mm3, mm2
  229.      add          r0, FDEC_STRIDE*4
  230. -    PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
  231. +    PRED8x8_LOWPASS b, mm0, mm1, mm2, mm4, mm7
  232.  
  233.  %assign Y -4
  234.  %rep 3
  235. @@ -714,6 +724,7 @@ ALIGN 4
  236.  
  237.  %endif ; !ARCH_X86_64
  238.  
  239. +INIT_XMM
  240.  ;-----------------------------------------------------------------------------
  241.  ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
  242.  ;-----------------------------------------------------------------------------
  243. @@ -723,7 +734,7 @@ cglobal predict_8x8_ddl_sse2, 2,2
  244.      movdqa      xmm1, xmm3
  245.      pslldq      xmm1, 1
  246.      add          r0, FDEC_STRIDE*4
  247. -    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
  248. +    PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
  249.  
  250.  %assign Y -4
  251.  %rep 8
  252. @@ -742,7 +753,7 @@ cglobal predict_8x8_ddr_sse2, 2,2
  253.      movdqa      xmm2, xmm3
  254.      psrldq      xmm2, 1
  255.      add           r0, FDEC_STRIDE*4
  256. -    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
  257. +    PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm4
  258.  
  259.      movdqa      xmm1, xmm0
  260.      psrldq      xmm1, 1
  261. @@ -771,7 +782,7 @@ cglobal predict_8x8_vl_sse2, 2,2
  262.      pslldq      xmm1, 1
  263.      pavgb       xmm3, xmm2
  264.      add           r0, FDEC_STRIDE*4
  265. -    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
  266. +    PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm4, xmm5
  267.  ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
  268.  ; xmm3: (t0 + t1 + 1) >> 1
  269.  
  270. @@ -802,7 +813,7 @@ cglobal predict_8x8_vr_sse2, 2,2,7
  271.      pslldq      xmm0, 1
  272.      pslldq      xmm1, 2
  273.      pavgb       xmm2, xmm0
  274. -    PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5
  275. +    PRED8x8_LOWPASS b, xmm4, xmm3, xmm1, xmm0, xmm5
  276.      pandn       xmm6, xmm4
  277.      movdqa      xmm5, xmm4
  278.      psrlw       xmm4, 8
  279. @@ -824,6 +835,7 @@ cglobal predict_8x8_vr_sse2, 2,2,7
  280.  %endrep
  281.      RET
  282.  
  283. +INIT_MMX
  284.  ;-----------------------------------------------------------------------------
  285.  ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
  286.  ;-----------------------------------------------------------------------------
  287. @@ -840,12 +852,12 @@ cglobal predict_8x8_hd_mmxext, 2,2
  288.      PALIGNR mm4, mm3, 1, mm7    ; t0 lt l0 l1 l2 l3 l4 l5
  289.      movq    mm5, mm3
  290.      pavgb   mm3, mm1
  291. -    PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7
  292. +    PRED8x8_LOWPASS b, mm0, mm4, mm1, mm5, mm7
  293.      movq    mm4, mm2
  294.      movq    mm1, mm2            ; t6 t5 t4 t3 t2 t1 t0 lt
  295.      psrlq   mm4, 16             ; .. .. t6 t5 t4 t3 t2 t1
  296.      psrlq   mm1, 8              ; .. t6 t5 t4 t3 t2 t1 t0
  297. -    PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5
  298. +    PRED8x8_LOWPASS b, mm6, mm4, mm2, mm1, mm5
  299.                                  ; .. p11 p10 p9
  300.      movq    mm7, mm3
  301.      punpcklbw mm3, mm0          ; p4 p3 p2 p1
  302. @@ -886,7 +898,7 @@ cglobal predict_8x8_hd_%1, 2,2
  303.      PALIGNR xmm3, xmm0, 8, xmm0
  304.      movdqa  xmm4, xmm1
  305.      pavgb   xmm4, xmm3
  306. -    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5
  307. +    PRED8x8_LOWPASS b, xmm0, xmm1, xmm2, xmm3, xmm5
  308.      punpcklbw xmm4, xmm0
  309.      movhlps xmm0, xmm4
  310.  
  311. @@ -946,7 +958,7 @@ cglobal predict_8x8_hu_%1, 2,2
  312.      por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
  313.  %endif
  314.      pavgb     mm4, mm2
  315. -    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
  316. +    PRED8x8_LOWPASS b, mm1, mm3, mm5, mm2, mm6
  317.  
  318.      movq2dq   xmm0, mm4
  319.      movq2dq   xmm1, mm1
  320. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  321. index 8eafcc0..077f2c7 100644
  322. --- a/common/x86/predict-c.c
  323. +++ b/common/x86/predict-c.c
  324. @@ -64,6 +64,7 @@
  325.   void x264_predict_8x8_filter_mmxext( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
  326.   void x264_predict_8x8_filter_ssse3( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
  327.   void x264_predict_4x4_ddl_mmxext( uint8_t *src );
  328. + void x264_predict_4x4_ddl_sse2( uint16_t *src );
  329.   void x264_predict_4x4_ddr_mmxext( uint8_t *src );
  330.   void x264_predict_4x4_vl_mmxext( uint8_t *src );
  331.   void x264_predict_4x4_vr_mmxext( uint8_t *src );
  332. @@ -474,7 +475,11 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  333.  {
  334.      if( !(cpu&X264_CPU_MMXEXT) )
  335.          return;
  336. -#if !HIGH_BIT_DEPTH
  337. +#if HIGH_BIT_DEPTH
  338. +    if( !(cpu&X264_CPU_SSE2) )
  339. +        return;
  340. +    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
  341. +#else
  342.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
  343.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
  344.      pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmxext;
  345. @@ -487,5 +492,5 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  346.      pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
  347.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
  348.      pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
  349. -#endif // !HIGH_BIT_DEPTH
  350. +#endif // HIGH_BIT_DEPTH
  351.  }
  352. --
  353. 1.7.2.3
  354.  
  355.  
  356. From ad274f6257b3205448a2dbb4ad77a6d0a51b722b Mon Sep 17 00:00:00 2001
  357. From: Daniel Kang <daniel.d.kang@gmail.com>
  358. Date: Thu, 25 Nov 2010 19:57:30 -0500
  359. Subject: [PATCH 2/8] predict_4x4_hu
  360.  
  361. ---
  362. common/x86/predict-a.asm |   36 +++++++++++++++++++++++++++++++++++-
  363.  common/x86/predict-c.c   |    2 ++
  364.  2 files changed, 37 insertions(+), 1 deletions(-)
  365.  
  366. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  367. index 739ce35..f2ab91e 100644
  368. --- a/common/x86/predict-a.asm
  369. +++ b/common/x86/predict-a.asm
  370. @@ -245,8 +245,40 @@ PREDICT_4x4 mmxext
  371.  PREDICT_4x4 ssse3
  372.  
  373.  ;-----------------------------------------------------------------------------
  374. -; void predict_4x4_hu( uint8_t *src )
  375. +; void predict_4x4_hu( pixel *src )
  376.  ;-----------------------------------------------------------------------------
  377. +%ifdef HIGH_BIT_DEPTH
  378. +INIT_XMM
  379. +cglobal predict_4x4_hu_sse2, 1,1,7
  380. +    mova       m0, [r0+2*0*FDEC_STRIDE-8*2]
  381. +    punpckhwd  m0, [r0+2*1*FDEC_STRIDE-8*2]
  382. +    mova       m1, [r0+2*2*FDEC_STRIDE-8*2]
  383. +    punpckhwd  m1, [r0+2*3*FDEC_STRIDE-8*2]
  384. +    punpckhdq  m0, m1
  385. +    mova       m1, m0
  386. +    pshufhw    m1, m1, 0xFF
  387. +    punpckhqdq m1, m1
  388. +    punpckhqdq m0, m1
  389. +    mova       m2, m0
  390. +    mova       m3, m0
  391. +    mova       m6, m0
  392. +    psrldq     m2, 4
  393. +    psrldq     m3, 2
  394. +    pavgw      m6, m3
  395. +    PRED8x8_LOWPASS w, m4, m0, m2, m3, m5
  396. +    punpcklwd  m6, m4
  397. +%assign Y 0
  398. +    movq    [r0+Y*2*FDEC_STRIDE], m6
  399. +%rep 2
  400. +%assign Y (Y+1)
  401. +    psrldq   m6, 4
  402. +    movq    [r0+2*Y*FDEC_STRIDE], m6
  403. +%endrep
  404. +    movq    [r0+2*3*FDEC_STRIDE], m1
  405. +    RET
  406. +
  407. +%else
  408. +INIT_MMX
  409.  cglobal predict_4x4_hu_mmxext, 1,1
  410.      movq      mm0, [r0+0*FDEC_STRIDE-8]
  411.      punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
  412. @@ -274,10 +306,12 @@ cglobal predict_4x4_hu_mmxext, 1,1
  413.  %endrep
  414.      movd    [r0+3*FDEC_STRIDE], mm1
  415.      RET
  416. +%endif ;HIGH_BIT_DEPTH
  417.  
  418.  ;-----------------------------------------------------------------------------
  419.  ; void predict_4x4_vl( uint8_t *src )
  420.  ;-----------------------------------------------------------------------------
  421. +INIT_MMX
  422.  cglobal predict_4x4_vl_mmxext, 1,1
  423.      movq        mm1, [r0-FDEC_STRIDE]
  424.      movq        mm3, mm1
  425. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  426. index 077f2c7..8f15e4e 100644
  427. --- a/common/x86/predict-c.c
  428. +++ b/common/x86/predict-c.c
  429. @@ -74,6 +74,7 @@
  430.   void x264_predict_4x4_dc_mmxext( uint8_t *src );
  431.   void x264_predict_4x4_ddr_ssse3( uint8_t *src );
  432.   void x264_predict_4x4_hu_mmxext( uint8_t *src );
  433. + void x264_predict_4x4_hu_sse2( uint16_t *src );
  434.   void x264_predict_16x16_dc_top_sse2( uint8_t *src );
  435.   void x264_predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left );
  436.   void x264_predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
  437. @@ -479,6 +480,7 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  438.      if( !(cpu&X264_CPU_SSE2) )
  439.          return;
  440.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
  441. +    pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_sse2;
  442.  #else
  443.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
  444.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
  445. --
  446. 1.7.2.3
  447.  
  448.  
  449. From e5ff6fb0be3ac1a19b9414db5e83589d6c5a0b4a Mon Sep 17 00:00:00 2001
  450. From: Daniel Kang <daniel.d.kang@gmail.com>
  451. Date: Fri, 26 Nov 2010 00:24:28 -0500
  452. Subject: [PATCH 3/8] predict_4x4_vl
  453.  
  454. ---
  455. common/x86/predict-a.asm |   43 ++++++++++++++++++++++++++-----------------
  456.  common/x86/predict-c.c   |    2 ++
  457.  2 files changed, 28 insertions(+), 17 deletions(-)
  458.  
  459. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  460. index f2ab91e..acdcca3 100644
  461. --- a/common/x86/predict-a.asm
  462. +++ b/common/x86/predict-a.asm
  463. @@ -309,28 +309,37 @@ cglobal predict_4x4_hu_mmxext, 1,1
  464.  %endif ;HIGH_BIT_DEPTH
  465.  
  466.  ;-----------------------------------------------------------------------------
  467. -; void predict_4x4_vl( uint8_t *src )
  468. -;-----------------------------------------------------------------------------
  469. -INIT_MMX
  470. -cglobal predict_4x4_vl_mmxext, 1,1
  471. -    movq        mm1, [r0-FDEC_STRIDE]
  472. -    movq        mm3, mm1
  473. -    movq        mm2, mm1
  474. -    psrlq       mm3, 8
  475. -    psrlq       mm2, 16
  476. -    movq        mm4, mm3
  477. -    pavgb       mm4, mm1
  478. -
  479. -    PRED8x8_LOWPASS b, mm0, mm1, mm2, mm3, mm5
  480. -
  481. -    movd        [r0+0*FDEC_STRIDE], mm4
  482. -    movd        [r0+1*FDEC_STRIDE], mm0
  483. -    psrlq       mm4, 8
  484. -    psrlq       mm0, 8
  485. -    movd        [r0+2*FDEC_STRIDE], mm4
  486. -    movd        [r0+3*FDEC_STRIDE], mm0
  487. +; void predict_4x4_vl( pixel *src )
  488. +;-----------------------------------------------------------------------------
  489. +%macro PREDICT_4x4_V1 4
  490. +cglobal predict_4x4_vl_%1, 1,1
  491. +    mova        m1, [r0-SIZEOF_PIXEL*FDEC_STRIDE]
  492. +    mova        m3, m1
  493. +    mova        m2, m1
  494. +    psrl%2      m3, %3
  495. +    psrl%2      m2, %3*2
  496. +    mova        m4, m3
  497. +    pavg%4      m4, m1
  498. +
  499. +    PRED8x8_LOWPASS %4, m0, m1, m2, m3, m5
  500. +
  501. +    movh        [r0+SIZEOF_PIXEL*0*FDEC_STRIDE], m4
  502. +    movh        [r0+SIZEOF_PIXEL*1*FDEC_STRIDE], m0
  503. +    psrl%2      m4, %3
  504. +    psrl%2      m0, %3
  505. +    movh        [r0+SIZEOF_PIXEL*2*FDEC_STRIDE], m4
  506. +    movh        [r0+SIZEOF_PIXEL*3*FDEC_STRIDE], m0
  507.  
  508.      RET
  509. +%endmacro
  510. +
  511. +%ifdef HIGH_BIT_DEPTH
  512. +INIT_XMM
  513. +PREDICT_4x4_V1 sse2  , dq, 2, w
  514. +%else
  515. +INIT_MMX
  516. +PREDICT_4x4_V1 mmxext, q , 8, b
  517. +%endif
  518.  
  519.  ;-----------------------------------------------------------------------------
  520.  ; void predict_4x4_dc( uint8_t *src )
  521. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  522. index 8f15e4e..75843a0 100644
  523. --- a/common/x86/predict-c.c
  524. +++ b/common/x86/predict-c.c
  525. @@ -67,6 +67,7 @@
  526.   void x264_predict_4x4_ddl_sse2( uint16_t *src );
  527.   void x264_predict_4x4_ddr_mmxext( uint8_t *src );
  528.   void x264_predict_4x4_vl_mmxext( uint8_t *src );
  529. + void x264_predict_4x4_vl_sse2( uint16_t *src );
  530.   void x264_predict_4x4_vr_mmxext( uint8_t *src );
  531.   void x264_predict_4x4_vr_ssse3( uint8_t *src );
  532.   void x264_predict_4x4_hd_mmxext( uint8_t *src );
  533. @@ -481,6 +482,7 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  534.          return;
  535.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
  536.      pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_sse2;
  537. +    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_sse2;
  538.  #else
  539.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
  540.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
  541. --
  542. 1.7.2.3
  543.  
  544.  
  545. From d9df9a5c8ae6d55cf5bb814b99ab98f6ba517b0d Mon Sep 17 00:00:00 2001
  546. From: Daniel Kang <daniel.d.kang@gmail.com>
  547. Date: Fri, 26 Nov 2010 02:29:40 -0500
  548. Subject: [PATCH 4/8] predict_8x8_v
  549.  
  550. ---
  551. common/x86/predict-a.asm |   36 +++++++++++++++++++++++-------------
  552.  common/x86/predict-c.c   |    9 +++++++--
  553.  2 files changed, 30 insertions(+), 15 deletions(-)
  554.  
  555. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  556. index acdcca3..8d0ffd7 100644
  557. --- a/common/x86/predict-a.asm
  558. +++ b/common/x86/predict-a.asm
  559. @@ -48,15 +48,15 @@ cextern pw_ff00
  560.  cextern pb_reverse
  561.  
  562.  %macro STORE8x8 2
  563. -    add r0, 4*FDEC_STRIDE
  564. -    movq        [r0 + -4*FDEC_STRIDE], %1
  565. -    movq        [r0 + -3*FDEC_STRIDE], %1
  566. -    movq        [r0 + -2*FDEC_STRIDE], %1
  567. -    movq        [r0 + -1*FDEC_STRIDE], %1
  568. -    movq        [r0 +  0*FDEC_STRIDE], %2
  569. -    movq        [r0 +  1*FDEC_STRIDE], %2
  570. -    movq        [r0 +  2*FDEC_STRIDE], %2
  571. -    movq        [r0 +  3*FDEC_STRIDE], %2
  572. +    add r0, 4*SIZEOF_PIXEL*FDEC_STRIDE
  573. +    mova        [r0 + -4*SIZEOF_PIXEL*FDEC_STRIDE], %1
  574. +    mova        [r0 + -3*SIZEOF_PIXEL*FDEC_STRIDE], %1
  575. +    mova        [r0 + -2*SIZEOF_PIXEL*FDEC_STRIDE], %1
  576. +    mova        [r0 + -1*SIZEOF_PIXEL*FDEC_STRIDE], %1
  577. +    mova        [r0 +  0*SIZEOF_PIXEL*FDEC_STRIDE], %2
  578. +    mova        [r0 +  1*SIZEOF_PIXEL*FDEC_STRIDE], %2
  579. +    mova        [r0 +  2*SIZEOF_PIXEL*FDEC_STRIDE], %2
  580. +    mova        [r0 +  3*SIZEOF_PIXEL*FDEC_STRIDE], %2
  581.  %endmacro
  582.  
  583.  %macro STORE16x16 2
  584. @@ -482,12 +482,22 @@ PREDICT_FILTER mmxext
  585.  PREDICT_FILTER ssse3
  586.  
  587.  ;-----------------------------------------------------------------------------
  588. -; void predict_8x8_v( uint8_t *src, uint8_t *edge )
  589. +; void predict_8x8_v( pixel *src, pixel *edge )
  590.  ;-----------------------------------------------------------------------------
  591. -cglobal predict_8x8_v_mmxext, 2,2
  592. -    movq        mm0, [r1+16]
  593. -    STORE8x8    mm0, mm0
  594. +%macro PREDICT_8x8_V 1
  595. +cglobal predict_8x8_v_%1, 2,2
  596. +    mova        m0, [r1+SIZEOF_PIXEL*16]
  597. +    STORE8x8    m0, m0
  598.      RET
  599. +%endmacro
  600. +
  601. +%ifdef HIGH_BIT_DEPTH
  602. +INIT_XMM
  603. +PREDICT_8x8_V sse2
  604. +%else
  605. +INIT_MMX
  606. +PREDICT_8x8_V mmxext
  607. +%endif
  608.  
  609.  ;-----------------------------------------------------------------------------
  610.  ; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
  611. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  612. index 75843a0..90fd0c7 100644
  613. --- a/common/x86/predict-c.c
  614. +++ b/common/x86/predict-c.c
  615. @@ -44,6 +44,7 @@
  616.   void x264_predict_8x8c_h_mmxext( uint8_t *src );
  617.   void x264_predict_8x8c_h_ssse3( uint8_t *src );
  618.   void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
  619. + void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[33] );
  620.   void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
  621.   void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
  622.   void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
  623. @@ -443,7 +444,11 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
  624.  {
  625.      if( !(cpu&X264_CPU_MMXEXT) )
  626.          return;
  627. -#if !HIGH_BIT_DEPTH
  628. +#if HIGH_BIT_DEPTH
  629. +    if( !(cpu&X264_CPU_SSE2) )
  630. +        return;
  631. +    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse2;
  632. +#else
  633.      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
  634.      pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
  635.      pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmxext;
  636. @@ -470,7 +475,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
  637.      pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_ssse3;
  638.      pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
  639.      *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
  640. -#endif // !HIGH_BIT_DEPTH
  641. +#endif // HIGH_BIT_DEPTH
  642.  }
  643.  
  644.  void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  645. --
  646. 1.7.2.3
  647.  
  648.  
  649. From 44ec73f438c89ad86c2c73706045ea92da967ec1 Mon Sep 17 00:00:00 2001
  650. From: Daniel Kang <daniel.d.kang@gmail.com>
  651. Date: Fri, 26 Nov 2010 03:30:32 -0500
  652. Subject: [PATCH 5/8] predict_8x8_h
  653.  
  654. ---
  655. common/x86/predict-a.asm |   43 ++++++++++++++++++++++++++-----------------
  656.  common/x86/predict-c.c   |    2 ++
  657.  2 files changed, 28 insertions(+), 17 deletions(-)
  658.  
  659. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  660. index 8d0ffd7..cedc3dd 100644
  661. --- a/common/x86/predict-a.asm
  662. +++ b/common/x86/predict-a.asm
  663. @@ -500,31 +500,40 @@ PREDICT_8x8_V mmxext
  664.  %endif
  665.  
  666.  ;-----------------------------------------------------------------------------
  667. -; void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
  668. +; void predict_8x8_h( pixel *src, pixel edge[33] )
  669.  ;-----------------------------------------------------------------------------
  670. -
  671. -INIT_MMX
  672. -cglobal predict_8x8_h_mmxext, 2,2
  673. -    movu   m3, [r1+7]
  674. -    add    r0, FDEC_STRIDE*4
  675. +%macro PREDICT_8x8_H 3
  676. +cglobal predict_8x8_h_%1, 2, 2
  677. +    movu   m3, [r1+SIZEOF_PIXEL*7]
  678. +    add    r0, SIZEOF_PIXEL*FDEC_STRIDE*4
  679.      mova   m7, m3
  680. -    punpckhbw m3, m3
  681. -    punpcklbw m7, m7
  682. -    pshufw m0, m3, 0xff
  683. -    pshufw m1, m3, 0xaa
  684. -    pshufw m2, m3, 0x55
  685. -    pshufw m3, m3, 0x00
  686. -    pshufw m4, m7, 0xff
  687. -    pshufw m5, m7, 0xaa
  688. -    pshufw m6, m7, 0x55
  689. -    pshufw m7, m7, 0x00
  690. +    punpckh%2 m3, m3
  691. +    punpckl%2 m7, m7
  692. +    pshuf%3 m0, m3, 0xff
  693. +    pshuf%3 m1, m3, 0xaa
  694. +    pshuf%3 m2, m3, 0x55
  695. +    pshuf%3 m3, m3, 0x00
  696. +    pshuf%3 m4, m7, 0xff
  697. +    pshuf%3 m5, m7, 0xaa
  698. +    pshuf%3 m6, m7, 0x55
  699. +    pshuf%3 m7, m7, 0x00
  700.  %assign n 0
  701.  %rep 8
  702. -    mova [r0+(n-4)*FDEC_STRIDE], m %+ n
  703. +    mova [r0+(n-4)*SIZEOF_PIXEL*FDEC_STRIDE], m %+ n
  704.  %assign n n+1
  705.  %endrep
  706.      RET
  707. +%endmacro
  708.  
  709. +%ifdef HIGH_BIT_DEPTH
  710. +INIT_XMM
  711. +PREDICT_8x8_H sse2  , wd, d
  712. +%else
  713. +INIT_MMX
  714. +PREDICT_8x8_H mmxext, bw, w
  715. +%endif
  716. +
  717. +INIT_MMX
  718.  ;-----------------------------------------------------------------------------
  719.  ; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
  720.  ;-----------------------------------------------------------------------------
  721. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  722. index 90fd0c7..e5a3fa7 100644
  723. --- a/common/x86/predict-c.c
  724. +++ b/common/x86/predict-c.c
  725. @@ -46,6 +46,7 @@
  726.   void x264_predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
  727.   void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[33] );
  728.   void x264_predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
  729. + void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[33] );
  730.   void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
  731.   void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
  732.   void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
  733. @@ -448,6 +449,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
  734.      if( !(cpu&X264_CPU_SSE2) )
  735.          return;
  736.      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse2;
  737. +    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_sse2;
  738.  #else
  739.      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
  740.      pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
  741. --
  742. 1.7.2.3
  743.  
  744.  
  745. From 0933e04c859c9f4bcd93dd6e03984d09dde314b0 Mon Sep 17 00:00:00 2001
  746. From: Daniel Kang <daniel.d.kang@gmail.com>
  747. Date: Fri, 26 Nov 2010 13:57:21 -0500
  748. Subject: [PATCH 6/8] predict_4x4_ddr, predict_4x4_vr, predict_4x4_hd
  749.  
  750. ---
  751. common/x86/predict-a.asm |  132 ++++++++++++++++++++++++---------------------
  752.  common/x86/predict-c.c   |    5 ++
  753.  2 files changed, 75 insertions(+), 62 deletions(-)
  754.  
  755. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  756. index cedc3dd..6bd845d 100644
  757. --- a/common/x86/predict-a.asm
  758. +++ b/common/x86/predict-a.asm
  759. @@ -160,89 +160,97 @@ PREDICT_4x4_DDL mmxext, q , 8, b
  760.  %endif
  761.  
  762.  ;-----------------------------------------------------------------------------
  763. -; void predict_4x4_ddr( uint8_t *src )
  764. +; void predict_4x4_ddr( pixel *src )
  765.  ;-----------------------------------------------------------------------------
  766. -%macro PREDICT_4x4 1
  767. +%macro PREDICT_4x4 7
  768.  cglobal predict_4x4_ddr_%1, 1,1
  769. -    movq      mm1, [r0+1*FDEC_STRIDE-8]
  770. -    movq      mm2, [r0+0*FDEC_STRIDE-8]
  771. -    punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
  772. -    movd      mm3, [r0-1*FDEC_STRIDE]
  773. -    punpckhwd mm1, mm2
  774. -    PALIGNR   mm3, mm1, 5, mm1
  775. -    movq      mm1, mm3
  776. -    PALIGNR   mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
  777. -    movq      mm2, mm3
  778. -    PALIGNR   mm3, [r0+3*FDEC_STRIDE-8], 7, mm4
  779. -    PRED8x8_LOWPASS b, mm0, mm3, mm1, mm2, mm4
  780. +    mova      m1, [r0+1*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL]
  781. +    mova      m2, [r0+0*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL]
  782. +    punpckh%2 m2, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL]
  783. +    movh      m3, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE]
  784. +    punpckh%3 m1, m2
  785. +    PALIGNR   m3, m1, 5*SIZEOF_PIXEL, m1
  786. +    mova      m1, m3
  787. +    PALIGNR   m3, [r0+2*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
  788. +    mova      m2, m3
  789. +    PALIGNR   m3, [r0+3*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m4
  790. +    PRED8x8_LOWPASS %5, m0, m3, m1, m2, m4
  791.  %assign Y 3
  792. -    movd    [r0+Y*FDEC_STRIDE], mm0
  793. +    movh    [r0+Y*SIZEOF_PIXEL*FDEC_STRIDE], m0
  794.  %rep 3
  795.  %assign Y (Y-1)
  796. -    psrlq    mm0, 8
  797. -    movd    [r0+Y*FDEC_STRIDE], mm0
  798. +    psrl%4   m0, %7
  799. +    movh    [r0+Y*SIZEOF_PIXEL*FDEC_STRIDE], m0
  800.  %endrep
  801.      RET
  802.  
  803.  cglobal predict_4x4_vr_%1, 1,1
  804. -    movd    mm0, [r0-1*FDEC_STRIDE]              ; ........t3t2t1t0
  805. -    movq    mm7, mm0
  806. -    PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1    ; ......t3t2t1t0lt
  807. -    pavgb   mm7, mm0
  808. -    PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1    ; ....t3t2t1t0ltl0
  809. -    movq    mm1, mm0
  810. -    PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2    ; ..t3t2t1t0ltl0l1
  811. -    movq    mm2, mm0
  812. -    PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3    ; t3t2t1t0ltl0l1l2
  813. -    PRED8x8_LOWPASS b, mm3, mm1, mm0, mm2, mm4
  814. -    movq    mm1, mm3
  815. -    psrlq   mm3, 16
  816. -    psllq   mm1, 48
  817. -    movd   [r0+0*FDEC_STRIDE], mm7
  818. -    movd   [r0+1*FDEC_STRIDE], mm3
  819. -    PALIGNR mm7, mm1, 7, mm2
  820. -    psllq   mm1, 8
  821. -    movd   [r0+2*FDEC_STRIDE], mm7
  822. -    PALIGNR mm3, mm1, 7, mm1
  823. -    movd   [r0+3*FDEC_STRIDE], mm3
  824. +    movh    m0, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE]              ; ........t3t2t1t0
  825. +    mova    m5, m0
  826. +    PALIGNR m0, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1    ; ......t3t2t1t0lt
  827. +    pavg%5  m5, m0
  828. +    PALIGNR m0, [r0+0*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m1    ; ....t3t2t1t0ltl0
  829. +    mova    m1, m0
  830. +    PALIGNR m0, [r0+1*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m2    ; ..t3t2t1t0ltl0l1
  831. +    mova    m2, m0
  832. +    PALIGNR m0, [r0+2*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL], 7*SIZEOF_PIXEL, m3    ; t3t2t1t0ltl0l1l2
  833. +    PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
  834. +    mova    m1, m3
  835. +    psrl%4  m3, %7*2
  836. +    psll%4  m1, %7*6
  837. +    movh   [r0+0*SIZEOF_PIXEL*FDEC_STRIDE], m5
  838. +    movh   [r0+1*SIZEOF_PIXEL*FDEC_STRIDE], m3
  839. +    PALIGNR m5, m1, 7*SIZEOF_PIXEL, m2
  840. +    psll%4  m1, %7
  841. +    movh   [r0+2*SIZEOF_PIXEL*FDEC_STRIDE], m5
  842. +    PALIGNR m3, m1, 7*SIZEOF_PIXEL, m1
  843. +    movh   [r0+3*SIZEOF_PIXEL*FDEC_STRIDE], m3
  844.      RET
  845.  
  846. -cglobal predict_4x4_hd_%1, 1,1
  847. -    movd      mm0, [r0-1*FDEC_STRIDE-4] ; lt ..
  848. -    punpckldq mm0, [r0-1*FDEC_STRIDE]   ; t3 t2 t1 t0 lt .. .. ..
  849. -    psllq     mm0, 8                    ; t2 t1 t0 lt .. .. .. ..
  850. -    movq      mm1, [r0+3*FDEC_STRIDE-8] ; l3
  851. -    punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3
  852. -    movq      mm2, [r0+1*FDEC_STRIDE-8] ; l1
  853. -    punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1
  854. -    punpckhwd mm1, mm2                  ; l0 l1 l2 l3
  855. -    punpckhdq mm1, mm0                  ; t2 t1 t0 lt l0 l1 l2 l3
  856. -    movq      mm0, mm1
  857. -    movq      mm2, mm1
  858. -    movq      mm7, mm1
  859. -    psrlq     mm0, 16                   ; .. .. t2 t1 t0 lt l0 l1
  860. -    psrlq     mm2, 8                    ; .. t2 t1 t0 lt l0 l1 l2
  861. -    pavgb     mm7, mm2
  862. -    PRED8x8_LOWPASS b, mm3, mm1, mm0, mm2, mm4
  863. -    punpcklbw mm7, mm3
  864. -    psrlq     mm3, 32
  865. -    PALIGNR   mm3, mm7, 6, mm6
  866. +cglobal predict_4x4_hd_%1, 1,1,7
  867. +    movh      m0, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE-4*SIZEOF_PIXEL] ; lt ..
  868. +    punpckl%6 m0, [r0-1*SIZEOF_PIXEL*FDEC_STRIDE]   ; t3 t2 t1 t0 lt .. .. ..
  869. +    psll%4    m0, %7                    ; t2 t1 t0 lt .. .. .. ..
  870. +    mova      m1, [r0+3*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL] ; l3
  871. +    punpckh%2 m1, [r0+2*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL] ; l2 l3
  872. +    mova      m2, [r0+1*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL] ; l1
  873. +    punpckh%2 m2, [r0+0*SIZEOF_PIXEL*FDEC_STRIDE-8*SIZEOF_PIXEL] ; l0 l1
  874. +    punpckh%3 m1, m2                  ; l0 l1 l2 l3
  875. +    punpckh%6 m1, m0                  ; t2 t1 t0 lt l0 l1 l2 l3
  876. +    mova      m0, m1
  877. +    mova      m2, m1
  878. +    mova      m6, m1
  879. +    psrl%4    m0, %7*2                   ; .. .. t2 t1 t0 lt l0 l1
  880. +    psrl%4    m2, %7                     ; .. t2 t1 t0 lt l0 l1 l2
  881. +    pavg%5    m6, m2
  882. +    PRED8x8_LOWPASS %5, m3, m1, m0, m2, m4
  883. +    punpckl%2 m6, m3
  884. +    psrl%4    m3, %7*4
  885. +    PALIGNR   m3, m6, 6*SIZEOF_PIXEL, m5
  886.  %assign Y 3
  887. -    movd     [r0+Y*FDEC_STRIDE], mm7
  888. +    movh     [r0+Y*SIZEOF_PIXEL*FDEC_STRIDE], m6
  889.  %rep 2
  890.  %assign Y (Y-1)
  891. -    psrlq     mm7, 16
  892. -    movd     [r0+Y*FDEC_STRIDE], mm7
  893. +    psrl%4    m6, %7*2
  894. +    movh     [r0+Y*SIZEOF_PIXEL*FDEC_STRIDE], m6
  895.  %endrep
  896. -    movd     [r0+0*FDEC_STRIDE], mm3
  897. +    movh     [r0+0*SIZEOF_PIXEL*FDEC_STRIDE], m3
  898.      RET
  899.  %endmacro
  900.  
  901. +%ifdef HIGH_BIT_DEPTH
  902. +INIT_XMM
  903. +%define PALIGNR PALIGNR_SSSE3
  904. +PREDICT_4x4 ssse3 , wd, dq, dq, w, qdq, 2
  905. +
  906. +%else
  907.  INIT_MMX
  908.  %define PALIGNR PALIGNR_MMX
  909. -PREDICT_4x4 mmxext
  910. +PREDICT_4x4 mmxext, bw, wd, q , b, dq , 8
  911.  %define PALIGNR PALIGNR_SSSE3
  912. -PREDICT_4x4 ssse3
  913. +PREDICT_4x4 ssse3 , bw, wd, q , b, dq , 8
  914. +
  915. +%endif
  916.  
  917.  ;-----------------------------------------------------------------------------
  918.  ; void predict_4x4_hu( pixel *src )
  919. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  920. index e5a3fa7..0d6f7f2 100644
  921. --- a/common/x86/predict-c.c
  922. +++ b/common/x86/predict-c.c
  923. @@ -490,6 +490,11 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  924.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
  925.      pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_sse2;
  926.      pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_sse2;
  927. +    if( !(cpu&X264_CPU_SSSE3) )
  928. +        return;
  929. +    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
  930. +    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
  931. +    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
  932.  #else
  933.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
  934.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
  935. --
  936. 1.7.2.3
  937.  
  938.  
  939. From d7d01cb8124f447e05d29c3db0ec40ae456de87c Mon Sep 17 00:00:00 2001
  940. From: Daniel Kang <daniel.d.kang@gmail.com>
  941. Date: Fri, 26 Nov 2010 15:54:40 -0500
  942. Subject: [PATCH 7/8] predict_4x4_dc
  943.  
  944. ---
  945.  common/x86/predict-a.asm |   26 ++++++++++++++++++++++++--
  946.  common/x86/predict-c.c   |    4 ++--
  947.  2 files changed, 26 insertions(+), 4 deletions(-)
  948.  
  949. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  950. index 6bd845d..fd322f3 100644
  951. --- a/common/x86/predict-a.asm
  952. +++ b/common/x86/predict-a.asm
  953. @@ -350,9 +350,30 @@ PREDICT_4x4_V1 mmxext, q , 8, b
  954.  %endif
  955.  
  956.  ;-----------------------------------------------------------------------------
  957. -; void predict_4x4_dc( uint8_t *src )
  958. +; void predict_4x4_dc( pixel *src )
  959.  ;-----------------------------------------------------------------------------
  960. +%ifdef HIGH_BIT_DEPTH
  961. +INIT_MMX
  962. +cglobal predict_4x4_dc_mmxext, 1,1
  963. +    movq   m0, [r0-SIZEOF_PIXEL*FDEC_STRIDE]
  964. +    HADDW  m0, m1
  965. +%assign n 0
  966. +%rep 4
  967. +    movd   m1, [r0+SIZEOF_PIXEL*(FDEC_STRIDE*n-1)]
  968. +    paddw  m0, m1
  969. +%assign n n+1
  970. +%endrep
  971. +    paddw  m0, [pw_4]
  972. +    psrlw  m0, 3
  973. +    SPLATW m0, m0
  974. +    movq   [r0+SIZEOF_PIXEL*FDEC_STRIDE*0], m0
  975. +    movq   [r0+SIZEOF_PIXEL*FDEC_STRIDE*1], m0
  976. +    movq   [r0+SIZEOF_PIXEL*FDEC_STRIDE*2], m0
  977. +    movq   [r0+SIZEOF_PIXEL*FDEC_STRIDE*3], m0
  978. +    RET
  979.  
  980. +%else
  981. +INIT_MMX
  982.  cglobal predict_4x4_dc_mmxext, 1,4
  983.      pxor   mm7, mm7
  984.      movd   mm0, [r0-FDEC_STRIDE]
  985. @@ -373,12 +394,13 @@ cglobal predict_4x4_dc_mmxext, 1,4
  986.      mov   [r0+FDEC_STRIDE*2], r1d
  987.      mov   [r0+FDEC_STRIDE*3], r1d
  988.      RET
  989. +%endif
  990.  
  991.  %macro PREDICT_FILTER 1
  992.  ;-----------------------------------------------------------------------------
  993.  ;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
  994.  ;-----------------------------------------------------------------------------
  995. -
  996. +INIT_MMX
  997.  cglobal predict_8x8_filter_%1, 4,5
  998.      add          r0, 0x58
  999.  %define src r0-0x58
  1000. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  1001. index 0d6f7f2..1a44f3a 100644
  1002. --- a/common/x86/predict-c.c
  1003. +++ b/common/x86/predict-c.c
  1004. @@ -74,7 +74,7 @@
  1005.   void x264_predict_4x4_vr_ssse3( uint8_t *src );
  1006.   void x264_predict_4x4_hd_mmxext( uint8_t *src );
  1007.   void x264_predict_4x4_hd_ssse3( uint8_t *src );
  1008. - void x264_predict_4x4_dc_mmxext( uint8_t *src );
  1009. + void x264_predict_4x4_dc_mmxext( pixel *src );
  1010.   void x264_predict_4x4_ddr_ssse3( uint8_t *src );
  1011.   void x264_predict_4x4_hu_mmxext( uint8_t *src );
  1012.   void x264_predict_4x4_hu_sse2( uint16_t *src );
  1013. @@ -484,6 +484,7 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  1014.  {
  1015.      if( !(cpu&X264_CPU_MMXEXT) )
  1016.          return;
  1017. +    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmxext;
  1018.  #if HIGH_BIT_DEPTH
  1019.      if( !(cpu&X264_CPU_SSE2) )
  1020.          return;
  1021. @@ -499,7 +500,6 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
  1022.      pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
  1023.      pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
  1024.      pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmxext;
  1025. -    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmxext;
  1026.      pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmxext;
  1027.      pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_mmxext;
  1028.      pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_mmxext;
  1029. --
  1030. 1.7.2.3
  1031.  
  1032.  
  1033. From 51d57538ea8ea9ee729cae9890bcd5164ca16e6d Mon Sep 17 00:00:00 2001
  1034. From: Daniel Kang <daniel.d.kang@gmail.com>
  1035. Date: Fri, 26 Nov 2010 19:41:53 -0500
  1036. Subject: [PATCH 8/8] predict_8x8_dc
  1037.  
  1038. ---
  1039.  common/x86/predict-a.asm |   23 +++++++++++++++++++++--
  1040.  common/x86/predict-c.c   |    2 ++
  1041.  2 files changed, 23 insertions(+), 2 deletions(-)
  1042.  
  1043. diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
  1044. index fd322f3..82a4cf7 100644
  1045. --- a/common/x86/predict-a.asm
  1046. +++ b/common/x86/predict-a.asm
  1047. @@ -563,10 +563,27 @@ INIT_MMX
  1048.  PREDICT_8x8_H mmxext, bw, w
  1049.  %endif
  1050.  
  1051. -INIT_MMX
  1052.  ;-----------------------------------------------------------------------------
  1053. -; void predict_8x8_dc( uint8_t *src, uint8_t *edge );
  1054. +; void predict_8x8_dc( pixel *src, pixel *edge );
  1055.  ;-----------------------------------------------------------------------------
  1056. +%ifdef HIGH_BIT_DEPTH
  1057. +INIT_XMM
  1058. +cglobal predict_8x8_dc_sse2, 2,2,5
  1059. +    pxor        m0, m0
  1060. +    movu        m1, [r1+14]
  1061. +    HADDW       m1, m2
  1062. +    paddd       m0, m1
  1063. +    mova        m3, [r1+32]
  1064. +    HADDW       m3, m4
  1065. +    paddd       m0, m3
  1066. +    paddw       m0, [pw_8]
  1067. +    psrlw       m0, 4
  1068. +    SPLATW      m0, m0
  1069. +    STORE8x8    m0, m0
  1070. +    RET
  1071. +
  1072. +%else
  1073. +INIT_MMX
  1074.  cglobal predict_8x8_dc_mmxext, 2,2
  1075.      pxor        mm0, mm0
  1076.      pxor        mm1, mm1
  1077. @@ -579,7 +596,9 @@ cglobal predict_8x8_dc_mmxext, 2,2
  1078.      packuswb    mm0, mm0
  1079.      STORE8x8    mm0, mm0
  1080.      RET
  1081. +%endif
  1082.  
  1083. +INIT_MMX
  1084.  ;-----------------------------------------------------------------------------
  1085.  ; void predict_8x8_dc_top( uint8_t *src, uint8_t *edge );
  1086.  ;-----------------------------------------------------------------------------
  1087. diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
  1088. index 1a44f3a..39cebd9 100644
  1089. --- a/common/x86/predict-c.c
  1090. +++ b/common/x86/predict-c.c
  1091. @@ -50,6 +50,7 @@
  1092.   void x264_predict_8x8_hd_mmxext( uint8_t *src, uint8_t edge[33] );
  1093.   void x264_predict_8x8_hu_mmxext( uint8_t *src, uint8_t edge[33] );
  1094.   void x264_predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
  1095. + void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[33] );
  1096.   void x264_predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
  1097.   void x264_predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
  1098.   void x264_predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
  1099. @@ -450,6 +451,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
  1100.          return;
  1101.      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse2;
  1102.      pf[I_PRED_8x8_H]      = x264_predict_8x8_h_sse2;
  1103. +    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_sse2;
  1104.  #else
  1105.      pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
  1106.      pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
  1107. --
  1108. 1.7.2.3
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement