From 94747e7686f32edf11707473e66f588f628cd779 Mon Sep 17 00:00:00 2001
From: Daniel Kang <daniel.d.kang@gmail.com>
Date: Tue, 23 Nov 2010 20:29:37 -0500
Subject: [PATCH] SSE version of high-bit-depth add4x4_idct_sse2

~6.3x faster than C.
Our first Google Code-In patch!
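
In high-bit-depth builds, dctcoef is int32_t and pixel is uint16_t, so the
whole transform runs on dwords instead of words. For reference, the C path
this accelerates is roughly the following (a sketch in the style of
add4x4_idct from common/dct.c, not a verbatim copy; x264_clip_pixel()
clamps to [0,pixel_max] and FDEC_STRIDE is the decoded-frame stride):

    static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
    {
        dctcoef d[16], tmp[16];
        /* first pass: 4-point butterfly down each column */
        for( int i = 0; i < 4; i++ )
        {
            int s02 =  dct[0*4+i]     +  dct[2*4+i];
            int d02 =  dct[0*4+i]     -  dct[2*4+i];
            int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
            int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
            tmp[i*4+0] = s02 + s13;
            tmp[i*4+1] = d02 + d13;
            tmp[i*4+2] = d02 - d13;
            tmp[i*4+3] = s02 - s13;
        }
        /* second pass, with +32 rounding and >>6 scaling */
        for( int i = 0; i < 4; i++ )
        {
            int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
            int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
            int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
            int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
            d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
            d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
            d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
            d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
        }
        /* add the residual to the prediction and clip to pixel range */
        for( int y = 0; y < 4; y++, p_dst += FDEC_STRIDE )
            for( int x = 0; x < 4; x++ )
                p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
    }

The SSE2 version performs the same math on packed dwords: IDCT4_1D gains a
w/d element-size parameter, pd_32 supplies the dword rounding constant, and
STORE_DIFFx2 does the psrad/packssdw, adds the prediction, and clips
against pw_pixel_max two rows at a time.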
---
 common/dct.c           |    4 +
 common/x86/const-a.asm |    1 +
 common/x86/dct-32.asm  |   36 ++++++------
 common/x86/dct-64.asm  |   38 ++++++------
 common/x86/dct-a.asm   |   47 +++++++++++++--
 common/x86/dct.h       |    1 +
 common/x86/pixel-a.asm |   16 +++---
 common/x86/x86util.asm |  147 +++++++++++++++++++++++++-----------------------
 8 files changed, 167 insertions(+), 123 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 975afef..1b3d87b 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -429,6 +429,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct   = x264_sub8x8_dct_mmx;
         dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
     }
+    if( cpu&X264_CPU_SSE2 )
+    {
+        dctf->add4x4_idct = x264_add4x4_idct_sse2;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index 32579e3..d6e621e 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -50,6 +50,7 @@ const pw_3fff,     times 8 dw 0x3fff
 const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
 const pd_1,        times 4 dd 1
+const pd_32,       times 4 dd 32
 const pd_128,      times 4 dd 128
 const pw_00ff,     times 8 dw 0x00ff
 const pw_ff00,     times 8 dw 0xff00
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index 24e6efc..21e70c1 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -38,13 +38,13 @@ cextern hsub_mul
 ; in: m0..m7
 ; out: 0,4,6 in mem, rest in regs
 %macro DCT8_1D 9
-    SUMSUB_BA m%8, m%1    ; %8 = s07, %1 = d07
-    SUMSUB_BA m%7, m%2    ; %7 = s16, %2 = d16
-    SUMSUB_BA m%6, m%3    ; %6 = s25, %3 = d25
-    SUMSUB_BA m%5, m%4    ; %5 = s34, %4 = d34
-    SUMSUB_BA m%5, m%8    ; %5 = a0, %8 = a2
-    SUMSUB_BA m%6, m%7    ; %6 = a1, %7 = a3
-    SUMSUB_BA m%6, m%5    ; %6 = dst0, %5 = dst4
+    SUMSUB_BA w, m%8, m%1 ; %8 = s07, %1 = d07
+    SUMSUB_BA w, m%7, m%2 ; %7 = s16, %2 = d16
+    SUMSUB_BA w, m%6, m%3 ; %6 = s25, %3 = d25
+    SUMSUB_BA w, m%5, m%4 ; %5 = s34, %4 = d34
+    SUMSUB_BA w, m%5, m%8 ; %5 = a0, %8 = a2
+    SUMSUB_BA w, m%6, m%7 ; %6 = a1, %7 = a3
+    SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
     mova [%9+0x00], m%6
     mova [%9+0x40], m%5
     mova m%6, m%7 ; a3
@@ -127,13 +127,13 @@ cextern hsub_mul
     psubw m%2, m%1
     mova m%1, [%9+0x00]
     mova m%6, [%9+0x40]
-    SUMSUB_BA m%6, m%1
-    SUMSUB_BA m%7, m%6
-    SUMSUB_BA m%3, m%1
-    SUMSUB_BA m%5, m%7
-    SUMSUB_BA m%2, m%3
-    SUMSUB_BA m%8, m%1
-    SUMSUB_BA m%4, m%6
+    SUMSUB_BA w, m%6, m%1
+    SUMSUB_BA w, m%7, m%6
+    SUMSUB_BA w, m%3, m%1
+    SUMSUB_BA w, m%5, m%7
+    SUMSUB_BA w, m%2, m%3
+    SUMSUB_BA w, m%8, m%1
+    SUMSUB_BA w, m%4, m%6
     SWAP %1, %5, %6
     SWAP %3, %8, %7
 %endmacro
@@ -434,18 +434,18 @@ global add8x8_idct_sse2.skip_prologue
     SBUTTERFLY qdq, 4, 5, 0
     SBUTTERFLY qdq, 6, 7, 0
     UNSPILL r1,0
-    IDCT4_1D 0,1,2,3,r1
+    IDCT4_1D w,0,1,2,3,r1
     SPILL r1, 4
     TRANSPOSE2x4x4W 0,1,2,3,4
     UNSPILL r1, 4
-    IDCT4_1D 4,5,6,7,r1
+    IDCT4_1D w,4,5,6,7,r1
     SPILL r1, 0
     TRANSPOSE2x4x4W 4,5,6,7,0
     UNSPILL r1, 0
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,r1
+    IDCT4_1D w,0,1,2,3,r1
     paddw m4, [pw_32]
-    IDCT4_1D 4,5,6,7,r1
+    IDCT4_1D w,4,5,6,7,r1
     SPILL r1, 6,7
     pxor m7, m7
     DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index 5e43b9c..70edcbd 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -36,13 +36,13 @@ cextern hsub_mul
 INIT_XMM

 %macro DCT8_1D 10
-    SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
-    SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
-    SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
-    SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
+    SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
+    SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
+    SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
+    SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07

-    SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
-    SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2
+    SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
+    SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2

     movdqa m%9, m%1
     psraw m%9, 1
@@ -56,7 +56,7 @@ INIT_XMM
     paddw m%10, m%2
     psubw m%10, m%3 ; %10=a7

-    SUMSUB_BA m%4, m%1
+    SUMSUB_BA w, m%4, m%1
     psubw m%1, m%3
     psubw m%4, m%2
     psraw m%3, 1
@@ -70,7 +70,7 @@ INIT_XMM
     psraw m%9, 2
     psubw m%9, m%10 ; %9=b7

-    SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
+    SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4

     movdqa m%3, m%7
     psraw m%3, 1
@@ -88,7 +88,7 @@ INIT_XMM
 %endmacro

 %macro IDCT8_1D 10
-    SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2
+    SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2

     movdqa m%9, m%2
     psraw m%9, 1
@@ -123,8 +123,8 @@ INIT_XMM
     psraw m%6, 2
     psubw m%9, m%6 ; %9=b7

-    SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
-    SUMSUB_BA m%3, m%1, m%6 ; %3=b2, %1=b4
+    SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
+    SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4

     movdqa m%8, m%10
     psraw m%8, 2
@@ -132,10 +132,10 @@ INIT_XMM
     psraw m%2, 2
     psubw m%2, m%10 ; %2=b5

-    SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
-    SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
-    SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
-    SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
+    SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
+    SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
+    SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
+    SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4

     SWAP %1, %9, %6
     SWAP %3, %8, %7
@@ -263,14 +263,14 @@ global add8x8_idct_sse2.skip_prologue
     mova m7, [r1+112]
     SBUTTERFLY qdq, 4, 5, 8
     SBUTTERFLY qdq, 6, 7, 8
-    IDCT4_1D 0,1,2,3,8,10
+    IDCT4_1D w,0,1,2,3,8,10
     TRANSPOSE2x4x4W 0,1,2,3,8
-    IDCT4_1D 4,5,6,7,8,10
+    IDCT4_1D w,4,5,6,7,8,10
     TRANSPOSE2x4x4W 4,5,6,7,8
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,8,10
+    IDCT4_1D w,0,1,2,3,8,10
     paddw m4, [pw_32]
-    IDCT4_1D 4,5,6,7,8,10
+    IDCT4_1D w,4,5,6,7,8,10
     DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
     DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
     DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 67fa34a..0e4b514 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -52,13 +52,15 @@ SECTION .text
 cextern pw_32_0
 cextern pw_32
 cextern pw_8000
+cextern pw_pixel_max
 cextern hsub_mul
 cextern pb_1
 cextern pw_1
+cextern pd_32

 %macro WALSH4_1D 5
-    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
-    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
+    SUMSUB_BADC w, m%4, m%3, m%2, m%1, m%5
+    SUMSUB_BADC w, m%4, m%2, m%3, m%1, m%5
     SWAP %1, %4, %3
 %endmacro

@@ -86,7 +88,7 @@ cglobal dct4x4dc_mmx, 1,1
     movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
     WALSH4_1D 0,1,2,3,4
     TRANSPOSE4x4W 0,1,2,3,4
-    SUMSUB_BADC m1, m0, m3, m2, m4
+    SUMSUB_BADC w, m1, m0, m3, m2, m4
     SWAP 0, 1
     SWAP 2, 3
     SUMSUB_17BIT 0,2,4,7
@@ -175,10 +177,10 @@ cglobal add4x4_idct_mmx, 2,2
     movq m3, [r1+24]
     movq m2, [r1+16]
     movq m0, [r1+ 0]
-    IDCT4_1D 0,1,2,3,4,5
+    IDCT4_1D w,0,1,2,3,4,5
     TRANSPOSE4x4W 0,1,2,3,4
     paddw m0, [pw_32]
-    IDCT4_1D 0,1,2,3,4,5
+    IDCT4_1D w,0,1,2,3,4,5
     STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
     STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
     STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
@@ -198,7 +200,7 @@ cglobal add4x4_idct_sse4, 2,2,6
     psubw m0, m3 ; row1>>1-row3/row0-2
     paddw m2, m1 ; row3>>1+row1/row0+2
     SBUTTERFLY2 wd, 0, 2, 1
-    SUMSUB_BA m2, m0, m1
+    SUMSUB_BA w, m2, m0, m1
     pshuflw m1, m2, 10110001b
     pshufhw m2, m2, 10110001b
     punpckldq m1, m0
@@ -215,7 +217,7 @@ cglobal add4x4_idct_sse4, 2,2,6
     psubw m0, m3 ; row1>>1-row3/row0-2
     paddw m2, m1 ; row3>>1+row1/row0+2
     SBUTTERFLY2 qdq, 0, 2, 1
-    SUMSUB_BA m2, m0, m1
+    SUMSUB_BA w, m2, m0, m1

     movd m4, [r0+FDEC_STRIDE*0]
     movd m1, [r0+FDEC_STRIDE*1]
@@ -236,6 +238,37 @@ cglobal add4x4_idct_sse4, 2,2,6
     movd [r0+FDEC_STRIDE*2], m0
     pextrd [r0+FDEC_STRIDE*3], m0, 1
     RET
+
+%else
+
+%macro STORE_DIFFx2 6
+    psrad %1, 6
+    psrad %2, 6
+    packssdw %1, %2
+    movq %3, %5
+    movhps %3, %6
+    paddsw %1, %3
+    pxor %4, %4
+    CLIPW %1, %4, [pw_pixel_max]
+    movq %5, %1
+    movhps %6, %1
+%endmacro
+
+INIT_XMM
+cglobal add4x4_idct_sse2, 2,2,7
+    pxor m6, m6
+.skip_prologue:
+    mova m1, [r1+16]
+    mova m3, [r1+48]
+    mova m2, [r1+32]
+    mova m0, [r1+ 0]
+    IDCT4_1D d,0,1,2,3,4,5
+    TRANSPOSE4x4D 0,1,2,3,4
+    paddd m0, [pd_32]
+    IDCT4_1D d,0,1,2,3,4,5
+    STORE_DIFFx2 m0, m1, m4, m6, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
+    STORE_DIFFx2 m2, m3, m4, m6, [r0+4*FDEC_STRIDE], [r0+6*FDEC_STRIDE]
+    RET
 %endif ; !HIGH_BIT_DEPTH

 INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 58b9d17..e55d256 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -40,6 +40,7 @@ void x264_sub8x8_dct_dc_mmxext( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix
 void x264_sub8x8_dct_dc_sse2  ( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_add4x4_idct_mmx     ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add4x4_idct_sse2    ( uint16_t *p_dst, int32_t dct   [16] );
 void x264_add4x4_idct_sse4    ( uint8_t *p_dst, int16_t dct    [16] );
 void x264_add8x8_idct_mmx     ( uint8_t *p_dst, int16_t dct[ 4][16] );
 void x264_add8x8_idct_dc_mmx  ( uint8_t *p_dst, int16_t dct    [ 4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index acb3612..6cd79e1 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -881,7 +881,7 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
     DEINTB %1, %2, %3, %4, %5
     psubw m%1, m%3
     psubw m%2, m%4
-    SUMSUB_BA m%1, m%2, m%3
+    SUMSUB_BA w, m%1, m%2, m%3
 %endmacro

 %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
@@ -1278,10 +1278,10 @@ cglobal pixel_sa8d_8x8_internal_%1
 %else ; non-sse2
     HADAMARD4_V m0, m1, m2, m8, m6
     HADAMARD4_V m4, m5, m3, m9, m6
-    SUMSUB_BADC m0, m4, m1, m5, m6
+    SUMSUB_BADC w, m0, m4, m1, m5, m6
     HADAMARD 2, sumsub, 0, 4, 6, 11
     HADAMARD 2, sumsub, 1, 5, 6, 11
-    SUMSUB_BADC m2, m3, m8, m9, m6
+    SUMSUB_BADC w, m2, m3, m8, m9, m6
     HADAMARD 2, sumsub, 2, 3, 6, 11
     HADAMARD 2, sumsub, 8, 9, 6, 11
     HADAMARD 1, amax, 0, 4, 6, 11
@@ -1379,7 +1379,7 @@ cglobal pixel_sa8d_8x8_internal_%1
     mova spill0, m6
     mova spill1, m7
     HADAMARD4_V m0, m1, m2, m3, m7
-    SUMSUB_BADC m0, m4, m1, m5, m7
+    SUMSUB_BADC w, m0, m4, m1, m5, m7
     HADAMARD 2, sumsub, 0, 4, 7, 6
     HADAMARD 2, sumsub, 1, 5, 7, 6
     HADAMARD 1, amax, 0, 4, 7, 6
@@ -1387,7 +1387,7 @@ cglobal pixel_sa8d_8x8_internal_%1
     mova m6, spill0
     mova m7, spill1
     paddw m0, m1
-    SUMSUB_BADC m2, m6, m3, m7, m4
+    SUMSUB_BADC w, m2, m6, m3, m7, m4
     HADAMARD 2, sumsub, 2, 6, 4, 5
     HADAMARD 2, sumsub, 3, 7, 4, 5
     HADAMARD 1, amax, 2, 6, 4, 5
@@ -1994,7 +1994,7 @@ cglobal hadamard_ac_2x2max_mmxext
     mova m2, [r3+0x40]
     mova m3, [r3+0x60]
     sub r3, 8
-    SUMSUB_BADC m0, m1, m2, m3, m4
+    SUMSUB_BADC w, m0, m1, m2, m3, m4
     ABS4 m0, m2, m1, m3, m4, m5
     HADAMARD 0, max, 0, 2, 4, 5
     HADAMARD 0, max, 1, 3, 4, 5
@@ -2059,7 +2059,7 @@ cglobal hadamard_ac_8x8_mmxext
     mova m1, [r3+0x20]
     mova m2, [r3+0x40]
     mova m3, [r3+0x60]
-    SUMSUB_BADC m0, m1, m2, m3, m4
+    SUMSUB_BADC w, m0, m1, m2, m3, m4
     HADAMARD 0, sumsub, 0, 2, 4, 5
     ABS4 m1, m3, m0, m2, m4, m5
     HADAMARD 0, max, 1, 3, 4, 5
@@ -2266,7 +2266,7 @@ cglobal hadamard_ac_8x8_%1
     ABS_MOV m2, m4
     ABS_MOV m3, m5
     paddw m1, m2
-    SUMSUB_BA m0, m4; m2
+    SUMSUB_BA w, m0, m4; m2
 %if vertical
     pand m1, [mask_ac4]
 %else
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 9acaa3d..d16a237 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -241,44 +241,44 @@
     psrlw m%4, 8 ; src .. y7 .. y5
 %endmacro

-%macro SUMSUB_BA 2-3
-%if %0==2
-    paddw %1, %2
-    paddw %2, %2
-    psubw %2, %1
+%macro SUMSUB_BA 3-4
+%if %0==3
+    padd%1 %2, %3
+    padd%1 %3, %3
+    psub%1 %3, %2
 %else
-    mova %3, %1
-    paddw %1, %2
-    psubw %2, %3
+    mova %4, %2
+    padd%1 %2, %3
+    psub%1 %3, %4
 %endif
 %endmacro

-%macro SUMSUB_BADC 4-5
-%if %0==5
-    SUMSUB_BA %1, %2, %5
-    SUMSUB_BA %3, %4, %5
+%macro SUMSUB_BADC 5-6
+%if %0==6
+    SUMSUB_BA %1, %2, %3, %6
+    SUMSUB_BA %1, %4, %5, %6
 %else
-    paddw %1, %2
-    paddw %3, %4
-    paddw %2, %2
-    paddw %4, %4
-    psubw %2, %1
-    psubw %4, %3
+    padd%1 %2, %3
+    padd%1 %4, %5
+    padd%1 %3, %3
+    padd%1 %5, %5
+    psub%1 %3, %2
+    psub%1 %5, %4
 %endif
 %endmacro

 %macro HADAMARD4_V 4+
-    SUMSUB_BADC %1, %2, %3, %4
-    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC w, %1, %2, %3, %4
+    SUMSUB_BADC w, %1, %3, %2, %4
 %endmacro

 %macro HADAMARD8_V 8+
-    SUMSUB_BADC %1, %2, %3, %4
-    SUMSUB_BADC %5, %6, %7, %8
-    SUMSUB_BADC %1, %3, %2, %4
-    SUMSUB_BADC %5, %7, %6, %8
-    SUMSUB_BADC %1, %5, %2, %6
-    SUMSUB_BADC %3, %7, %4, %8
+    SUMSUB_BADC w, %1, %2, %3, %4
+    SUMSUB_BADC w, %5, %6, %7, %8
+    SUMSUB_BADC w, %1, %3, %2, %4
+    SUMSUB_BADC w, %5, %7, %6, %8
+    SUMSUB_BADC w, %1, %5, %2, %6
+    SUMSUB_BADC w, %3, %7, %4, %8
 %endmacro

 %macro TRANS_SSE2 5-6
@@ -363,7 +363,7 @@
 %endif
 %endif
 %ifidn %2, sumsub
-    SUMSUB_BA m%3, m%4, m%5
+    SUMSUB_BA w, m%3, m%4, m%5
 %else
 %ifidn %2, amax
 %if %0==6
@@ -426,67 +426,72 @@
 %endif
 %endmacro

-%macro SUMSUB2_AB 3
-    mova %3, %1
-    paddw %1, %1
-    paddw %1, %2
-    psubw %3, %2
-    psubw %3, %2
+%macro SUMSUB2_AB 4
+    mova %4, %2
+    padd%1 %2, %2
+    padd%1 %2, %3
+    psub%1 %4, %3
+    psub%1 %4, %3
 %endmacro

-%macro SUMSUB2_BA 3
-    mova m%3, m%1
-    paddw m%1, m%2
-    paddw m%1, m%2
-    psubw m%2, m%3
-    psubw m%2, m%3
+%macro SUMSUB2_BA 4
+    mova m%4, m%2
+    padd%1 m%2, m%3
+    padd%1 m%2, m%3
+    psub%1 m%3, m%4
+    psub%1 m%3, m%4
 %endmacro

-%macro SUMSUBD2_AB 4
-    mova %4, %1
-    mova %3, %2
-    psraw %2, 1  ; %2: %2>>1
-    psraw %1, 1  ; %1: %1>>1
-    paddw %2, %4 ; %2: %2>>1+%1
-    psubw %1, %3 ; %1: %1>>1-%2
+%macro SUMSUBD2_AB 5
+    mova %5, %2
+    mova %4, %3
+    psra%1 %3, 1  ; %3: %3>>1
+    psra%1 %2, 1  ; %2: %2>>1
+    padd%1 %3, %5 ; %3: %3>>1+%2
+    psub%1 %2, %4 ; %2: %2>>1-%3
 %endmacro

 %macro DCT4_1D 5
 %ifnum %5
-    SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
-    SUMSUB_BA m%3, m%4, m%5
-    SUMSUB2_AB m%1, m%2, m%5
+    SUMSUB_BADC w, m%4, m%1, m%3, m%2; m%5
+    SUMSUB_BA w, m%3, m%4, m%5
+    SUMSUB2_AB w, m%1, m%2, m%5
     SWAP %1, %3, %4, %5, %2
 %else
-    SUMSUB_BADC m%4, m%1, m%3, m%2
-    SUMSUB_BA m%3, m%4
+    SUMSUB_BADC w, m%4, m%1, m%3, m%2
+    SUMSUB_BA w, m%3, m%4
     mova [%5], m%2
-    SUMSUB2_AB m%1, [%5], m%2
+    SUMSUB2_AB w, m%1, [%5], m%2
     SWAP %1, %3, %4, %2
 %endif
 %endmacro

-%macro IDCT4_1D 5-6
-%ifnum %5
-    SUMSUBD2_AB m%2, m%4, m%6, m%5
-    ; %2: %2>>1-%4 %4: %2+%4>>1
-    SUMSUB_BA m%3, m%1, m%6
-    ; %3: %1+%3 %1: %1-%3
-    SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
-    ; %4: %1+%3 + (%2+%4>>1)
-    ; %3: %1+%3 - (%2+%4>>1)
-    ; %2: %1-%3 + (%2>>1-%4)
-    ; %1: %1-%3 - (%2>>1-%4)
+%macro IDCT4_1D 6-7
+%ifnum %6
+    SUMSUBD2_AB %1, m%3, m%5, m%7, m%6
+    ; %3: %3>>1-%5 %5: %3+%5>>1
+    SUMSUB_BA %1, m%4, m%2, m%7
+    ; %4: %2+%4 %2: %2-%4
+    SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%7
+    ; %5: %2+%4 + (%3+%5>>1)
+    ; %4: %2+%4 - (%3+%5>>1)
+    ; %3: %2-%4 + (%3>>1-%5)
+    ; %2: %2-%4 - (%3>>1-%5)
 %else
-    SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
-    SUMSUB_BA m%3, m%1
-    SUMSUB_BADC m%4, m%3, m%2, m%1
+%ifidn %1,w
+    SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+16]
+%endif
+%ifidn %1,d
+    SUMSUBD2_AB %1, m%3, m%5, [%6], [%6+32]
+%endif
+    SUMSUB_BA %1, m%4, m%2
+    SUMSUB_BADC %1, m%5, m%4, m%3, m%2
 %endif
-    SWAP %1, %4, %3
-    ; %1: %1+%3 + (%2+%4>>1) row0
-    ; %2: %1-%3 + (%2>>1-%4) row1
-    ; %3: %1-%3 - (%2>>1-%4) row2
-    ; %4: %1+%3 - (%2+%4>>1) row3
+    SWAP %2, %5, %4
+    ; %2: %2+%4 + (%3+%5>>1) row0
+    ; %3: %2-%4 + (%3>>1-%5) row1
+    ; %4: %2-%4 - (%3>>1-%5) row2
+    ; %5: %2+%4 - (%3+%5>>1) row3
 %endmacro
-- 
1.7.3.2.146.gca209