Advertisement
Not a member of Pastebin yet? Sign up; it unlocks many cool features!
- diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
- index 5393957..8109fa0 100644
- --- a/libavcodec/x86/vp9mc.asm
- +++ b/libavcodec/x86/vp9mc.asm
- @@ -192,42 +192,49 @@ filter_sse2_h_fn avg
- %macro filter_h_fn 1
- %assign %%px mmsize/2
- -cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
- - mova m6, [pw_256]
- - mova m7, [filteryq+ 0]
- +cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 10, dst, dstride, src, sstride, h, filtery
- + mova m5, [pw_256]
- + mova m6, [filteryq+ 0]
- + mova m7, [filteryq+32]
- %if ARCH_X86_64 && mmsize > 8
- - mova m8, [filteryq+32]
- - mova m9, [filteryq+64]
- - mova m10, [filteryq+96]
- + mova m8, [filteryq+64]
- + mova m9, [filteryq+96]
- %endif
- .loop:
- - movh m0, [srcq-3]
- - movh m1, [srcq-2]
- - movh m2, [srcq-1]
- - movh m3, [srcq+0]
- - movh m4, [srcq+1]
- - movh m5, [srcq+2]
- - punpcklbw m0, m1
- - punpcklbw m2, m3
- - movh m1, [srcq+3]
- - movh m3, [srcq+4]
- + movh m1, [srcq-3]
- + movq m0, [srcq+%%px-3]
- + punpcklbw m1, m1
- +%if mmsize == 8
- + punpckhbw m3, m0, m0
- +%endif
- + punpcklbw m0, m0
- add srcq, sstrideq
- - punpcklbw m4, m5
- - punpcklbw m1, m3
- - pmaddubsw m0, m7
- +%if mmsize == 8
- + mova m4, m3
- + palignr m3, m0, 1
- + palignr m4, m0, 5
- +%else
- + mova m3, m0
- + mova m4, m0
- + palignr m3, m1, 9
- + palignr m4, m1, 13
- +%endif
- + mova m2, m0
- + palignr m0, m1, 1
- + palignr m2, m1, 5
- + pmaddubsw m0, m6
- + pmaddubsw m2, m7
- %if ARCH_X86_64 && mmsize > 8
- - pmaddubsw m2, m8
- + pmaddubsw m3, m8
- pmaddubsw m4, m9
- - pmaddubsw m1, m10
- %else
- - pmaddubsw m2, [filteryq+32]
- - pmaddubsw m4, [filteryq+64]
- - pmaddubsw m1, [filteryq+96]
- + pmaddubsw m3, [filteryq+64]
- + pmaddubsw m4, [filteryq+96]
- %endif
- - paddw m0, m4
- - paddw m2, m1
- + paddw m0, m3
- + paddw m2, m4
- paddsw m0, m2
- - pmulhrsw m0, m6
- + pmulhrsw m0, m5
- %ifidn %1, avg
- movh m1, [dstq]
- %endif
- @@ -260,35 +267,36 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filt
- mova m10, [filteryq+64]
- mova m11, [filteryq+96]
- .loop:
- - movu m0, [srcq-3]
- - movu m1, [srcq-2]
- - movu m2, [srcq-1]
- - movu m3, [srcq+0]
- - movu m4, [srcq+1]
- - movu m5, [srcq+2]
- - movu m6, [srcq+3]
- - movu m7, [srcq+4]
- + movu m12, [srcq-3]
- + movu m1, [srcq+5]
- + punpckhbw m0, m12, m12
- + punpckhbw m1, m1
- + punpcklbw m12, m12
- add srcq, sstrideq
- - SBUTTERFLY bw, 0, 1, 12
- - SBUTTERFLY bw, 2, 3, 12
- - SBUTTERFLY bw, 4, 5, 12
- - SBUTTERFLY bw, 6, 7, 12
- - pmaddubsw m0, m8
- - pmaddubsw m1, m8
- - pmaddubsw m2, m9
- - pmaddubsw m3, m9
- - pmaddubsw m4, m10
- - pmaddubsw m5, m10
- - pmaddubsw m6, m11
- + palignr m7, m1, m0, 13
- + palignr m6, m0, m12, 13
- + palignr m5, m1, m0, 9
- + palignr m4, m0, m12, 9
- + palignr m3, m1, m0, 5
- + palignr m2, m0, m12, 5
- + palignr m1, m0, 1
- + palignr m0, m12, 1
- pmaddubsw m7, m11
- - paddw m0, m4
- - paddw m1, m5
- - paddw m2, m6
- + pmaddubsw m6, m11
- + pmaddubsw m5, m10
- + pmaddubsw m4, m10
- + pmaddubsw m3, m9
- + pmaddubsw m2, m9
- + pmaddubsw m1, m8
- + pmaddubsw m0, m8
- paddw m3, m7
- - paddsw m0, m2
- + paddw m2, m6
- + paddw m1, m5
- + paddw m0, m4
- paddsw m1, m3
- - pmulhrsw m0, m13
- + paddsw m0, m2
- pmulhrsw m1, m13
- + pmulhrsw m0, m13
- packuswb m0, m1
- %ifidn %1, avg
- pavgb m0, [dstq]
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement