%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales,
;                             float *fps_factor, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_cost_avx, 7,7,13
    shl r6d, 1                      ; len in bytes (arrays of uint16_t)
    lea r0, [r0+r6*2]               ; dst is int32_t, so double the offset
    add r1, r6
    add r2, r6
    add r3, r6
    add r4, r6
    neg r6                          ; count upward from -len*2 to 0
    vpxor xmm4, xmm4                ; zero, for widening unpacks
    vbroadcastss ymm6, [r5]         ; fps_factor in all 8 lanes
    vmulps ymm6, ymm6, [pf_inv256]  ; fps_factor/256
    vmovdqa xmm5, [pw_3fff]         ; keep only the low 14 cost bits of inter
.loop:
    vmovq xmm2, [r2+r6]             ; intra (4 x uint16)
    vmovq xmm0, [r4+r6]             ; invq
    vpunpcklwd xmm2, xmm4           ; widen to dwords
    vpunpcklwd xmm0, xmm4
    vpmaddwd xmm0, xmm2             ; intra*invq
    vmovdqa xmm12, xmm2             ; save low intra for intra - inter
    vmovq xmm8, [r2+r6+8]           ; next 4 intra
    vmovq xmm7, [r4+r6+8]           ; next 4 invq
    vpunpcklwd xmm8, xmm4
    vpunpcklwd xmm7, xmm4
    vpmaddwd xmm7, xmm8             ; intra*invq (high half)
    vinsertf128 ymm2, ymm2, xmm8, 1 ; all 8 intra (AVX1 has no 256-bit integer ops)
    vinsertf128 ymm0, ymm0, xmm7, 1 ; all 8 intra*invq
    vcvtdq2ps ymm9, ymm2            ; float intra
    vmovq xmm3, [r3+r6]             ; inter
    vmovq xmm1, [r1+r6]             ; prop
    vpand xmm3, xmm5                ; keep the low 14 cost bits
    vpunpcklwd xmm1, xmm4
    vpunpcklwd xmm3, xmm4
    vmovq xmm11, [r3+r6+8]          ; next 4 inter
    vmovq xmm10, [r1+r6+8]          ; next 4 prop
    vpand xmm11, xmm5
    vpunpcklwd xmm10, xmm4
    vpunpcklwd xmm11, xmm4
    vpsubd xmm12, xmm3              ; intra - inter (low)
    vpsubd xmm8, xmm11              ; intra - inter (high)
    vinsertf128 ymm1, ymm1, xmm10, 1 ; all 8 prop
    vinsertf128 ymm2, ymm12, xmm8, 1 ; all 8 intra - inter
    vcvtdq2ps ymm0, ymm0
    vmulps ymm0, ymm0, ymm6         ; intra*invq*fps_factor>>8
    vcvtdq2ps ymm1, ymm1            ; prop
    vaddps ymm0, ymm0, ymm1         ; prop + (intra*invq*fps_factor>>8)
    vcvtdq2ps ymm2, ymm2            ; intra - inter
    vrcpps ymm3, ymm9               ; 1/intra, 1st approximation
    vmulps ymm1, ymm9, ymm3         ; intra * (1/intra 1st approx)
    vmulps ymm1, ymm1, ymm3         ; intra * (1/intra 1st approx)^2
    vmulps ymm0, ymm0, ymm2         ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    vaddps ymm3, ymm3, ymm3         ; 2 * (1/intra 1st approx)
    vsubps ymm3, ymm3, ymm1         ; 2nd approximation of 1/intra (Newton-Raphson)
    vmulps ymm0, ymm0, ymm3         ; / intra
    vcvtps2dq ymm0, ymm0            ; round to nearest int
    vmovdqu [r0+r6*2], ymm0
    add r6, 16                      ; 8 elements per iteration
    jl .loop
    vzeroupper
    RET
%endif ; ARCH_X86_64
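
For reference, here is the per-element computation the loop above vectorizes, as a
scalar C sketch. It assumes x264's mbtree_propagate_cost() semantics as inferred
from the asm comments; the function name mbtree_propagate_cost_c is illustrative,
the 0x3fff constant mirrors pw_3fff, and the +0.5f stands in for the asm's
round-to-nearest vcvtps2dq.

#include <stdint.h>

/* Scalar sketch, not the verbatim x264 reference implementation. */
static void mbtree_propagate_cost_c( int *dst, uint16_t *propagate_in,
                                     uint16_t *intra_costs, uint16_t *inter_costs,
                                     uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor / 256.f;                /* vmulps by pf_inv256 */
    for( int i = 0; i < len; i++ )
    {
        float intra  = intra_costs[i];
        float inter  = inter_costs[i] & 0x3fff;     /* vpand with pw_3fff */
        float amount = propagate_in[i] + intra_costs[i] * inv_qscales[i] * fps;
        dst[i] = (int)( amount * (intra - inter) / intra + 0.5f );
    }
}

The division is the expensive part, which is why the AVX version replaces it with
vrcpps (roughly 12 bits of accuracy) refined by one Newton-Raphson step,
x1 = x0*(2 - intra*x0), rather than a true divide.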