Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -----------------------------------------------------------------------------
# rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2
#
# Compiler-generated (LLVM, Intel syntax) AVX2 kernel: 8x8 SATD — sum of
# absolute values of the 2-D Hadamard transform of (src - dst) — for
# high-bit-depth (16-bit) pixels.
#
# ABI:   System V AMD64
# In:    rdi = src pixel pointer (i16), rsi = src stride
#        rdx = dst pixel pointer (i16), rcx = dst stride
#        NOTE(review): row k is loaded at byte offset 2*k*stride, so the
#        strides appear to be counted in 16-bit elements — confirm vs caller.
# Out:   rax = un-normalized SATD sum
# Stack: 320 bytes reserved, rsp aligned to 64; the low 256 bytes hold the
#        8x8 i32 difference/coefficient matrix (one 32-byte row per pixel row).
# Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, ymm0-ymm8, flags.
#           rbx, r12, r14, r15, rbp are callee-saved and restored below.
# -----------------------------------------------------------------------------
.section .text.rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2,"ax",@progbits
.p2align 4, 0x90
.type rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2,@function
rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2:
        .cfi_startproc
        # --- prologue: frame pointer + callee-saved regs, 64-byte-align rsp --
        push    rbp
        .cfi_def_cfa_offset 16
        .cfi_offset rbp, -16
        mov     rbp, rsp
        .cfi_def_cfa_register rbp
        push    r15
        push    r14
        push    r12
        push    rbx
        and     rsp, -64                # align scratch area for vmovdqa stores
        sub     rsp, 320
        .cfi_offset rbx, -48
        .cfi_offset r12, -40
        .cfi_offset r14, -32
        .cfi_offset r15, -24
        # --- phase 1: load 8 rows of 8 i16 pixels from src and dst, ---------
        # --- sign-extend to i32, and form the difference matrix -------------
        vpmovsxwd ymm0, xmmword ptr [rdi]           # src row 0
        vpmovsxwd ymm1, xmmword ptr [rdx]           # dst row 0
        vpmovsxwd ymm2, xmmword ptr [rdx + 2*rcx]   # dst row 1
        vpmovsxwd ymm3, xmmword ptr [rdi + 4*rsi]   # src row 2
        vpmovsxwd ymm4, xmmword ptr [rdx + 4*rcx]   # dst row 2
        lea     r8, [rsi + 2*rsi]                   # r8  = 3*src_stride
        lea     rbx, [rcx + 2*rcx]                  # rbx = 3*dst_stride
        vpmovsxwd ymm6, xmmword ptr [rdx + 8*rcx]   # dst row 4
        lea     rax, [rsi + 4*rsi]                  # rax = 5*src_stride
        vpmovsxwd ymm5, xmmword ptr [rdi + 2*r8]    # src row 3
        vpmovsxwd ymm7, xmmword ptr [rdi + 4*r8]    # src row 6
        vpmovsxwd ymm8, xmmword ptr [rdx + 4*rbx]   # dst row 6
        vpsubd  ymm0, ymm0, ymm1                    # diff row 0
        vpmovsxwd ymm1, xmmword ptr [rdi + 2*rsi]   # src row 1
        vmovd   r12d, xmm0                          # seed column 0, row 0
        vpsubd  ymm1, ymm1, ymm2                    # diff row 1
        vpsubd  ymm2, ymm3, ymm4                    # diff row 2
        vpmovsxwd ymm3, xmmword ptr [rdx + 2*rbx]   # dst row 3
        vpmovsxwd ymm4, xmmword ptr [rdi + 8*rsi]   # src row 4
        vmovd   r9d, xmm1                           # seed column 0, row 1
        vmovd   r8d, xmm2                           # seed column 0, row 2
        vpsubd  ymm3, ymm5, ymm3                    # diff row 3
        vpmovsxwd ymm5, xmmword ptr [rdi + 2*rax]   # src row 5
        lea     rax, [rcx + 4*rcx]                  # rax = 5*dst_stride
        vpsubd  ymm4, ymm4, ymm6                    # diff row 4
        vpmovsxwd ymm6, xmmword ptr [rdx + 2*rax]   # dst row 5
        mov     rax, rsi
        vmovd   r10d, xmm3                          # seed column 0, row 3
        vmovd   r15d, xmm4                          # seed column 0, row 4
        shl     rax, 4                              # rax = 16*src_stride
        sub     rax, rsi
        sub     rax, rsi                            # rax = 14*src_stride (row 7)
        vpsubd  ymm6, ymm5, ymm6                    # diff row 5
        vpsubd  ymm5, ymm7, ymm8                    # diff row 6
        vpmovsxwd ymm7, xmmword ptr [rdi + rax]     # src row 7
        mov     rax, rcx
        shl     rax, 4
        vmovd   r11d, xmm6                          # seed column 0, row 5
        vmovd   r14d, xmm5                          # seed column 0, row 6
        sub     rax, rcx
        sub     rax, rcx                            # rax = 14*dst_stride (row 7)
        mov     rcx, -7                             # column counter: -7..0
        vpmovsxwd ymm8, xmmword ptr [rdx + rax]     # dst row 7
        # spill the 8 diff rows to the stack matrix (32 bytes per row)
        vmovdqa ymmword ptr [rsp], ymm0
        vmovdqa ymmword ptr [rsp + 32], ymm1
        vmovdqa ymmword ptr [rsp + 64], ymm2
        vmovdqa ymmword ptr [rsp + 96], ymm3
        vmovdqa ymmword ptr [rsp + 128], ymm4
        vmovdqa ymmword ptr [rsp + 160], ymm6
        vmovdqa ymmword ptr [rsp + 192], ymm5
        vpsubd  ymm7, ymm7, ymm8                    # diff row 7
        vmovd   ebx, xmm7                           # seed column 0, row 7
        vmovdqa ymmword ptr [rsp + 224], ymm7
        .p2align 4, 0x90
# --- phase 2: vertical 8-point Hadamard butterflies, one column per ----------
# --- iteration, done in scalar GPRs. rcx runs -7..0, so displacement ---------
# --- 4*rcx+28 addresses column (rcx+7) of each 32-byte matrix row. -----------
.LBB666_1:
        lea     esi, [r9 + r12]         # stage-1 butterflies (rows 0/1, 2/3, 4/5, 6/7)
        sub     r12d, r9d
        lea     edx, [r10 + r8]
        sub     r8d, r10d
        lea     edi, [r11 + r15]
        lea     r9d, [rbx + r14]
        sub     r14d, ebx
        sub     r15d, r11d
        lea     r10d, [rdx + rsi]       # stage-2 butterflies
        sub     esi, edx
        lea     edx, [r8 + r12]
        lea     ebx, [r9 + rdi]
        sub     r12d, r8d
        lea     eax, [r14 + r15]
        sub     edi, r9d
        sub     r15d, r14d
        lea     r8d, [rbx + r10]        # stage-3 butterflies
        sub     r10d, ebx
        lea     ebx, [rax + rdx]
        sub     edx, eax
        lea     eax, [rdi + rsi]
        sub     esi, edi
        lea     edi, [r15 + r12]
        sub     r12d, r15d
        # store transformed column back into the matrix
        mov     dword ptr [rsp + 4*rcx + 28], r8d
        mov     dword ptr [rsp + 4*rcx + 60], ebx
        mov     dword ptr [rsp + 4*rcx + 92], eax
        mov     dword ptr [rsp + 4*rcx + 124], edi
        mov     dword ptr [rsp + 4*rcx + 156], r10d
        mov     dword ptr [rsp + 4*rcx + 188], edx
        mov     dword ptr [rsp + 4*rcx + 220], esi
        mov     dword ptr [rsp + 4*rcx + 252], r12d
        test    rcx, rcx                # last column (rcx == 0) done?
        je      .LBB666_2
        # load the next column's 8 values (displacements +4 vs the stores)
        mov     r12d, dword ptr [rsp + 4*rcx + 32]
        mov     r9d, dword ptr [rsp + 4*rcx + 64]
        mov     r8d, dword ptr [rsp + 4*rcx + 96]
        mov     r10d, dword ptr [rsp + 4*rcx + 128]
        mov     r15d, dword ptr [rsp + 4*rcx + 160]
        mov     r11d, dword ptr [rsp + 4*rcx + 192]
        mov     r14d, dword ptr [rsp + 4*rcx + 224]
        mov     ebx, dword ptr [rsp + 4*rcx + 256]
        inc     rcx
        jmp     .LBB666_1
.LBB666_2:
        xor     eax, eax                # rax = row byte offset, 0..224 step 32
        .p2align 4, 0x90
# --- phase 3: horizontal 8-point Hadamard per row, in-place on the matrix ----
.LBB666_3:
        mov     ecx, dword ptr [rsp + rax]          # c0
        mov     esi, dword ptr [rsp + rax + 4]      # c1
        mov     ebx, dword ptr [rsp + rax + 8]      # c2
        mov     edx, dword ptr [rsp + rax + 20]     # c5
        mov     edi, dword ptr [rsp + rax + 28]     # c7
        lea     r10d, [rsi + rcx]       # stage 1: c0±c1
        sub     ecx, esi
        mov     esi, dword ptr [rsp + rax + 12]     # c3
        lea     r9d, [rsi + rbx]        # c2±c3
        sub     ebx, esi
        mov     esi, dword ptr [rsp + rax + 16]     # c4
        lea     r8d, [r9 + r10]         # stage 2 (even half)
        sub     r10d, r9d
        lea     r9d, [rbx + rcx]
        sub     ecx, ebx
        lea     r11d, [rdx + rsi]       # stage 1: c4±c5
        sub     esi, edx
        mov     edx, dword ptr [rsp + rax + 24]     # c6
        lea     r14d, [rdi + rdx]       # c6±c7
        sub     edx, edi
        lea     edi, [r14 + r11]        # stage 2 (odd half)
        sub     r11d, r14d
        lea     ebx, [rdx + rsi]
        sub     esi, edx
        lea     r14d, [rdi + r8]        # stage 3
        sub     r8d, edi
        lea     edi, [rbx + r9]
        sub     r9d, ebx
        lea     ebx, [r11 + r10]
        lea     edx, [rsi + rcx]
        sub     r10d, r11d
        sub     ecx, esi
        # write the 8 fully-transformed coefficients back
        mov     dword ptr [rsp + rax], r14d
        mov     dword ptr [rsp + rax + 4], edi
        mov     dword ptr [rsp + rax + 8], ebx
        mov     dword ptr [rsp + rax + 12], edx
        mov     dword ptr [rsp + rax + 16], r8d
        mov     dword ptr [rsp + rax + 20], r9d
        mov     dword ptr [rsp + rax + 24], r10d
        mov     dword ptr [rsp + rax + 28], ecx
        add     rax, 32
        cmp     rax, 256                # 8 rows x 32 bytes processed?
        jne     .LBB666_3
        # --- phase 4: sum |coefficient| over the whole 8x8 matrix ------------
        vpabsd  ymm0, ymmword ptr [rsp]
        vpabsd  ymm1, ymmword ptr [rsp + 32]
        vpabsd  ymm2, ymmword ptr [rsp + 96]
        vpaddd  ymm0, ymm1, ymm0
        vpabsd  ymm1, ymmword ptr [rsp + 64]
        vpaddd  ymm1, ymm1, ymm2
        vpabsd  ymm2, ymmword ptr [rsp + 160]
        vpaddd  ymm0, ymm0, ymm1
        vpabsd  ymm1, ymmword ptr [rsp + 128]
        vpaddd  ymm1, ymm1, ymm2
        vpabsd  ymm2, ymmword ptr [rsp + 192]
        vpaddd  ymm1, ymm1, ymm2
        vpabsd  ymm2, ymmword ptr [rsp + 224]
        vpaddd  ymm0, ymm0, ymm1
        vpaddd  ymm0, ymm0, ymm2
        # horizontal reduction of the 8 i32 lanes of ymm0 into rax
        # (vmovd/vpextrd zero-extend into the 64-bit registers)
        vmovd   eax, xmm0
        vpextrd ecx, xmm0, 1
        add     rcx, rax
        vpextrd eax, xmm0, 2
        add     rax, rcx
        vpextrd ecx, xmm0, 3
        vextracti128 xmm0, ymm0, 1      # upper 4 lanes
        add     rcx, rax
        vmovd   eax, xmm0
        vpextrd edx, xmm0, 2
        add     rax, rcx
        vpextrd ecx, xmm0, 1
        add     rcx, rax
        vpextrd eax, xmm0, 3
        add     rdx, rcx
        add     rax, rdx                # rax = final SATD sum
        # --- epilogue: restore callee-saved regs, drop AVX upper state -------
        lea     rsp, [rbp - 32]         # rbp-32 = address of the 4 pushes
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        pop     rbp
        .cfi_def_cfa rsp, 8
        vzeroupper                      # SysV: clear upper ymm state before ret
        ret
.Lfunc_end666:
        .size rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2, .Lfunc_end666-rav1e::asm::x86::dist::hbd::satd_kernel_8x8_hbd_avx2
        .cfi_endproc
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement