Advertisement
shssoichiro

4x4 kernel after

Oct 26th, 2022
1,316
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. .section .text.rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2,"ax",@progbits
  2.     .p2align    4, 0x90
  3.     .type   rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2,@function
  4. rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2:
  5.  
  # ---------------------------------------------------------------------------
  # Sum of absolute values of a 4x4 add/subtract (Hadamard-style) butterfly
  # transform applied to the element-wise difference of two 4x4 blocks of
  # signed 16-bit samples.
  #
  # Inputs as used by the code below (register roles are read off the
  # addressing modes; presumably SysV AMD64 argument order - TODO confirm
  # against the Rust caller):
  #   rdi = base address of the first block  (minuend)
  #   rsi = row stride of the first block, in 2-byte elements
  #   rdx = base address of the second block (subtrahend)
  #   rcx = row stride of the second block, in 2-byte elements
  # Output:
  #   rax = 64-bit sum of the eight 32-bit lanes of |H(V0)|+|H(V1)| and
  #         |H(V2)|+|H(V3)| (see the notation below); no scaling or rounding
  #         is applied in this function.
  # Stack: six callee-saved GPRs are pushed, plus 24 bytes of spill slots
  #        at [rsp], [rsp+8], [rsp+16].
  #
  # Notation used in the comments:
  #   d0..d3  = difference rows (first block row - second block row), 4 x i32
  #   Vk[c]   = vertical butterfly outputs for column c, with
  #             a=d0[c], b=d1[c], c=d2[c], d=d3[c]:
  #               V0=(a+b)+(c+d)  V1=(a+b)-(c+d)
  #               V2=(a-b)+(c-d)  V3=(a-b)-(c-d)
  #   H(Vk)   = horizontal butterfly of row Vk, with p,q,r,s = Vk[0..3]:
  #               [ (p+q)+(r+s), (p-q)+(r-s), (p+q)-(r+s), (p-q)-(r-s) ]
  # ---------------------------------------------------------------------------
  6.     .cfi_startproc
  7.     push rbp
  8.     .cfi_def_cfa_offset 16
  9.     push r15
  10.     .cfi_def_cfa_offset 24
  11.     push r14
  12.     .cfi_def_cfa_offset 32
  13.     push r13
  14.     .cfi_def_cfa_offset 40
  15.     push r12
  16.     .cfi_def_cfa_offset 48
  17.     push rbx
  18.     .cfi_def_cfa_offset 56
  19.     sub rsp, 24  # three 8-byte spill slots for column-1 intermediates
  20.     .cfi_def_cfa_offset 80
  21.     .cfi_offset rbx, -56
  22.     .cfi_offset r12, -48
  23.     .cfi_offset r13, -40
  24.     .cfi_offset r14, -32
  25.     .cfi_offset r15, -24
  26.     .cfi_offset rbp, -16
  27.  # ---- Load rows (4 x i16 sign-extended to 4 x i32) and form differences ----
  28.     vpmovsxwd xmm0, qword ptr [rdi]  # first block, row 0
  29.  
  30.     vpmovsxwd xmm1, qword ptr [rdx]  # second block, row 0
  31.  
  32.     vpmovsxwd xmm2, qword ptr [rdx + 2*rcx]  # second block, row 1 (stride scaled by 2 bytes)
  33.  
  34.     vpmovsxwd xmm3, qword ptr [rdx + 4*rcx]  # second block, row 2
  35.  
  36.     lea rax, [rsi + 2*rsi]  # rax = 3 * first-block stride (for row 3)
  37.  
  38.     vpsubd xmm1, xmm0, xmm1  # xmm1 = d0
  39.  
  40.     vpmovsxwd xmm0, qword ptr [rdi + 2*rsi]  # first block, row 1
  41.  
  42.     vmovd r9d, xmm1  # r9d = d0[0]
  43.     vpextrd r14d, xmm1, 1  # r14d = d0[1]
  44.  
  45.     vpsubd xmm2, xmm0, xmm2  # xmm2 = d1
  46.  
  47.     vpmovsxwd xmm0, qword ptr [rdi + 4*rsi]  # first block, row 2
  48.  
  49.     vpsubd xmm0, xmm0, xmm3  # xmm0 = d2
  50.  
  51.     vpmovsxwd xmm3, qword ptr [rdi + 2*rax]  # first block, row 3 (offset 6*rsi bytes)
  52.  
  53.     lea rax, [rcx + 2*rcx]  # rax = 3 * second-block stride
  54.  
  55.     vpmovsxwd xmm4, qword ptr [rdx + 2*rax]  # second block, row 3
  56.  # ---- Vertical butterflies, done in scalar registers one column at a time ----
  57.     vmovd eax, xmm2  # eax = d1[0]
  58.  
  59.     vpextrd edi, xmm0, 2  # edi = d2[2]; rdi is no longer needed as a pointer
  60.  
  61.     lea r12d, [rax + r9]  # col 0: a+b
  62.  
  63.     sub r9d, eax  # col 0: a-b
  64.  
  65.     vmovd eax, xmm0  # eax = d2[0]
  66.  
  67.     vpsubd xmm3, xmm3, xmm4  # xmm3 = d3
  68.  
  69.     vmovd edx, xmm3  # edx = d3[0]; rdx is no longer needed as a pointer
  70.  
  71.     vpextrd r8d, xmm3, 3  # r8d = d3[3], consumed later by column 3
  72.  
  73.     lea esi, [rdx + rax]  # col 0: c+d
  74.  
  75.     sub eax, edx  # col 0: c-d
  76.  
  77.     vpextrd edx, xmm3, 1  # edx = d3[1]
  78.  
  79.     lea r15d, [rax + r9]  # r15d = V2[0]
  80.     sub r9d, eax  # r9d = V3[0]
  81.  
  82.     vpextrd eax, xmm2, 1  # eax = d1[1]
  83.  
  84.     lea r11d, [rsi + r12]  # r11d = V0[0]
  85.  
  86.     sub r12d, esi  # r12d = V1[0]
  87.  
  88.     lea ecx, [rax + r14]  # col 1: a+b
  89.  
  90.     sub r14d, eax  # col 1: a-b
  91.  
  92.     vpextrd eax, xmm0, 1  # eax = d2[1]
  93.  
  94.     lea esi, [rdx + rax]  # col 1: c+d
  95.  
  96.     sub eax, edx  # col 1: c-d
  97.  
  98.     lea edx, [rsi + rcx]  # edx = V0[1]
  99.  
  100.     sub ecx, esi  # ecx = V1[1]
  101.  
  102.     vpextrd esi, xmm1, 2  # esi = d0[2]
  103.  
  104.     mov qword ptr [rsp + 16], rcx  # spill V1[1] (upper 32 bits are zero from the 32-bit sub)
  105.  
  106.     lea ecx, [rax + r14]  # ecx = V2[1]
  107.  
  108.     sub r14d, eax  # r14d = V3[1]
  109.  
  110.     vpextrd eax, xmm2, 2  # eax = d1[2]
  111.  
  112.     mov qword ptr [rsp], rdx  # spill V0[1]
  113.     mov qword ptr [rsp + 8], rcx  # spill V2[1]
  114.  
  115.     lea r10d, [rax + rsi]  # col 2: a+b
  116.  
  117.     sub esi, eax  # col 2: a-b
  118.  
  119.     vpextrd eax, xmm3, 2  # eax = d3[2]
  120.  
  121.     lea ebp, [rax + rdi]  # col 2: c+d (edi holds d2[2])
  122.  
  123.     sub edi, eax  # col 2: c-d
  124.  
  125.     lea ebx, [rbp + r10]  # ebx = V0[2]
  126.  
  127.     sub r10d, ebp  # r10d = V1[2]
  128.  
  129.     lea edx, [rdi + rsi]  # edx = V2[2]
  130.     sub esi, edi  # esi = V3[2]
  131.  
  132.     vpextrd edi, xmm1, 3  # edi = d0[3]
  133.  
  134.     vpextrd ebp, xmm2, 3  # ebp = d1[3]
  135.  
  136.     lea r13d, [rbp + rdi]  # col 3: a+b
  137.  
  138.     sub edi, ebp  # col 3: a-b
  139.  
  140.     vpextrd ebp, xmm0, 3  # ebp = d2[3]
  141.  
  142.     lea eax, [r8 + rbp]  # col 3: c+d (r8d holds d3[3])
  143.  
  144.     sub ebp, r8d  # col 3: c-d
  145.  
  146.     lea ecx, [rax + r13]  # ecx = V0[3]
  147.  
  148.     sub r13d, eax  # r13d = V1[3]
  149.  
  150.     mov rax, qword ptr [rsp]  # reload V0[1]
  151.  
  152.     lea r8d, [rbp + rdi]  # r8d = V2[3]
  153.     sub edi, ebp  # edi = V3[3]
  154.  # ---- Horizontal butterfly of row V0 = [r11d, eax, ebx, ecx] ----
  155.     lea ebp, [rax + r11]  # p+q
  156.  
  157.     sub r11d, eax  # p-q
  158.  
  159.     lea eax, [rcx + rbx]  # r+s
  160.  
  161.     sub ebx, ecx  # r-s
  162.  
  163.     lea ecx, [rax + rbp]  # H(V0)[0] = (p+q)+(r+s)
  164.  
  165.     sub ebp, eax  # ebp = H(V0)[2] = (p+q)-(r+s)
  166.  
  167.     mov rax, qword ptr [rsp + 8]  # reload V2[1]
  168.  
  169.     mov dword ptr [rsp], ecx  # H(V0)[0] goes through the stack slot into xmm1 lane 0
  170.  
  171.     lea ecx, [rbx + r11]  # ecx = H(V0)[1] = (p-q)+(r-s)
  172.  
  173.     sub r11d, ebx  # r11d = H(V0)[3] = (p-q)-(r-s)
  174.  
  175.     vmovd xmm1, dword ptr [rsp]  # xmm1 = [H(V0)[0], 0, 0, 0]
  176.  # ---- Horizontal butterfly of row V2 = [r15d, eax, edx, r8d] ----
  177.     lea ebx, [rax + r15]  # p+q
  178.  
  179.     sub r15d, eax  # p-q
  180.  
  181.     lea eax, [r8 + rdx]  # r+s
  182.  
  183.     sub edx, r8d  # r-s
  184.  
  185.     lea r8d, [rax + rbx]  # r8d = H(V2)[0]
  186.  
  187.     sub ebx, eax  # ebx = H(V2)[2]
  188.  
  189.     lea eax, [rdx + r15]  # eax = H(V2)[1]
  190.  
  191.     sub r15d, edx  # r15d = H(V2)[3]
  192.  # ---- Horizontal butterfly of row V3 = [r9d, r14d, esi, edi], interleaved with vector packing ----
  193.     lea edx, [r14 + r9]  # p+q
  194.  
  195.     sub r9d, r14d  # p-q
  196.  
  197.     vpinsrd xmm1, xmm1, ecx, 1  # xmm1 lane 1 = H(V0)[1]
  198.     mov rcx, qword ptr [rsp + 16]  # reload V1[1]
  199.     vmovd xmm0, r8d  # xmm0 lane 0 = H(V2)[0]
  200.     vpinsrd xmm1, xmm1, ebp, 2  # xmm1 lane 2 = H(V0)[2]
  201.     vpinsrd xmm1, xmm1, r11d, 3  # xmm1 = H(V0)
  202.     vpinsrd xmm0, xmm0, eax, 1  # xmm0 lane 1 = H(V2)[1]
  203.     vpinsrd xmm0, xmm0, ebx, 2  # xmm0 lane 2 = H(V2)[2]
  204.  
  205.     lea ebx, [rdi + rsi]  # r+s
  206.  
  207.     sub esi, edi  # r-s
  208.  
  209.     lea edi, [rbx + rdx]  # edi = H(V3)[0]
  210.  
  211.     sub edx, ebx  # edx = H(V3)[2]
  212.  
  213.     lea ebx, [rsi + r9]  # ebx = H(V3)[1]
  214.  
  215.     vpinsrd xmm0, xmm0, r15d, 3  # xmm0 = H(V2)
  216.  
  217.     sub r9d, esi  # r9d = H(V3)[3]
  218.  
  219.     vmovd xmm2, edi  # xmm2 lane 0 = H(V3)[0]
  220.     vinserti128 ymm0, ymm1, xmm0, 1  # ymm0 = { low: H(V0), high: H(V2) }
  221.  # ---- Horizontal butterfly of row V1 = [r12d, ecx, r10d, r13d] ----
  222.     lea eax, [rcx + r12]  # p+q
  223.  
  224.     sub r12d, ecx  # p-q
  225.  
  226.     lea ecx, [r13 + r10]  # r+s
  227.  
  228.     sub r10d, r13d  # r-s
  229.  
  230.     vpabsd ymm0, ymm0  # ymm0 = |H(V0)|, |H(V2)|
  231.  
  232.     lea ebp, [rcx + rax]  # ebp = H(V1)[0]
  233.  
  234.     sub eax, ecx  # eax = H(V1)[2]
  235.  
  236.     lea ecx, [r10 + r12]  # ecx = H(V1)[1]
  237.  
  238.     vpinsrd xmm2, xmm2, ebx, 1  # xmm2 lane 1 = H(V3)[1]
  239.  
  240.     sub r12d, r10d  # r12d = H(V1)[3]
  241.  
  242.     vmovd xmm3, ebp  # xmm3 lane 0 = H(V1)[0]
  243.     vpinsrd xmm2, xmm2, edx, 2  # xmm2 lane 2 = H(V3)[2]
  244.     vpinsrd xmm2, xmm2, r9d, 3  # xmm2 = H(V3)
  245.     vpinsrd xmm3, xmm3, ecx, 1  # xmm3 lane 1 = H(V1)[1]
  246.     vpinsrd xmm1, xmm3, eax, 2  # xmm1 takes over from xmm3; lane 2 = H(V1)[2]
  247.     vpinsrd xmm1, xmm1, r12d, 3  # xmm1 = H(V1)
  248.     vinserti128 ymm1, ymm1, xmm2, 1  # ymm1 = { low: H(V1), high: H(V3) }
  249.  
  250.     vpabsd ymm1, ymm1  # ymm1 = |H(V1)|, |H(V3)|
  251.  
  252.     vpaddd ymm0, ymm1, ymm0  # eight 32-bit lane-wise partial sums
  253.  # ---- Reduce the eight 32-bit lanes to one 64-bit total in rax ----
  254.     vmovd eax, xmm0  # low lane 0 (32-bit destination zero-extends into rax)
  255.  
  256.     vpextrd ecx, xmm0, 1  # low lane 1
  257.  
  258.     add rcx, rax  # accumulation is done with 64-bit adds from here on
  259.  
  260.     vpextrd eax, xmm0, 2  # low lane 2
  261.  
  262.     add rax, rcx
  263.  
  264.     vpextrd ecx, xmm0, 3  # low lane 3
  265.  
  266.     vextracti128 xmm0, ymm0, 1  # xmm0 = high 128 bits of ymm0
  267.  
  268.     add rcx, rax
  269.  
  270.     vmovd eax, xmm0  # high lane 0
  271.     vpextrd edx, xmm0, 2  # high lane 2
  272.  
  273.     add rax, rcx
  274.  
  275.     vpextrd ecx, xmm0, 1  # high lane 1
  276.  
  277.     add rcx, rax
  278.  
  279.     vpextrd eax, xmm0, 3  # high lane 3
  280.  
  281.     add rdx, rcx
  282.  
  283.     add rax, rdx  # rax = sum of all eight lanes
  284.  # ---- Epilogue: release spill area, restore saved registers in reverse push order ----
  285.     add rsp, 24
  286.     .cfi_def_cfa_offset 56
  287.     pop rbx
  288.     .cfi_def_cfa_offset 48
  289.     pop r12
  290.  
  291.     .cfi_def_cfa_offset 40
  292.     pop r13
  293.  
  294.     .cfi_def_cfa_offset 32
  295.     pop r14
  296.  
  297.     .cfi_def_cfa_offset 24
  298.     pop r15
  299.     .cfi_def_cfa_offset 16
  300.     pop rbp
  301.  
  302.     .cfi_def_cfa_offset 8
  303.     vzeroupper  # clear upper halves of the ymm registers before returning
  304.  
  305.     ret
  306.  
  307. .Lfunc_end666:
  308.     .size   rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2, .Lfunc_end666-rav1e::asm::x86::dist::hbd::satd_kernel_4x4_hbd_avx2
  309.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement