Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- __attribute__((noinline))
- void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
- for (ssize_t ii=0; ii < nn; ii++) {
- dd[ii] = (
- aa[ii]*uu +
- bb[ii]*vv +
- cc[ii]
- );
- }
- }
// Complex number stored as two named float members (real part, imaginary
// part). This is the layout variant that produces the scalarized/shuffling
// code in the first disassembly dump.
struct cfloat {
    cfloat(float real_part, float imag_part) {
        re = real_part;
        im = imag_part;
    }
    float re, im;
};
- cfloat operator +(cfloat a, cfloat b) {
- return cfloat(a.re+b.re, a.im+b.im);
- }
- cfloat operator *(cfloat a, cfloat b) {
- return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);
- }
- 0x00000000004006a0 <+0>: push %r15
- 0x00000000004006a2 <+2>: test %r8,%r8
- 0x00000000004006a5 <+5>: push %r14
- 0x00000000004006a7 <+7>: push %r13
- 0x00000000004006a9 <+9>: push %r12
- 0x00000000004006ab <+11>: push %rbp
- 0x00000000004006ac <+12>: push %rbx
- 0x00000000004006ad <+13>: movq %xmm0,-0x28(%rsp)
- 0x00000000004006b3 <+19>: mov %rdi,-0x38(%rsp)
- 0x00000000004006b8 <+24>: mov -0x28(%rsp),%rax
- 0x00000000004006bd <+29>: movq %xmm1,-0x28(%rsp)
- 0x00000000004006c3 <+35>: mov -0x28(%rsp),%r9
- 0x00000000004006c8 <+40>: je 0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
- 0x00000000004006ce <+46>: mov %r9,%r15
- 0x00000000004006d1 <+49>: mov %rax,%r14
- 0x00000000004006d4 <+52>: xor %r11d,%r11d
- 0x00000000004006d7 <+55>: shr $0x20,%r15
- 0x00000000004006db <+59>: shr $0x20,%r14
- 0x00000000004006df <+63>: xor %r10d,%r10d
- 0x00000000004006e2 <+66>: mov %r15d,-0x2c(%rsp)
- 0x00000000004006e7 <+71>: xor %ebp,%ebp
- 0x00000000004006e9 <+73>: xor %ebx,%ebx
- 0x00000000004006eb <+75>: movss -0x2c(%rsp),%xmm6
- 0x00000000004006f1 <+81>: mov %r9d,-0x2c(%rsp)
- 0x00000000004006f6 <+86>: movss -0x2c(%rsp),%xmm5
- 0x00000000004006fc <+92>: mov %r14d,-0x2c(%rsp)
- 0x0000000000400701 <+97>: movss -0x2c(%rsp),%xmm4
- 0x0000000000400707 <+103>: mov %eax,-0x2c(%rsp)
- 0x000000000040070b <+107>: xor %r13d,%r13d
- 0x000000000040070e <+110>: xor %r12d,%r12d
- 0x0000000000400711 <+113>: movabs $0xffffffff00000000,%r9
- 0x000000000040071b <+123>: movss -0x2c(%rsp),%xmm3
- 0x0000000000400721 <+129>: nopl 0x0(%rax)
- 0x0000000000400728 <+136>: lea 0x0(,%r13,8),%rax
- 0x0000000000400730 <+144>: movaps %xmm6,%xmm1
- 0x0000000000400733 <+147>: movaps %xmm5,%xmm7
- 0x0000000000400736 <+150>: and $0xffffffff,%ebp
- 0x0000000000400739 <+153>: lea (%rsi,%rax,1),%r15
- 0x000000000040073d <+157>: lea (%rdx,%rax,1),%r14
- 0x0000000000400741 <+161>: add -0x38(%rsp),%rax
- 0x0000000000400746 <+166>: and $0xffffffff,%ebx
- 0x0000000000400749 <+169>: add $0x1,%r12
- 0x000000000040074d <+173>: movss (%r15),%xmm0
- 0x0000000000400752 <+178>: movss 0x4(%r15),%xmm2
- 0x0000000000400758 <+184>: mulss %xmm0,%xmm1
- 0x000000000040075c <+188>: mulss %xmm2,%xmm7
- 0x0000000000400760 <+192>: mulss %xmm5,%xmm0
- 0x0000000000400764 <+196>: mulss %xmm6,%xmm2
- 0x0000000000400768 <+200>: addss %xmm7,%xmm1
- 0x000000000040076c <+204>: movaps %xmm3,%xmm7
- 0x000000000040076f <+207>: subss %xmm2,%xmm0
- 0x0000000000400773 <+211>: movd %xmm1,-0x30(%rsp)
- 0x0000000000400779 <+217>: mov -0x30(%rsp),%edi
- 0x000000000040077d <+221>: movaps %xmm4,%xmm1
- 0x0000000000400780 <+224>: movd %xmm0,-0x30(%rsp)
- 0x0000000000400786 <+230>: mov %edi,%r15d
- 0x0000000000400789 <+233>: mov -0x30(%rsp),%edi
- 0x000000000040078d <+237>: movss (%rax),%xmm0
- 0x0000000000400791 <+241>: shl $0x20,%r15
- 0x0000000000400795 <+245>: movss 0x4(%rax),%xmm2
- 0x000000000040079a <+250>: mulss %xmm0,%xmm1
- 0x000000000040079e <+254>: or %r15,%rbp
- 0x00000000004007a1 <+257>: mulss %xmm2,%xmm7
- 0x00000000004007a5 <+261>: mov %edi,%r15d
- 0x00000000004007a8 <+264>: and %r9,%rbp
- 0x00000000004007ab <+267>: mulss %xmm3,%xmm0
- 0x00000000004007af <+271>: or %r15,%rbp
- 0x00000000004007b2 <+274>: mulss %xmm4,%xmm2
- 0x00000000004007b6 <+278>: addss %xmm7,%xmm1
- 0x00000000004007ba <+282>: subss %xmm2,%xmm0
- 0x00000000004007be <+286>: movd %xmm1,-0x30(%rsp)
- 0x00000000004007c4 <+292>: mov -0x30(%rsp),%edi
- 0x00000000004007c8 <+296>: movd %xmm0,-0x30(%rsp)
- 0x00000000004007ce <+302>: mov %edi,%eax
- 0x00000000004007d0 <+304>: mov -0x30(%rsp),%edi
- 0x00000000004007d4 <+308>: shl $0x20,%rax
- 0x00000000004007d8 <+312>: or %rax,%rbx
- 0x00000000004007db <+315>: and %r9,%rbx
- 0x00000000004007de <+318>: mov %edi,%eax
- 0x00000000004007e0 <+320>: or %rax,%rbx
- 0x00000000004007e3 <+323>: mov %r10,%rax
- 0x00000000004007e6 <+326>: mov %rbx,%rdi
- 0x00000000004007e9 <+329>: and $0xffffffff,%eax
- 0x00000000004007ec <+332>: shr $0x20,%rdi
- 0x00000000004007f0 <+336>: mov %edi,-0x20(%rsp)
- 0x00000000004007f4 <+340>: mov %rbp,%rdi
- 0x00000000004007f7 <+343>: shr $0x20,%rdi
- 0x00000000004007fb <+347>: movss -0x20(%rsp),%xmm0
- 0x0000000000400801 <+353>: mov %edi,-0x10(%rsp)
- 0x0000000000400805 <+357>: addss -0x10(%rsp),%xmm0
- 0x000000000040080b <+363>: mov %ebp,-0x10(%rsp)
- 0x000000000040080f <+367>: movss %xmm0,-0x20(%rsp)
- 0x0000000000400815 <+373>: mov -0x20(%rsp),%r10d
- 0x000000000040081a <+378>: mov %ebx,-0x20(%rsp)
- 0x000000000040081e <+382>: movss -0x20(%rsp),%xmm0
- 0x0000000000400824 <+388>: addss -0x10(%rsp),%xmm0
- 0x000000000040082a <+394>: shl $0x20,%r10
- 0x000000000040082e <+398>: or %rax,%r10
- 0x0000000000400831 <+401>: and %r9,%r10
- 0x0000000000400834 <+404>: movss %xmm0,-0x20(%rsp)
- 0x000000000040083a <+410>: mov -0x20(%rsp),%eax
- 0x000000000040083e <+414>: or %rax,%r10
- 0x0000000000400841 <+417>: mov %r11,%rax
- 0x0000000000400844 <+420>: mov %r10,%rdi
- 0x0000000000400847 <+423>: and $0xffffffff,%eax
- 0x000000000040084a <+426>: shr $0x20,%rdi
- 0x000000000040084e <+430>: mov %edi,-0x20(%rsp)
- 0x0000000000400852 <+434>: movss -0x20(%rsp),%xmm0
- 0x0000000000400858 <+440>: addss 0x4(%r14),%xmm0
- 0x000000000040085e <+446>: movss %xmm0,-0x20(%rsp)
- 0x0000000000400864 <+452>: mov -0x20(%rsp),%r11d
- 0x0000000000400869 <+457>: mov %r10d,-0x20(%rsp)
- 0x000000000040086e <+462>: movss -0x20(%rsp),%xmm0
- 0x0000000000400874 <+468>: addss (%r14),%xmm0
- 0x0000000000400879 <+473>: shl $0x20,%r11
- 0x000000000040087d <+477>: or %rax,%r11
- 0x0000000000400880 <+480>: and %r9,%r11
- 0x0000000000400883 <+483>: movss %xmm0,-0x20(%rsp)
- 0x0000000000400889 <+489>: mov -0x20(%rsp),%eax
- 0x000000000040088d <+493>: or %rax,%r11
- 0x0000000000400890 <+496>: cmp %r8,%r12
- 0x0000000000400893 <+499>: mov %r11,(%rcx,%r13,8)
- 0x0000000000400897 <+503>: mov %r12,%r13
- 0x000000000040089a <+506>: jne 0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
- 0x00000000004008a0 <+512>: pop %rbx
- 0x00000000004008a1 <+513>: pop %rbp
- 0x00000000004008a2 <+514>: pop %r12
- 0x00000000004008a4 <+516>: pop %r13
- 0x00000000004008a6 <+518>: pop %r14
- 0x00000000004008a8 <+520>: pop %r15
- 0x00000000004008aa <+522>: retq
// Complex number stored as a two-element float array: ri[0] = real part,
// ri[1] = imaginary part. Same size and field layout as the named-member
// variant, but this spelling is the one that yields the clean vectorizable
// code in the second disassembly dump.
struct cfloat {
    cfloat(float real_part, float imag_part) {
        ri[0] = real_part;
        ri[1] = imag_part;
    }
    float ri[2];
};
- cfloat operator +(cfloat a, cfloat b) {
- return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);
- }
- cfloat operator *(cfloat a, cfloat b) {
- return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);
- }
- Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
- 0x00000000004006a0 <+0>: push %rbx
- 0x00000000004006a1 <+1>: movq %xmm0,-0x8(%rsp)
- 0x00000000004006a7 <+7>: mov -0x8(%rsp),%r9
- 0x00000000004006ac <+12>: movq %xmm1,-0x8(%rsp)
- 0x00000000004006b2 <+18>: mov -0x8(%rsp),%rax
- 0x00000000004006b7 <+23>: mov %r9d,-0xc(%rsp)
- 0x00000000004006bc <+28>: shr $0x20,%r9
- 0x00000000004006c0 <+32>: movss -0xc(%rsp),%xmm9
- 0x00000000004006c7 <+39>: mov %r9d,-0xc(%rsp)
- 0x00000000004006cc <+44>: movss -0xc(%rsp),%xmm8
- 0x00000000004006d3 <+51>: mov %eax,-0xc(%rsp)
- 0x00000000004006d7 <+55>: shr $0x20,%rax
- 0x00000000004006db <+59>: movss -0xc(%rsp),%xmm7
- 0x00000000004006e1 <+65>: test %r8,%r8
- 0x00000000004006e4 <+68>: mov %eax,-0xc(%rsp)
- 0x00000000004006e8 <+72>: movss -0xc(%rsp),%xmm6
- 0x00000000004006ee <+78>: je 0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
- 0x00000000004006f4 <+84>: xor %eax,%eax
- 0x00000000004006f6 <+86>: xor %r9d,%r9d
- 0x00000000004006f9 <+89>: nopl 0x0(%rax)
- 0x0000000000400700 <+96>: shl $0x3,%rax
- 0x0000000000400704 <+100>: movaps %xmm7,%xmm0
- 0x0000000000400707 <+103>: lea (%rsi,%rax,1),%rbx
- 0x000000000040070b <+107>: movaps %xmm6,%xmm3
- 0x000000000040070e <+110>: lea (%rcx,%rax,1),%r10
- 0x0000000000400712 <+114>: lea (%rdx,%rax,1),%r11
- 0x0000000000400716 <+118>: lea (%rdi,%rax,1),%rax
- 0x000000000040071a <+122>: movss (%rbx),%xmm1
- 0x000000000040071e <+126>: add $0x1,%r9
- 0x0000000000400722 <+130>: movss 0x4(%rbx),%xmm5
- 0x0000000000400727 <+135>: mulss %xmm1,%xmm0
- 0x000000000040072b <+139>: mulss %xmm5,%xmm3
- 0x000000000040072f <+143>: movss (%rax),%xmm2
- 0x0000000000400733 <+147>: movaps %xmm8,%xmm10
- 0x0000000000400737 <+151>: mulss %xmm6,%xmm1
- 0x000000000040073b <+155>: movss 0x4(%rax),%xmm4
- 0x0000000000400740 <+160>: mulss %xmm7,%xmm5
- 0x0000000000400744 <+164>: mulss %xmm4,%xmm10
- 0x0000000000400749 <+169>: cmp %r8,%r9
- 0x000000000040074c <+172>: mov %r9,%rax
- 0x000000000040074f <+175>: subss %xmm3,%xmm0
- 0x0000000000400753 <+179>: movaps %xmm2,%xmm3
- 0x0000000000400756 <+182>: mulss %xmm9,%xmm4
- 0x000000000040075b <+187>: mulss %xmm9,%xmm3
- 0x0000000000400760 <+192>: addss %xmm5,%xmm1
- 0x0000000000400764 <+196>: mulss %xmm8,%xmm2
- 0x0000000000400769 <+201>: subss %xmm10,%xmm3
- 0x000000000040076e <+206>: addss %xmm4,%xmm2
- 0x0000000000400772 <+210>: addss %xmm3,%xmm0
- 0x0000000000400776 <+214>: addss %xmm2,%xmm1
- 0x000000000040077a <+218>: addss (%r11),%xmm0
- 0x000000000040077f <+223>: addss 0x4(%r11),%xmm1
- 0x0000000000400785 <+229>: movss %xmm0,(%r10)
- 0x000000000040078a <+234>: movss %xmm1,0x4(%r10)
- 0x0000000000400790 <+240>: jne 0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
- 0x0000000000400796 <+246>: pop %rbx
- 0x0000000000400797 <+247>: retq
- End of assembler dump.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement