__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc,
               cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
    for (ssize_t ii = 0; ii < nn; ii++) {
        dd[ii] = (
            aa[ii]*uu +
            bb[ii]*vv +
            cc[ii]
        );
    }
}

struct cfloat {
    cfloat(float re, float im) : re(re), im(im) {}
    float re, im;
};

cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.re+b.re, a.im+b.im);
}

cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);
}

Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
0x00000000004006a0 <+0>: push %r15
0x00000000004006a2 <+2>: test %r8,%r8
0x00000000004006a5 <+5>: push %r14
0x00000000004006a7 <+7>: push %r13
0x00000000004006a9 <+9>: push %r12
0x00000000004006ab <+11>: push %rbp
0x00000000004006ac <+12>: push %rbx
0x00000000004006ad <+13>: movq %xmm0,-0x28(%rsp)
0x00000000004006b3 <+19>: mov %rdi,-0x38(%rsp)
0x00000000004006b8 <+24>: mov -0x28(%rsp),%rax
0x00000000004006bd <+29>: movq %xmm1,-0x28(%rsp)
0x00000000004006c3 <+35>: mov -0x28(%rsp),%r9
0x00000000004006c8 <+40>: je 0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
0x00000000004006ce <+46>: mov %r9,%r15
0x00000000004006d1 <+49>: mov %rax,%r14
0x00000000004006d4 <+52>: xor %r11d,%r11d
0x00000000004006d7 <+55>: shr $0x20,%r15
0x00000000004006db <+59>: shr $0x20,%r14
0x00000000004006df <+63>: xor %r10d,%r10d
0x00000000004006e2 <+66>: mov %r15d,-0x2c(%rsp)
0x00000000004006e7 <+71>: xor %ebp,%ebp
0x00000000004006e9 <+73>: xor %ebx,%ebx
0x00000000004006eb <+75>: movss -0x2c(%rsp),%xmm6
0x00000000004006f1 <+81>: mov %r9d,-0x2c(%rsp)
0x00000000004006f6 <+86>: movss -0x2c(%rsp),%xmm5
0x00000000004006fc <+92>: mov %r14d,-0x2c(%rsp)
0x0000000000400701 <+97>: movss -0x2c(%rsp),%xmm4
0x0000000000400707 <+103>: mov %eax,-0x2c(%rsp)
0x000000000040070b <+107>: xor %r13d,%r13d
0x000000000040070e <+110>: xor %r12d,%r12d
0x0000000000400711 <+113>: movabs $0xffffffff00000000,%r9
0x000000000040071b <+123>: movss -0x2c(%rsp),%xmm3
0x0000000000400721 <+129>: nopl 0x0(%rax)
0x0000000000400728 <+136>: lea 0x0(,%r13,8),%rax
0x0000000000400730 <+144>: movaps %xmm6,%xmm1
0x0000000000400733 <+147>: movaps %xmm5,%xmm7
0x0000000000400736 <+150>: and $0xffffffff,%ebp
0x0000000000400739 <+153>: lea (%rsi,%rax,1),%r15
0x000000000040073d <+157>: lea (%rdx,%rax,1),%r14
0x0000000000400741 <+161>: add -0x38(%rsp),%rax
0x0000000000400746 <+166>: and $0xffffffff,%ebx
0x0000000000400749 <+169>: add $0x1,%r12
0x000000000040074d <+173>: movss (%r15),%xmm0
0x0000000000400752 <+178>: movss 0x4(%r15),%xmm2
0x0000000000400758 <+184>: mulss %xmm0,%xmm1
0x000000000040075c <+188>: mulss %xmm2,%xmm7
0x0000000000400760 <+192>: mulss %xmm5,%xmm0
0x0000000000400764 <+196>: mulss %xmm6,%xmm2
0x0000000000400768 <+200>: addss %xmm7,%xmm1
0x000000000040076c <+204>: movaps %xmm3,%xmm7
0x000000000040076f <+207>: subss %xmm2,%xmm0
0x0000000000400773 <+211>: movd %xmm1,-0x30(%rsp)
0x0000000000400779 <+217>: mov -0x30(%rsp),%edi
0x000000000040077d <+221>: movaps %xmm4,%xmm1
0x0000000000400780 <+224>: movd %xmm0,-0x30(%rsp)
0x0000000000400786 <+230>: mov %edi,%r15d
0x0000000000400789 <+233>: mov -0x30(%rsp),%edi
0x000000000040078d <+237>: movss (%rax),%xmm0
0x0000000000400791 <+241>: shl $0x20,%r15
0x0000000000400795 <+245>: movss 0x4(%rax),%xmm2
0x000000000040079a <+250>: mulss %xmm0,%xmm1
0x000000000040079e <+254>: or %r15,%rbp
0x00000000004007a1 <+257>: mulss %xmm2,%xmm7
0x00000000004007a5 <+261>: mov %edi,%r15d
0x00000000004007a8 <+264>: and %r9,%rbp
0x00000000004007ab <+267>: mulss %xmm3,%xmm0
0x00000000004007af <+271>: or %r15,%rbp
0x00000000004007b2 <+274>: mulss %xmm4,%xmm2
0x00000000004007b6 <+278>: addss %xmm7,%xmm1
0x00000000004007ba <+282>: subss %xmm2,%xmm0
0x00000000004007be <+286>: movd %xmm1,-0x30(%rsp)
0x00000000004007c4 <+292>: mov -0x30(%rsp),%edi
0x00000000004007c8 <+296>: movd %xmm0,-0x30(%rsp)
0x00000000004007ce <+302>: mov %edi,%eax
0x00000000004007d0 <+304>: mov -0x30(%rsp),%edi
0x00000000004007d4 <+308>: shl $0x20,%rax
0x00000000004007d8 <+312>: or %rax,%rbx
0x00000000004007db <+315>: and %r9,%rbx
0x00000000004007de <+318>: mov %edi,%eax
0x00000000004007e0 <+320>: or %rax,%rbx
0x00000000004007e3 <+323>: mov %r10,%rax
0x00000000004007e6 <+326>: mov %rbx,%rdi
0x00000000004007e9 <+329>: and $0xffffffff,%eax
0x00000000004007ec <+332>: shr $0x20,%rdi
0x00000000004007f0 <+336>: mov %edi,-0x20(%rsp)
0x00000000004007f4 <+340>: mov %rbp,%rdi
0x00000000004007f7 <+343>: shr $0x20,%rdi
0x00000000004007fb <+347>: movss -0x20(%rsp),%xmm0
0x0000000000400801 <+353>: mov %edi,-0x10(%rsp)
0x0000000000400805 <+357>: addss -0x10(%rsp),%xmm0
0x000000000040080b <+363>: mov %ebp,-0x10(%rsp)
0x000000000040080f <+367>: movss %xmm0,-0x20(%rsp)
0x0000000000400815 <+373>: mov -0x20(%rsp),%r10d
0x000000000040081a <+378>: mov %ebx,-0x20(%rsp)
0x000000000040081e <+382>: movss -0x20(%rsp),%xmm0
0x0000000000400824 <+388>: addss -0x10(%rsp),%xmm0
0x000000000040082a <+394>: shl $0x20,%r10
0x000000000040082e <+398>: or %rax,%r10
0x0000000000400831 <+401>: and %r9,%r10
0x0000000000400834 <+404>: movss %xmm0,-0x20(%rsp)
0x000000000040083a <+410>: mov -0x20(%rsp),%eax
0x000000000040083e <+414>: or %rax,%r10
0x0000000000400841 <+417>: mov %r11,%rax
0x0000000000400844 <+420>: mov %r10,%rdi
0x0000000000400847 <+423>: and $0xffffffff,%eax
0x000000000040084a <+426>: shr $0x20,%rdi
0x000000000040084e <+430>: mov %edi,-0x20(%rsp)
0x0000000000400852 <+434>: movss -0x20(%rsp),%xmm0
0x0000000000400858 <+440>: addss 0x4(%r14),%xmm0
0x000000000040085e <+446>: movss %xmm0,-0x20(%rsp)
0x0000000000400864 <+452>: mov -0x20(%rsp),%r11d
0x0000000000400869 <+457>: mov %r10d,-0x20(%rsp)
0x000000000040086e <+462>: movss -0x20(%rsp),%xmm0
0x0000000000400874 <+468>: addss (%r14),%xmm0
0x0000000000400879 <+473>: shl $0x20,%r11
0x000000000040087d <+477>: or %rax,%r11
0x0000000000400880 <+480>: and %r9,%r11
0x0000000000400883 <+483>: movss %xmm0,-0x20(%rsp)
0x0000000000400889 <+489>: mov -0x20(%rsp),%eax
0x000000000040088d <+493>: or %rax,%r11
0x0000000000400890 <+496>: cmp %r8,%r12
0x0000000000400893 <+499>: mov %r11,(%rcx,%r13,8)
0x0000000000400897 <+503>: mov %r12,%r13
0x000000000040089a <+506>: jne 0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
0x00000000004008a0 <+512>: pop %rbx
0x00000000004008a1 <+513>: pop %rbp
0x00000000004008a2 <+514>: pop %r12
0x00000000004008a4 <+516>: pop %r13
0x00000000004008a6 <+518>: pop %r14
0x00000000004008a8 <+520>: pop %r15
0x00000000004008aa <+522>: retq
End of assembler dump.

struct cfloat {
    cfloat(float re, float im) { ri[0] = re; ri[1] = im; }
    float ri[2];
};

cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);
}

cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);
}

Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
0x00000000004006a0 <+0>: push %rbx
0x00000000004006a1 <+1>: movq %xmm0,-0x8(%rsp)
0x00000000004006a7 <+7>: mov -0x8(%rsp),%r9
0x00000000004006ac <+12>: movq %xmm1,-0x8(%rsp)
0x00000000004006b2 <+18>: mov -0x8(%rsp),%rax
0x00000000004006b7 <+23>: mov %r9d,-0xc(%rsp)
0x00000000004006bc <+28>: shr $0x20,%r9
0x00000000004006c0 <+32>: movss -0xc(%rsp),%xmm9
0x00000000004006c7 <+39>: mov %r9d,-0xc(%rsp)
0x00000000004006cc <+44>: movss -0xc(%rsp),%xmm8
0x00000000004006d3 <+51>: mov %eax,-0xc(%rsp)
0x00000000004006d7 <+55>: shr $0x20,%rax
0x00000000004006db <+59>: movss -0xc(%rsp),%xmm7
0x00000000004006e1 <+65>: test %r8,%r8
0x00000000004006e4 <+68>: mov %eax,-0xc(%rsp)
0x00000000004006e8 <+72>: movss -0xc(%rsp),%xmm6
0x00000000004006ee <+78>: je 0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
0x00000000004006f4 <+84>: xor %eax,%eax
0x00000000004006f6 <+86>: xor %r9d,%r9d
0x00000000004006f9 <+89>: nopl 0x0(%rax)
0x0000000000400700 <+96>: shl $0x3,%rax
0x0000000000400704 <+100>: movaps %xmm7,%xmm0
0x0000000000400707 <+103>: lea (%rsi,%rax,1),%rbx
0x000000000040070b <+107>: movaps %xmm6,%xmm3
0x000000000040070e <+110>: lea (%rcx,%rax,1),%r10
0x0000000000400712 <+114>: lea (%rdx,%rax,1),%r11
0x0000000000400716 <+118>: lea (%rdi,%rax,1),%rax
0x000000000040071a <+122>: movss (%rbx),%xmm1
0x000000000040071e <+126>: add $0x1,%r9
0x0000000000400722 <+130>: movss 0x4(%rbx),%xmm5
0x0000000000400727 <+135>: mulss %xmm1,%xmm0
0x000000000040072b <+139>: mulss %xmm5,%xmm3
0x000000000040072f <+143>: movss (%rax),%xmm2
0x0000000000400733 <+147>: movaps %xmm8,%xmm10
0x0000000000400737 <+151>: mulss %xmm6,%xmm1
0x000000000040073b <+155>: movss 0x4(%rax),%xmm4
0x0000000000400740 <+160>: mulss %xmm7,%xmm5
0x0000000000400744 <+164>: mulss %xmm4,%xmm10
0x0000000000400749 <+169>: cmp %r8,%r9
0x000000000040074c <+172>: mov %r9,%rax
0x000000000040074f <+175>: subss %xmm3,%xmm0
0x0000000000400753 <+179>: movaps %xmm2,%xmm3
0x0000000000400756 <+182>: mulss %xmm9,%xmm4
0x000000000040075b <+187>: mulss %xmm9,%xmm3
0x0000000000400760 <+192>: addss %xmm5,%xmm1
0x0000000000400764 <+196>: mulss %xmm8,%xmm2
0x0000000000400769 <+201>: subss %xmm10,%xmm3
0x000000000040076e <+206>: addss %xmm4,%xmm2
0x0000000000400772 <+210>: addss %xmm3,%xmm0
0x0000000000400776 <+214>: addss %xmm2,%xmm1
0x000000000040077a <+218>: addss (%r11),%xmm0
0x000000000040077f <+223>: addss 0x4(%r11),%xmm1
0x0000000000400785 <+229>: movss %xmm0,(%r10)
0x000000000040078a <+234>: movss %xmm1,0x4(%r10)
0x0000000000400790 <+240>: jne 0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
0x0000000000400796 <+246>: pop %rbx
0x0000000000400797 <+247>: retq
End of assembler dump.
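
For anyone wanting to reproduce the two dumps, a driver along the following lines should do (this sketch is not part of the original paste: it assumes the struct and operator definitions come before benchmark in the same translation unit, and the array size nn and the constant values are arbitrary placeholders). The dumps look like gdb "disassemble" output of an optimized build (-O3 or similar); the only difference between the two versions is the layout of cfloat, so any difference between the dumps comes from that.

// Hypothetical driver, not from the original paste. nn and the constants
// below are placeholders; benchmark() computes dd[i] = aa[i]*uu + bb[i]*vv + cc[i].
#include <cstddef>
#include <vector>

int main() {
    const size_t nn = 1 << 20;                        // assumed problem size
    std::vector<cfloat> aa(nn, cfloat(1.0f, 2.0f));   // assumed input values
    std::vector<cfloat> bb(nn, cfloat(3.0f, 4.0f));
    std::vector<cfloat> cc(nn, cfloat(5.0f, 6.0f));
    std::vector<cfloat> dd(nn, cfloat(0.0f, 0.0f));   // output buffer

    benchmark(aa.data(), bb.data(), cc.data(), dd.data(),
              cfloat(0.5f, -0.5f), cfloat(-0.25f, 0.25f), nn);
    return 0;
}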