Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.75 KB | None | 0 0
  1. _asm {
  2. mov esi, K8;
  3. sub esi, 8;
  4. shl esi, 2;
  5. xor edi, edi;
  6. mov edx, a;
  7. mov ebx, bb;
  8. vxorps ymm3, ymm3, ymm3;
  9. Lrep:
  10. cmp edi, esi;
  11. jg Lexit;
  12. vmovups ymm0, ymmword ptr[edx + edi];
  13. vfmadd231ps ymm3, ymm0, ymmword ptr[ebx + edi];
  14. add edi, 32;
  15. jmp Lrep;
  16. Lexit:
  17. vmovups ymmword ptr[c], ymm3;
  18. }
  19.  
  20. pA = A;
  21. for (k = 0; k < K; k++) {
  22. pC = C;
  23. for (i = 0; i < M; i++) {
  24. pA = A + i * K + k;
  25. pB = B + k * N;
  26. for (j = N / 32; j > 0; j--) {
  27. _asm {
  28. mov eax, pC;
  29. mov ebx, pA;
  30. mov ecx, pB;
  31. vmovups ymm0, ymmword ptr[eax];
  32. vmovss xmm1, dword ptr[ebx];
  33. vbroadcastss ymm4, xmm1;
  34. vmovups ymm2, ymmword ptr[ecx];
  35. vfmadd231ps ymm0, ymm4, ymm2;
  36. vmovups ymmword ptr[eax], ymm0;
  37. }
  38. pC += 8; pB += 8;
  39. _asm {
  40. mov eax, pC;
  41. mov ebx, pA;
  42. mov ecx, pB;
  43. vmovups ymm0, ymmword ptr[eax];
  44. vmovss xmm1, dword ptr[ebx];
  45. vbroadcastss ymm4, xmm1;
  46. vmovups ymm2, ymmword ptr[ecx];
  47. vfmadd231ps ymm0, ymm4, ymm2;
  48. vmovups ymmword ptr[eax], ymm0;
  49. }
  50. pC += 8; pB += 8;
  51. _asm {
  52. mov eax, pC;
  53. mov ebx, pA;
  54. mov ecx, pB;
  55. vmovups ymm0, ymmword ptr[eax];
  56. vmovss xmm1, dword ptr[ebx];
  57. vbroadcastss ymm4, xmm1;
  58. vmovups ymm2, ymmword ptr[ecx];
  59. vfmadd231ps ymm0, ymm4, ymm2;
  60. vmovups ymmword ptr[eax], ymm0;
  61. }
  62. pC += 8; pB += 8;
  63. _asm {
  64. mov eax, pC;
  65. mov ebx, pA;
  66. mov ecx, pB;
  67. vmovups ymm0, ymmword ptr[eax];
  68. vmovss xmm1, dword ptr[ebx];
  69. vbroadcastss ymm4, xmm1;
  70. vmovups ymm2, ymmword ptr[ecx];
  71. vfmadd231ps ymm0, ymm4, ymm2;
  72. vmovups ymmword ptr[eax], ymm0;
  73. }
  74. pC += 8; pB += 8;
  75. }
  76. for (j = N / 32 * 32; j < N; j++) {
  77. *pC += *pA * *pB;
  78. pC += 1; pB += 1;
  79. }
  80. }
  81. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement