Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Dot-product kernel (MSVC 32-bit inline asm, AVX + FMA3).
// Accumulates 8 lane-wise partial sums of a[i] * bb[i] into ymm3 and stores
// them to c[0..7]; the caller must still reduce the 8 lanes to one scalar.
// NOTE(review): assumes K8 is the float count rounded to a multiple of 8 and
// that a, bb, c each hold at least K8 / 8 floats respectively -- confirm
// against the declarations, which are outside this chunk.
// NOTE(review): no vzeroupper before leaving AVX code -- presumably the
// surrounding build is VEX-everywhere; verify to avoid AVX-SSE transition
// penalties in the compiler-generated code that follows.
_asm {
    mov esi, K8;
    sub esi, 8;                 // last valid starting element index (K8 - 8)
    shl esi, 2;                 // convert elements -> byte offset (x4 for float)
    xor edi, edi;               // edi = current byte offset into a / bb
    mov edx, a;                 // edx = base of vector a
    mov ebx, bb;                // ebx = base of vector bb
    vxorps ymm3, ymm3, ymm3;    // zero the 8 accumulator lanes
Lrep:
    cmp edi, esi;
    jg Lexit;                   // signed compare: loop while edi <= (K8-8)*4;
                                // exits immediately if K8 < 8
    vmovups ymm0, ymmword ptr[edx + edi];             // 8 floats of a
    vfmadd231ps ymm3, ymm0, ymmword ptr[ebx + edi];   // ymm3 += a[i..i+7] * bb[i..i+7]
    add edi, 32;                // advance 8 floats (32 bytes)
    jmp Lrep;
Lexit:
    vmovups ymmword ptr[c], ymm3;   // c[0..7] = the 8 partial sums
}
// Single-precision GEMM update: C(MxN) += A(MxK) * B(KxN), row-major.
// Strategy: for each (k, i) pair, broadcast the scalar A[i*K + k] across a
// YMM register and FMA it against row k of B, 32 floats per j-iteration
// (four manually unrolled 8-float AVX blocks), accumulating into row i of C.
// A scalar loop handles the N % 32 tail columns.
// NOTE(review): each _asm block reloads pC/pA/pB from memory because MSVC
// inline asm cannot carry register values across separate _asm blocks.
// NOTE(review): loop counters k, i, j and pointers pA, pB, pC are declared
// outside this chunk -- presumably int and float*; confirm.
pA = A;   // NOTE(review): dead store -- pA is unconditionally reassigned below
for (k = 0; k < K; k++) {
    pC = C;   // restart at &C[0][0]; pC advances exactly N floats per i-iteration
    for (i = 0; i < M; i++) {
        pA = A + i * K + k;   // &A[i][k]: the single scalar broadcast for this row
        pB = B + k * N;       // &B[k][0]: rewind row k of B for each i
        // Main loop: each iteration updates 4 x 8 = 32 consecutive floats of C[i][*].
        for (j = N / 32; j > 0; j--) {
            // Unrolled block 1 of 4: C[i][c..c+7] += A[i][k] * B[k][c..c+7]
            _asm {
                mov eax, pC;
                mov ebx, pA;
                mov ecx, pB;
                vmovups ymm0, ymmword ptr[eax];   // load 8 floats of C
                vmovss xmm1, dword ptr[ebx];      // load scalar A[i][k]
                vbroadcastss ymm4, xmm1;          // splat scalar across 8 lanes
                vmovups ymm2, ymmword ptr[ecx];   // load 8 floats of B[k][*]
                vfmadd231ps ymm0, ymm4, ymm2;     // ymm0 += ymm4 * ymm2
                vmovups ymmword ptr[eax], ymm0;   // store back to C
            }
            pC += 8; pB += 8;
            // Unrolled block 2 of 4 (identical pattern, next 8 columns).
            _asm {
                mov eax, pC;
                mov ebx, pA;
                mov ecx, pB;
                vmovups ymm0, ymmword ptr[eax];
                vmovss xmm1, dword ptr[ebx];
                vbroadcastss ymm4, xmm1;
                vmovups ymm2, ymmword ptr[ecx];
                vfmadd231ps ymm0, ymm4, ymm2;
                vmovups ymmword ptr[eax], ymm0;
            }
            pC += 8; pB += 8;
            // Unrolled block 3 of 4.
            _asm {
                mov eax, pC;
                mov ebx, pA;
                mov ecx, pB;
                vmovups ymm0, ymmword ptr[eax];
                vmovss xmm1, dword ptr[ebx];
                vbroadcastss ymm4, xmm1;
                vmovups ymm2, ymmword ptr[ecx];
                vfmadd231ps ymm0, ymm4, ymm2;
                vmovups ymmword ptr[eax], ymm0;
            }
            pC += 8; pB += 8;
            // Unrolled block 4 of 4.
            _asm {
                mov eax, pC;
                mov ebx, pA;
                mov ecx, pB;
                vmovups ymm0, ymmword ptr[eax];
                vmovss xmm1, dword ptr[ebx];
                vbroadcastss ymm4, xmm1;
                vmovups ymm2, ymmword ptr[ecx];
                vfmadd231ps ymm0, ymm4, ymm2;
                vmovups ymmword ptr[eax], ymm0;
            }
            pC += 8; pB += 8;
        }
        // Scalar tail: the remaining N % 32 columns of row i.
        // (pA still points at A[i][k]; pB/pC continue where the vector loop left off.)
        for (j = N / 32 * 32; j < N; j++) {
            *pC += *pA * *pB;
            pC += 1; pB += 1;
        }
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement