Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- typedef unsigned long u64;
- void mul(u64 f[8], const u64 a[4], const u64 b[4]) { // f[0..7] = a[0..3] * b[0..3]: full product of two numbers of four 64-bit limbs, least significant limb first
- // Operands: a0..a3, b0..b3 memory inputs; f0..f3 memory outputs; f4..f7 register outputs; t0..t2 scratch; mu = rdx, the implicit mulx multiplicand
- u64 t0, t1, t2, mu; // scratch for the asm only; their values are not read afterwards
- asm volatile ( // schoolbook multiply, one row per a[i]; mulx and mov leave the flags alone, so each add/adc carry chain survives them
- "movq %[a0], %[mu];" // row 0: rdx = a[0]
- "mulxq %[b0], %[f4], %[t0]; movq %[f4], %[f0];" // f4 = lo(a0*b0), saved to f[0]; t0 = hi
- "mulxq %[b1], %[f5], %[t1]; addq %[t0], %[f5];" // f5 = lo(a0*b1) + t0, starts the carry chain
- "mulxq %[b2], %[f6], %[t2]; adcq %[t1], %[f6];" // f6 = lo(a0*b2) + t1 + carry
- "mulxq %[b3], %[f7], %[f4]; adcq %[t2], %[f7]; adcq $0, %[f4];" // f7 = lo(a0*b3) + t2 + carry; f4 = hi(a0*b3) + carry; limbs 1..4 now live in f5,f6,f7,f4
- "movq %[a1], %[mu];" // row 1: rdx = a[1]
- "mulxq %[b0], %[t2], %[t0]; addq %[t2], %[f5]; movq %[f5], %[f1];" // f5 += lo(a1*b0) and is final, saved to f[1]; t0 = hi
- "mulxq %[b1], %[t2], %[t1]; adcq %[t2], %[f6];" // f6 += lo(a1*b1) + carry; t1 = hi
- "mulxq %[b2], %[f5], %[t2]; adcq %[f5], %[f7];" // f5 is free again and holds lo(a1*b2); f7 += it + carry; t2 = hi
- "mulxq %[b3], %[mu], %[f5]; adcq %[mu], %[f4]; adcq $0, %[f5];" // mu is reused for lo(a1*b3) now that a1 is consumed; f4 += it + carry; f5 = hi(a1*b3) + carry
- "addq %[t0], %[f6]; adcq %[t1], %[f7]; adcq %[t2], %[f4]; adcq $0, %[f5];" // second pass adds the high halves t0,t1,t2; limbs 2..5 now live in f6,f7,f4,f5
- "movq %[a2], %[mu];" // row 2: rdx = a[2]
- "mulxq %[b0], %[t2], %[t0]; addq %[t2], %[f6]; movq %[f6], %[f2];" // f6 += lo(a2*b0) and is final, saved to f[2]; t0 = hi
- "mulxq %[b1], %[t2], %[t1]; adcq %[t2], %[f7];" // f7 += lo(a2*b1) + carry; t1 = hi
- "mulxq %[b2], %[f6], %[t2]; adcq %[f6], %[f4];" // f6 is free again and holds lo(a2*b2); f4 += it + carry; t2 = hi
- "mulxq %[b3], %[mu], %[f6]; adcq %[mu], %[f5]; adcq $0, %[f6];" // f5 += lo(a2*b3) + carry; f6 = hi(a2*b3) + carry
- "addq %[t0], %[f7]; adcq %[t1], %[f4]; adcq %[t2], %[f5]; adcq $0, %[f6];" // add the high halves; limbs 3..6 now live in f7,f4,f5,f6
- "movq %[a3], %[mu];" // row 3: rdx = a[3]
- "mulxq %[b0], %[t2], %[t0]; addq %[t2], %[f7]; movq %[f7], %[f3];" // f7 += lo(a3*b0) and is final, saved to f[3]; t0 = hi
- "mulxq %[b1], %[t2], %[t1]; adcq %[t2], %[f4];" // f4 += lo(a3*b1) + carry; t1 = hi
- "mulxq %[b2], %[f7], %[t2]; adcq %[f7], %[f5];" // f7 is free again and holds lo(a3*b2); f5 += it + carry; t2 = hi
- "mulxq %[b3], %[mu], %[f7]; adcq %[mu], %[f6]; adcq $0, %[f7];" // f6 += lo(a3*b3) + carry; f7 = hi(a3*b3) + carry
- "addq %[t0], %[f4]; adcq %[t1], %[f5]; adcq %[t2], %[f6]; adcq $0, %[f7];" // add the high halves; f4..f7 now hold limbs 4..7 of the product
- : // output operands
- [f0]"=&m"(f[0]), [f1]"=&m"(f[1]), [f2]"=&m"(f[2]), [f3]"=&m"(f[3]), // low limbs are stored straight to memory from inside the asm
- [f4]"=&r"(f[4]), [f5]"=&r"(f[5]), [f6]"=&r"(f[6]), [f7]"=&r"(f[7]), // high limbs are register outputs bound to f[4..7]
- [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [mu]"=&d"(mu): // scratch registers; constraint d pins mu to rdx
- [a0]"m"(a[0]), [a1]"m"(a[1]), [a2]"m"(a[2]), [a3]"m"(a[3]), // inputs are read from memory; NOTE(review): f[0..3] are stored before every read of a and b has happened, so f overlapping an input looks unsafe - confirm callers never alias
- [b0]"m"(b[0]), [b1]"m"(b[1]), [b2]"m"(b[2]), [b3]"m"(b[3]) // each b limb is re-read once per row
- );
- }
- void mulX(u64 f[8], const u64 a[4], const u64 b[4]) { // f[0..7] = a[0..3] * b[0..3], least significant limb first, using two interleaved carry chains (adox on OF, adcx on CF)
- // Operands: a0..a3, b0..b3 memory inputs; f0..f7 memory outputs; t0,t1 mulx results; s0..s3 rotating four-limb accumulator; mu = rdx, the implicit mulx multiplicand
- u64 t0, t1, mu, s0, s1, s2, s3; // scratch for the asm only; their values are not read afterwards
- asm volatile ( // row 0 uses plain add/adc; rows 1..3 add low halves on the OF chain and high halves on the CF chain
- "movq %[a0], %[mu];" // row 0: rdx = a[0]
- "mulxq %[b0], %[t0], %[s1]; movq %[t0], %[f0];" // t0 = lo(a0*b0), saved to f[0]; s1 = hi
- "mulxq %[b1], %[t0], %[s2]; addq %[t0], %[s1];" // s1 += lo(a0*b1), starts the carry chain; s2 = hi
- "mulxq %[b2], %[t0], %[s3]; adcq %[t0], %[s2];" // s2 += lo(a0*b2) + carry; s3 = hi
- "mulxq %[b3], %[t0], %[s0]; adcq %[t0], %[s3]; adcq $0, %[s0]; xorl %k[t1], %k[t1];" // s3 += lo(a0*b3) + carry; s0 = hi(a0*b3) + carry; xorl then clears CF and OF for the two chains; limbs 1..4 live in s1,s2,s3,s0
- "movq %[a1], %[mu];" // row 1: rdx = a[1]
- "mulxq %[b0], %[t0], %[t1]; adoxq %[t0], %[s1]; adcxq %[t1], %[s2]; movq %[s1], %[f1];" // s1 += lo(a1*b0) on OF and is final, saved to f[1]; s2 += hi on CF
- "mulxq %[b1], %[t0], %[t1]; adoxq %[t0], %[s2]; adcxq %[t1], %[s3];" // s2 += lo(a1*b1) on OF; s3 += hi on CF
- "mulxq %[b2], %[t0], %[t1]; adoxq %[t0], %[s3]; adcxq %[t1], %[s0];" // s3 += lo(a1*b2) on OF; s0 += hi on CF
- "mulxq %[b3], %[t0], %[t1]; adoxq %[t0], %[s0]; movl $0, %k[s1];" // s0 += lo(a1*b3) on OF; s1 is zeroed with movl because xor would reset the live flags
- "adoxq %[s1], %[s1]; adcxq %[t1], %[s1];" // s1 = OF, then s1 += hi(a1*b3) + CF: the new top limb; limbs 2..5 live in s2,s3,s0,s1; the next row does not re-clear the flags, so it relies on this add never carrying out
- "movq %[a2], %[mu];" // row 2: rdx = a[2]
- "mulxq %[b0], %[t0], %[t1]; adoxq %[t0], %[s2]; adcxq %[t1], %[s3]; movq %[s2], %[f2];" // s2 += lo(a2*b0) on OF and is final, saved to f[2]; s3 += hi on CF
- "mulxq %[b1], %[t0], %[t1]; adoxq %[t0], %[s3]; adcxq %[t1], %[s0];" // s3 += lo(a2*b1) on OF; s0 += hi on CF
- "mulxq %[b2], %[t0], %[t1]; adoxq %[t0], %[s0]; adcxq %[t1], %[s1];" // s0 += lo(a2*b2) on OF; s1 += hi on CF
- "mulxq %[b3], %[t0], %[t1]; adoxq %[t0], %[s1]; movl $0, %k[s2];" // s1 += lo(a2*b3) on OF; s2 is zeroed without touching the flags
- "adoxq %[s2], %[s2]; adcxq %[t1], %[s2];" // s2 = OF, then s2 += hi(a2*b3) + CF; limbs 3..6 live in s3,s0,s1,s2
- "movq %[a3], %[mu];" // row 3: rdx = a[3]
- "mulxq %[b0], %[t0], %[t1]; adoxq %[t0], %[s3]; adcxq %[t1], %[s0]; movq %[s3], %[f3];" // s3 += lo(a3*b0) on OF and is final, saved to f[3]; s0 += hi on CF
- "mulxq %[b1], %[t0], %[t1]; adoxq %[t0], %[s0]; adcxq %[t1], %[s1]; movq %[s0], %[f4];" // s0 += lo(a3*b1) on OF and is final, saved to f[4]; s1 += hi on CF
- "mulxq %[b2], %[t0], %[t1]; adoxq %[t0], %[s1]; adcxq %[t1], %[s2]; movq %[s1], %[f5];" // s1 += lo(a3*b2) on OF and is final, saved to f[5]; s2 += hi on CF
- "mulxq %[b3], %[t0], %[t1]; adoxq %[t0], %[s2]; movl $0, %k[s3]; movq %[s2], %[f6];" // s2 += lo(a3*b3) on OF and is final, saved to f[6]; s3 is zeroed without touching the flags
- "adoxq %[s3], %[s3]; adcxq %[t1], %[s3];" // s3 = OF, then s3 += hi(a3*b3) + CF: the top limb
- "movq %[s3], %[f7];": // top limb saved to f[7]; output operands follow
- [f0]"=&m"(f[0]), [f1]"=&m"(f[1]), [f2]"=&m"(f[2]), [f3]"=&m"(f[3]), // all eight result limbs are stored to memory from inside the asm
- [f4]"=&m"(f[4]), [f5]"=&m"(f[5]), [f6]"=&m"(f[6]), [f7]"=&m"(f[7]),
- [t0]"=&r"(t0), [t1]"=&r"(t1), [mu]"=&d"(mu), // mulx result registers; constraint d pins mu to rdx
- [s0]"=&r"(s0), [s1]"=&r"(s1), [s2]"=&r"(s2), [s3]"=&r"(s3): // accumulator registers
- [a0]"m"(a[0]), [a1]"m"(a[1]), [a2]"m"(a[2]), [a3]"m"(a[3]), // inputs are read from memory; NOTE(review): f limbs are stored before every read of a and b has happened, so f overlapping an input looks unsafe - confirm callers never alias
- [b0]"m"(b[0]), [b1]"m"(b[1]), [b2]"m"(b[2]), [b3]"m"(b[3]) // each b limb is re-read once per row
- );
- }
- int main() {
- // Benchmark driver: multiplies the same two fixed operands over and over.
- // mulCall is not defined in this file; the build notes below supply it on
- // the command line (-DmulCall=mul or -DmulCall=mulX).
- u64 lhs[4] = {1, 2, 3, 4};
- u64 rhs[4] = {5, 4, 3, 2};
- u64 product[8];
- // The counter is bumped before each test, so the loop stops only when the
- // unsigned value wraps around to zero.
- unsigned iter = 0;
- while (++iter != 0) {
- mulCall(product, lhs, rhs);
- }
- }
- /*
- $ gcc -DmulCall=mul -O2 66.c && sudo perf stat ./a.out # X
- Performance counter stats for './a.out':
- 27,690.80 msec task-clock # 0.998 CPUs utilized
- 3,247 context-switches # 117.259 /sec
- 586 cpu-migrations # 21.162 /sec
- 45 page-faults # 1.625 /sec
- 106,431,860,062 cycles # 3.844 GHz (83.31%)
- 428,297,182 stalled-cycles-frontend # 0.40% frontend cycles idle (83.34%)
- 74,221,128,251 stalled-cycles-backend # 69.74% backend cycles idle (83.33%)
- 301,156,490,496 instructions # 2.83 insn per cycle
- # 0.25 stalled cycles per insn (83.34%)
- 12,946,750,221 branches # 467.547 M/sec (83.33%)
- 2,007,761 branch-misses # 0.02% of all branches (83.34%)
- 27.738868676 seconds time elapsed
- 27.680633000 seconds user
- 0.011993000 seconds sys
- $ gcc -DmulCall=mulX -O2 66.c && sudo perf stat ./a.out # X
- Performance counter stats for './a.out':
- 28,449.67 msec task-clock # 0.998 CPUs utilized
- 3,481 context-switches # 122.356 /sec
- 533 cpu-migrations # 18.735 /sec
- 47 page-faults # 1.652 /sec
- 110,006,534,480 cycles # 3.867 GHz (83.34%)
- 565,459,105 stalled-cycles-frontend # 0.51% frontend cycles idle (83.31%)
- 73,662,756,220 stalled-cycles-backend # 66.96% backend cycles idle (83.33%)
- 309,508,804,581 instructions # 2.81 insn per cycle
- # 0.24 stalled cycles per insn (83.34%)
- 12,955,065,685 branches # 455.368 M/sec (83.33%)
- 2,288,864 branch-misses # 0.02% of all branches (83.35%)
- 28.498440635 seconds time elapsed
- 28.398338000 seconds user
- 0.051945000 seconds sys
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement