Advertisement
Guest User

scalar.c

a guest
Nov 28th, 2011
864
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.26 KB | None | 0 0
  1. // Scalar product benchmarks
  2. // Daniel Lemire, Nov. 28 2011
  3. // http://lemire.me/blog/
  4.  
  5. // gcc -funroll-loops -mno-sse2  -O3 -o scalar scalar.c
  6. // gcc -funroll-loops  -O3 -o scalar scalar.c
  7.  
  8. #include <stdio.h>
  9. #include <stdlib.h>
  10. #include <sys/time.h>
  11.  
  12. typedef unsigned int uint32;
  13. typedef unsigned long long uint64;
  14.  
  15. uint32 scalarproduct( uint32 *  v1,  uint32 *  v2,  uint32 *  endv1) {
  16.     uint32 sum = 0;
  17.     for(; v1!= endv1; ++v1, ++v2 ) {
  18.         sum+= (*v2 *  *v1);
  19.     }
  20.     return sum;
  21. }
  22.  
  23.  
  24. uint32 scalarproduct2by2( uint32 *  v1,  uint32 *  v2,  uint32 *  endv1) {
  25.     uint64 sum = 0;
  26.     for(; v1!= endv1; v1+=2, v2+=2) {
  27.         sum+= (*v2 *  *v1) + (*(v2 + 1) *   *(v1+1));
  28.     }
  29.     return sum;
  30. }
  31.  
  32. float scalarproductf( float *  v1,  float *  v2,  float *  endv1) {
  33.     float sum = 0;
  34.     for(; v1!= endv1; ++v1, ++v2 ) {
  35.         sum+= (*v2 *  *v1);
  36.     }
  37.     return sum;
  38. }
  39.  
  40.  
  41. float scalarproduct2by2f( float *  v1,  float *  v2,  float *  endv1) {
  42.     float sum = 0;
  43.     for(; v1!= endv1; v1+=2, v2+=2) {
  44.         sum+= (*v2 * *v1) + (*(v2 + 1) *  *(v1+1));
  45.     }
  46.     return sum;
  47. }
  48.  
  49.  
  50. uint64 scalarproduct64( uint64 *  v1,  uint64 *  v2,  uint64 *  endv1) {
  51.     uint64 sum = 0;
  52.     for(; v1!= endv1; ++v1, ++v2 ) {
  53.         sum+= (*v2 * *v1);
  54.     }
  55.     return sum;
  56. }
  57.  
  58.  
  59. uint64 scalarproduct2by264( uint64 *  v1,  uint64 *  v2,  uint64 *  endv1) {
  60.     uint64 sum = 0;
  61.     for(; v1!= endv1; v1+=2, v2+=2) {
  62.         sum+= (*v2 * *v1) + (*(v2 + 1) *  *(v1+1));
  63.     }
  64.     return sum;
  65. }
  66.  
  67. double scalarproductf64( double *  v1,  double *  v2,  double *  endv1) {
  68.     float sum = 0;
  69.     for(; v1!= endv1; ++v1, ++v2 ) {
  70.         sum+= (*v2 *  *v1);
  71.     }
  72.     return sum;
  73. }
  74.  
  75.  
  76. double scalarproduct2by2f64( double *  v1,  double *  v2,  double *  endv1) {
  77.     float sum = 0;
  78.     for(; v1!= endv1; v1+=2, v2+=2) {
  79.         sum+= (*v2 * *v1) + (*(v2 + 1) *  *(v1+1));
  80.     }
  81.     return sum;
  82. }
  83.  
  84.  
  85. #if (defined (__i386__) || defined( __x86_64__ ))
  86.  
  87. typedef unsigned long long ticks;
  88.  
  89. static __inline__ ticks start (void) {
  90.   unsigned cycles_low, cycles_high;
  91.   asm volatile ("CPUID\n\t"
  92.         "RDTSC\n\t"
  93.         "mov %%edx, %0\n\t"
  94.         "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
  95.         "%rax", "%rbx", "%rcx", "%rdx");
  96.   return ((ticks)cycles_high << 32) | cycles_low;
  97. }
  98.  
  99. static __inline__ ticks stop (void) {
  100.   unsigned cycles_low, cycles_high;
  101.   asm volatile("RDTSCP\n\t"
  102.            "mov %%edx, %0\n\t"
  103.            "mov %%eax, %1\n\t"
  104.            "CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
  105.            "%rbx", "%rcx", "%rdx");
  106.   return ((ticks)cycles_high << 32) | cycles_low;
  107. }
  108.  
  109. #else
  110. #error Unknown architecture
  111. #endif
  112.  
  113.  
  114. int main(int argc, char **argv) {
  115.     int i,j;
  116.     uint64 sumToFoolCompiler = 0;
  117.     double sumToFoolCompilerf = 0;
  118.     uint64 bef,aft;
  119.     int N = 2048;
  120.     uint32 v1[2*N];
  121.     uint32 v2[2*N];
  122.     int trials = 100000;
  123.     for (i=0; i < 2*N; ++i) {
  124.         v1[i]= rand();
  125.         v2[i]= rand();
  126.     }
  127.     bef = start();
  128.     for(j=0; j < trials; ++j)
  129.       sumToFoolCompiler += scalarproduct( &v1[0],&v2[0],&v1[0] + N);
  130.     aft = stop();
  131.     printf("uint32 cycle count per element = %f  / ignore this: %llu \n",(aft-bef)*1.0/(N*trials), sumToFoolCompiler);
  132.    
  133.     bef = start();
  134.     for(j=0; j < trials; ++j)
  135.       sumToFoolCompiler += scalarproduct64( (uint64 *) &v1[0],(uint64 *) &v2[0],(uint64 *) &v1[2*N]);
  136.     aft = stop();
  137.     printf("uint64 cycle count per element = %f  / ignore this: %llu \n",(aft-bef)*1.0/(N*trials), sumToFoolCompiler);
  138.  
  139.     bef = start();
  140.     for(j=0; j < trials; ++j)
  141.       sumToFoolCompiler += scalarproduct2by2( &v1[0],&v2[0],&v1[0] + N);
  142.     aft = stop();
  143.     printf("uint32 2x2 cycle count per element = %f  / ignore this: %llu \n",(aft-bef)*1.0/(N*trials), sumToFoolCompiler);
  144.  
  145.     bef = start();
  146.     for(j=0; j < trials; ++j)
  147.       sumToFoolCompiler += scalarproduct2by264( (uint64 *) &v1[0],(uint64 *) &v2[0],(uint64 *)&v1[2*N] );
  148.     aft = stop();
  149.     printf("uint64 2x2 cycle count per element = %f  / ignore this: %llu \n",(aft-bef)*1.0/(N*trials), sumToFoolCompiler);
  150.  
  151.  
  152.     bef = start();
  153.     for(j=0; j < trials; ++j)
  154.       sumToFoolCompilerf += scalarproductf((float *) &v1[0],(float *) &v2[0],(float *) &v1[N]);
  155.     aft = stop();
  156.     printf("float cycle count per element = %f  / ignore this: %f \n",(aft-bef)*1.0/(N*trials), sumToFoolCompilerf);
  157.  
  158.     bef = start();
  159.     for(j=0; j < trials; ++j)
  160.       sumToFoolCompilerf += scalarproduct2by2f( (float *) &v1[0],(float *) &v2[0],(float *) &v1[N]);
  161.     aft = stop();
  162.     printf("float 2x2 cycle count per element = %f  / ignore this: %f \n",(aft-bef)*1.0/(N*trials), sumToFoolCompilerf);
  163.  
  164.  
  165.  
  166.     bef = start();
  167.     for(j=0; j < trials; ++j)
  168.       sumToFoolCompilerf += scalarproductf64((double *) &v1[0],(double *) &v2[0],(double *) &v1[2*N]);
  169.     aft = stop();
  170.     printf("double cycle count per element = %f  / ignore this: %f \n",(aft-bef)*1.0/(N*trials), sumToFoolCompilerf);
  171.  
  172.  
  173.  
  174.     bef = start();
  175.     for(j=0; j < trials; ++j)
  176.       sumToFoolCompilerf += scalarproduct2by2f64( (double *) &v1[0],(double *) &v2[0],(double *) &v1[2*N]);
  177.     aft = stop();
  178.     printf("double 2x2 cycle count per element = %f  / ignore this: %f \n",(aft-bef)*1.0/(N*trials), sumToFoolCompilerf);
  179.    
  180.    
  181.  
  182.  }
  183.  
  184.  
  185.  
  186.  
  187.  
  188.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement