Advertisement
Guest User

Cudasosat

a guest
Feb 16th, 2019
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 2.29 KB | None | 0 0
  #include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/time.h>
  3.  
  // CUDA kernel: element-wise vector addition, c[i] = a[i] + b[i] for 0 <= i < n.
  //
  //   a, b : device pointers to the input vectors (indexed from 0 to n-1)
  //   c    : device pointer to the output vector (indexed from 0 to n-1)
  //   n    : number of elements; when n <= 0 no element is touched
  //
  // Expects a 1D launch (only the .x components of the built-in index
  // variables are used). Each thread starts at its global index and then
  // advances by the total number of launched threads, so every element is
  // processed regardless of the grid/block size chosen by the caller.
  __global__ void vecAdd(double *a, double *b, double *c, int n)
  {
      // 64-bit arithmetic: blockIdx.x * blockDim.x is an unsigned 32-bit product
      // that can wrap on very large launches and then pass the bounds test.
      const long long stride = (long long)gridDim.x * blockDim.x;

      for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
           i < n;
           i += stride)
          c[i] = a[i] + b[i];
  }
  14.  
  // Evaluate a CUDA runtime call; on failure print the source location and the
  // runtime's error string to stderr, then terminate the process.
  #define CUDA_CHECK(call)                                                     \
      do {                                                                     \
          cudaError_t cudaCheckErr_ = (call);                                  \
          if (cudaCheckErr_ != cudaSuccess) {                                  \
              fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                      cudaGetErrorString(cudaCheckErr_));                      \
              exit(EXIT_FAILURE);                                              \
          }                                                                    \
      } while (0)

  // Adds two n-element vectors on the GPU with vecAdd, prints the wall-clock
  // time of the kernel and the mean of the result vector.
  //
  // The inputs are sin(i)^2 and cos(i)^2, so every output element should be 1
  // and the printed mean should equal 1 within floating-point error.
  //
  // Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
  // CUDA runtime call fails.
  int main( int argc, char* argv[] )
  {
      // Command-line arguments are not used.
      (void)argc;
      (void)argv;

      // Size of vectors
      const int n = 100000;

      // Size, in bytes, of each vector
      const size_t bytes = (size_t)n * sizeof(double);

      // Allocate memory for each vector on host
      double *h_a = (double*)malloc(bytes);
      double *h_b = (double*)malloc(bytes);
      double *h_c = (double*)malloc(bytes);
      if (h_a == NULL || h_b == NULL || h_c == NULL) {
          fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
          free(h_a);
          free(h_b);
          free(h_c);
          return EXIT_FAILURE;
      }

      // Allocate memory for each vector on GPU
      double *d_a = NULL;
      double *d_b = NULL;
      double *d_c = NULL;
      CUDA_CHECK(cudaMalloc(&d_a, bytes));
      CUDA_CHECK(cudaMalloc(&d_b, bytes));
      CUDA_CHECK(cudaMalloc(&d_c, bytes));

      // Initialize vectors on host
      for (int i = 0; i < n; i++) {
          h_a[i] = sin(i)*sin(i);
          h_b[i] = cos(i)*cos(i);
      }

      // Copy host vectors to device
      CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

      // Number of threads in each thread block: a multiple of the 32-lane warp
      // so that no lanes of a warp are left idle.
      const int blockSize = 256;

      // Number of thread blocks in grid (integer ceil-division; avoids the
      // rounding of a float conversion for large n)
      const int gridSize = (n + blockSize - 1) / blockSize;

      // Execute the kernel and time it. A kernel launch returns immediately,
      // so the device must be synchronized before taking the end timestamp;
      // otherwise only the launch overhead is measured.
      struct timeval start, end;
      gettimeofday(&start, NULL);
      vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
      CUDA_CHECK(cudaGetLastError());       // invalid launch configuration
      CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
      gettimeofday(&end, NULL);

      const double en = end.tv_sec + end.tv_usec * 1e-6;
      const double st = start.tv_sec + start.tv_usec * 1e-6;
      printf("func time: %f\n", en - st);

      // Copy array back to host
      CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

      // Sum up vector c and print result divided by n, this should equal 1 within error
      double sum = 0;
      for (int i = 0; i < n; i++)
          sum += h_c[i];
      printf("final result: %f\n", sum/n);

      // Release device memory
      CUDA_CHECK(cudaFree(d_a));
      CUDA_CHECK(cudaFree(d_b));
      CUDA_CHECK(cudaFree(d_c));

      // Release host memory
      free(h_a);
      free(h_b);
      free(h_c);

      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement