Advertisement
Guest User

Untitled

a guest
Apr 16th, 2013
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 3.78 KB | None | 0 0
  1. [code]#include <stdio.h>
  2. #include <stdlib.h>
  3. #include <iostream>
  4.  
  5. //cuda
  6. #include <helper_cuda.h>
  7.  
  8. //boost
  9. #include <boost/thread.hpp>
  10.  
  // NOTE(review): SIZE_MAX is also the name of a standard macro from
  // <stdint.h>/<cstdint>, which the headers included above may already have
  // defined. Drop any earlier definition first so the line below is not a
  // conflicting macro redefinition. Nothing in this file reads the value;
  // consider renaming it if it is ever put to use.
  #ifdef SIZE_MAX
  #undef SIZE_MAX
  #endif
  #define SIZE_MAX 9000
  12.  
  // Host-side worker entry points; main() runs each one on its own boost::thread.
  void role1();
  void role2();

  // Device kernels, defined at the end of the file. Each one stores a single
  // value computed from a and b into one element of fpDeviceBuffer per thread:
  //   calc  -> a && b      calc2 -> a * b
  //   calc3 -> a || b      calc4 -> a + b
  __global__ void calc (double* fpDeviceBuffer, double a, double b);
  __global__ void calc2(double* fpDeviceBuffer, double a, double b);
  __global__ void calc3(double* fpDeviceBuffer, double a, double b);
  __global__ void calc4(double* fpDeviceBuffer, double a, double b);
  20.  
  21.  
  22.  
  23.  
  // ---- File-scope state shared by main(), role1() and role2() ----

  // Scalar operands handed to every kernel launch.
  double a = 6708.0;
  double b = 0.25;

  // Buffer pair used by main() and role1(): pinned host side + device side.
  double *fpHostBuffer = NULL;
  double *fpDeviceBuffer = NULL;

  // Second buffer pair; role2() runs its kernels on the device side.
  double *fpHostBuffer2 = NULL;
  double *fpDeviceBuffer2 = NULL;

  // Not referenced anywhere else in this file.
  float kernel_time = 1;

  // Number of doubles in each buffer.
  int size = 512*128;

  // stream is used by role1(), stream2 by role2(); stream3 and stream4 are
  // only created and destroyed in main().
  cudaStream_t stream;
  cudaStream_t stream2;
  cudaStream_t stream3;
  cudaStream_t stream4;

  // Launch geometry: 512 threads per block and size/512 blocks, i.e. one
  // thread per buffer element. The y and z extents default to 1.
  dim3 blocks(size/512);
  dim3 threads(512);

  // Device ordinal and properties, filled in by main().
  int cuda_device = 0;
  cudaDeviceProp deviceProp;
  37.  
  /**
   * Launches calc, calc2, calc3 and calc4 back to back on the default stream
   * against fpDeviceBuffer, using the file-scope blocks/threads geometry.
   *
   * A kernel launch returns no status of its own, so cudaGetLastError() is
   * queried after each launch to surface launch failures immediately.
   */
  static void launchKernelsOnDefaultStream()
  {
      calc<<<blocks,threads>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc2<<<blocks,threads>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc3<<<blocks,threads>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc4<<<blocks,threads>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
  }

  /**
   * Program entry point.
   *
   * Steps, in order:
   *   1. allocate two pinned host buffers and two device buffers of
   *      size doubles each;
   *   2. query the current device and its properties;
   *   3. copy host -> device, run the four kernels on the default stream,
   *      copy device -> host;
   *   4. run the four kernels on the default stream a second time;
   *   5. create four streams, run role1() and role2() on two boost threads
   *      and wait for both;
   *   6. destroy the streams and release every buffer.
   *
   * Every CUDA runtime call goes through checkCudaErrors, including the
   * stream create/destroy calls whose results were previously discarded.
   *
   * @param argc unused
   * @param argv unused
   * @return EXIT_SUCCESS when the function runs to completion
   */
  int main(int argc, char* argv[])
  {
      //page-locked memory
      checkCudaErrors(cudaMallocHost(&fpHostBuffer,size*sizeof(double)));
      checkCudaErrors(cudaMallocHost(&fpHostBuffer2,size*sizeof(double)));

      //Device memory
      checkCudaErrors(cudaMalloc(&fpDeviceBuffer,size*sizeof(double)));
      checkCudaErrors(cudaMalloc(&fpDeviceBuffer2,size*sizeof(double)));

      // Get device properties
      checkCudaErrors(cudaGetDevice(&cuda_device));
      checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

      // Synchronous round trip on the default stream.
      checkCudaErrors( cudaMemcpy ( fpDeviceBuffer, fpHostBuffer, size*sizeof(double), cudaMemcpyHostToDevice) );
      launchKernelsOnDefaultStream();
      checkCudaErrors( cudaMemcpy ( fpHostBuffer, fpDeviceBuffer, size*sizeof(double), cudaMemcpyDeviceToHost) );

      // Second kernel batch on the default stream, without any copies.
      launchKernelsOnDefaultStream();

      checkCudaErrors(cudaStreamCreate(&stream));
      checkCudaErrors(cudaStreamCreate(&stream2));
      checkCudaErrors(cudaStreamCreate(&stream3));
      checkCudaErrors(cudaStreamCreate(&stream4));

      // role1 works on stream/fpDeviceBuffer, role2 on stream2/fpDeviceBuffer2.
      boost::thread t1(role1);
      boost::thread t2(role2);
      t1.join();
      t2.join();

      checkCudaErrors(cudaStreamDestroy(stream));
      checkCudaErrors(cudaStreamDestroy(stream2));
      checkCudaErrors(cudaStreamDestroy(stream3));
      checkCudaErrors(cudaStreamDestroy(stream4));

      checkCudaErrors(cudaFree(fpDeviceBuffer));
      checkCudaErrors(cudaFreeHost(fpHostBuffer));
      checkCudaErrors(cudaFree(fpDeviceBuffer2));
      checkCudaErrors(cudaFreeHost(fpHostBuffer2));

      return(EXIT_SUCCESS);
  }
  88.  
  /**
   * Worker body for the first host thread.
   *
   * Queues, all on `stream`: an asynchronous host -> device copy of
   * fpHostBuffer into fpDeviceBuffer, the four kernels on fpDeviceBuffer, and
   * an asynchronous device -> host copy back into fpHostBuffer. It then blocks
   * until `stream` has drained.
   *
   * Each launch is followed by cudaGetLastError(), and the final synchronize
   * is checked too, so failures in this thread are reported rather than
   * silently dropped.
   */
  void role1()
  {
      checkCudaErrors( cudaMemcpyAsync ( fpDeviceBuffer, fpHostBuffer, size*sizeof(double), cudaMemcpyHostToDevice,stream) );

      calc<<<blocks,threads,0,stream>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc2<<<blocks,threads,0,stream>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc3<<<blocks,threads,0,stream>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());
      calc4<<<blocks,threads,0,stream>>>(fpDeviceBuffer,a,b);
      checkCudaErrors(cudaGetLastError());

      checkCudaErrors( cudaMemcpyAsync ( fpHostBuffer, fpDeviceBuffer, size*sizeof(double), cudaMemcpyDeviceToHost,stream) );

      // Errors raised while the queued work executes surface at this call.
      checkCudaErrors(cudaStreamSynchronize(stream));
  }
  99.  
  /**
   * Worker body for the second host thread.
   *
   * Queues the four kernels on `stream2` against fpDeviceBuffer2 (no host
   * copies are involved) and then blocks until `stream2` has drained.
   *
   * Each launch is followed by cudaGetLastError(), and the final synchronize
   * is checked too, so failures in this thread are reported rather than
   * silently dropped.
   */
  void role2()
  {
      calc<<<blocks,threads,0,stream2>>>(fpDeviceBuffer2,a,b);
      checkCudaErrors(cudaGetLastError());
      calc2<<<blocks,threads,0,stream2>>>(fpDeviceBuffer2,a,b);
      checkCudaErrors(cudaGetLastError());
      calc3<<<blocks,threads,0,stream2>>>(fpDeviceBuffer2,a,b);
      checkCudaErrors(cudaGetLastError());
      calc4<<<blocks,threads,0,stream2>>>(fpDeviceBuffer2,a,b);
      checkCudaErrors(cudaGetLastError());

      // Errors raised while the queued work executes surface at this call.
      checkCudaErrors(cudaStreamSynchronize(stream2));
  }
  108.  
  109.  
  /**
   * Stores the logical AND of a and b (1.0 when both are non-zero, otherwise
   * 0.0) into one element of fpDeviceBuffer per thread.
   *
   * Launch layout: 1D grid of 1D blocks. Thread t of block k writes element
   * k * blockDim.x + t. There is no bounds check, so the buffer must hold at
   * least gridDim.x * blockDim.x doubles.
   *
   * @param fpDeviceBuffer output buffer (dereferenced on the device)
   * @param a              first operand
   * @param b              second operand
   */
  __global__ void calc(double* fpDeviceBuffer, double a, double b)
  {
      // Take the block stride from the launch configuration rather than
      // assuming 512 threads per block; identical for 512-thread launches.
      const unsigned int element = blockIdx.x * blockDim.x + threadIdx.x;
      fpDeviceBuffer[element] = (a&&b);
  }
  /**
   * Stores the product a * b into one element of fpDeviceBuffer per thread.
   *
   * Launch layout: 1D grid of 1D blocks. Thread t of block k writes element
   * k * blockDim.x + t. There is no bounds check, so the buffer must hold at
   * least gridDim.x * blockDim.x doubles.
   *
   * @param fpDeviceBuffer output buffer (dereferenced on the device)
   * @param a              first factor
   * @param b              second factor
   */
  __global__ void calc2(double* fpDeviceBuffer, double a, double b)
  {
      // Take the block stride from the launch configuration rather than
      // assuming 512 threads per block; identical for 512-thread launches.
      const unsigned int element = blockIdx.x * blockDim.x + threadIdx.x;
      fpDeviceBuffer[element] = (a*b);
  }
  /**
   * Stores the logical OR of a and b (1.0 when either is non-zero, otherwise
   * 0.0) into one element of fpDeviceBuffer per thread.
   *
   * Launch layout: 1D grid of 1D blocks. Thread t of block k writes element
   * k * blockDim.x + t. There is no bounds check, so the buffer must hold at
   * least gridDim.x * blockDim.x doubles.
   *
   * @param fpDeviceBuffer output buffer (dereferenced on the device)
   * @param a              first operand
   * @param b              second operand
   */
  __global__ void calc3(double* fpDeviceBuffer, double a, double b)
  {
      // Take the block stride from the launch configuration rather than
      // assuming 512 threads per block; identical for 512-thread launches.
      const unsigned int element = blockIdx.x * blockDim.x + threadIdx.x;
      fpDeviceBuffer[element] = (a||b);
  }
  /**
   * Stores the sum a + b into one element of fpDeviceBuffer per thread.
   *
   * Launch layout: 1D grid of 1D blocks. Thread t of block k writes element
   * k * blockDim.x + t. There is no bounds check, so the buffer must hold at
   * least gridDim.x * blockDim.x doubles.
   *
   * @param fpDeviceBuffer output buffer (dereferenced on the device)
   * @param a              first addend
   * @param b              second addend
   */
  __global__ void calc4(double* fpDeviceBuffer, double a, double b)
  {
      // Take the block stride from the launch configuration rather than
      // assuming 512 threads per block; identical for 512-thread launches.
      const unsigned int element = blockIdx.x * blockDim.x + threadIdx.x;
      fpDeviceBuffer[element] = (a+b);
  }
  126.  
  127. [/code]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement