Advertisement
Guest User

Untitled

a guest
May 3rd, 2015
244
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.27 KB | None | 0 0
  1. // Vector Addition with Streams (Extra Credit)
  2. // Hard deadline : Thu 26 Mar 2015 6:00 AM CST
  3. #include <wb.h>
  // wbCheck(stmt): evaluates the CUDA runtime call `stmt` exactly once and, if
  // the result is not cudaSuccess, logs the failing statement text and the CUDA
  // error string through wbLog and then executes `return -1;` in the calling
  // function. It can therefore only be used inside functions returning an
  // integer-compatible type (e.g. main).
  //
  // The argument is parenthesized and the temporary uses a name unlikely to
  // clash with caller variables, so a call whose arguments mention a local
  // named `err` is not silently shadowed by the macro's own variable.
  #define wbCheck(stmt) do { \
      cudaError_t wbCheckErr_ = (stmt); \
      if (wbCheckErr_ != cudaSuccess) { \
        wbLog(ERROR, "Failed to run stmt ", #stmt); \
        wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(wbCheckErr_)); \
        return -1; \
      } \
    } while(0)
  12.  
  // Element-wise vector sum: out[k] = in1[k] + in2[k] for every k in [0, len).
  //
  // Each thread handles at most one element, chosen by its flat 1D global index
  // (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls at or
  // beyond `len` return without touching memory, so the grid may be larger than
  // the data.
  //
  //   in1, in2 : input arrays, read at indices [0, len)
  //   out      : output array, written at indices [0, len)
  //   len      : number of elements to process
  __global__ void vecAdd(float * in1, float * in2, float * out, int len) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= len) {
      return;
    }
    out[idx] = in1[idx] + in2[idx];
  }
  18.  
  // Host driver: adds two input vectors on the GPU using four CUDA streams.
  //
  // The input of length n is split into four contiguous segments of
  // ceil(n / 4) elements (the last non-empty segment may be shorter, and
  // trailing segments may be empty when n < 4). Each segment is owned by one
  // stream, which copies its slice of A and B to the device, runs vecAdd on
  // exactly that slice, and copies the slice of C back. Work is issued stage
  // by stage across all streams (all H2D copies, then all kernels, then all
  // D2H copies), matching the issue order of the original code.
  //
  // Returns 0 on success, -1 if any CUDA call fails (via wbCheck) or the host
  // output buffer cannot be allocated.
  int main(int argc, char ** argv) {
    const int kNumStreams = 4;
    const int kBlockSize = 256;

    wbArg_t args;
    int inputLength;

    float *h_A, *h_B, *h_C;

    cudaStream_t streams[kNumStreams];
    float *d_A[kNumStreams];
    float *d_B[kNumStreams];
    float *d_C[kNumStreams];
    int segOffset[kNumStreams];  // first element handled by each stream
    int segCount[kNumStreams];   // number of elements handled by each stream

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
    h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
    h_C = (float *) malloc(inputLength * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    if (inputLength != 0 && h_C == NULL) {
      wbLog(ERROR, "Failed to allocate host output buffer");
      return -1;
    }

    // Treat a non-positive length as "nothing to do" on the device.
    const int n = inputLength > 0 ? inputLength : 0;

    // Ceil-divide so that every element belongs to exactly one segment even
    // when n is not a multiple of the stream count (and so that n < 4 does not
    // produce a zero-length step).
    const int segLen = n / kNumStreams + (n % kNumStreams != 0 ? 1 : 0);

    for (int s = 0; s < kNumStreams; ++s) {
      const int offset = s * segLen;
      const int remaining = n - offset;
      segOffset[s] = offset;
      if (remaining <= 0) {
        segCount[s] = 0;
      } else if (remaining < segLen) {
        segCount[s] = remaining;
      } else {
        segCount[s] = segLen;
      }
      d_A[s] = NULL;
      d_B[s] = NULL;
      d_C[s] = NULL;
    }

    for (int s = 0; s < kNumStreams; ++s) {
      wbCheck(cudaStreamCreate(&streams[s]));
    }

    // Each stream only needs room for its own segment, not the whole vector.
    for (int s = 0; s < kNumStreams; ++s) {
      if (segCount[s] == 0) continue;
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMalloc((void **) &d_A[s], bytes));
      wbCheck(cudaMalloc((void **) &d_B[s], bytes));
      wbCheck(cudaMalloc((void **) &d_C[s], bytes));
    }

    // NOTE(review): h_A/h_B come from wbImport and h_C from malloc, so they are
    // presumably pageable; asynchronous copies from pageable memory may not
    // overlap with kernels. Pinning the host buffers would be needed for real
    // copy/compute overlap -- confirm against wbImport's allocator.

    // Stage 1: host -> device copies of each stream's slice of A and B.
    for (int s = 0; s < kNumStreams; ++s) {
      if (segCount[s] == 0) continue;
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMemcpyAsync(d_A[s], h_A + segOffset[s], bytes,
                              cudaMemcpyHostToDevice, streams[s]));
      wbCheck(cudaMemcpyAsync(d_B[s], h_B + segOffset[s], bytes,
                              cudaMemcpyHostToDevice, streams[s]));
    }

    // Stage 2: one kernel per stream, sized to that stream's segment only.
    for (int s = 0; s < kNumStreams; ++s) {
      if (segCount[s] == 0) continue;
      const int gridSize = (segCount[s] - 1) / kBlockSize + 1;
      vecAdd<<<gridSize, kBlockSize, 0, streams[s]>>>(d_A[s], d_B[s], d_C[s],
                                                      segCount[s]);
      // Launch-configuration errors are only visible through cudaGetLastError.
      wbCheck(cudaGetLastError());
    }

    // Stage 3: device -> host copies of each stream's slice of C.
    for (int s = 0; s < kNumStreams; ++s) {
      if (segCount[s] == 0) continue;
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMemcpyAsync(h_C + segOffset[s], d_C[s], bytes,
                              cudaMemcpyDeviceToHost, streams[s]));
    }

    // h_C must not be read (and device buffers must not be released) until
    // every stream has finished; this also surfaces asynchronous failures.
    for (int s = 0; s < kNumStreams; ++s) {
      wbCheck(cudaStreamSynchronize(streams[s]));
    }

    for (int s = 0; s < kNumStreams; ++s) {
      wbCheck(cudaFree(d_A[s]));
      wbCheck(cudaFree(d_B[s]));
      wbCheck(cudaFree(d_C[s]));
      wbCheck(cudaStreamDestroy(streams[s]));
    }

    wbSolution(args, h_C, inputLength);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement