Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Vector Addition with Streams (Extra Credit)
- // Hard deadline : Thu 26 Mar 2015 6:00 AM CST
- #include <wb.h>
// wbCheck(stmt): evaluates the CUDA runtime call `stmt` exactly once. If the
// result is anything other than cudaSuccess, it logs the failing statement
// text and the CUDA error string through wbLog, then executes `return -1` in
// the *calling* function -- so it may only be used inside functions that
// return int. The argument is parenthesized and the temporary carries a
// macro-specific name so that an arbitrary caller expression (including one
// that mentions a variable called `err`) expands safely.
#define wbCheck(stmt)                                                         \
  do {                                                                        \
    cudaError_t wbCheckErr_ = (stmt);                                         \
    if (wbCheckErr_ != cudaSuccess) {                                         \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                             \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(wbCheckErr_));   \
      return -1;                                                              \
    }                                                                         \
  } while (0)
// Element-wise vector sum: out[k] = in1[k] + in2[k] for every k in [0, len).
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls at
// or beyond `len` exit without touching memory, so the launch may contain
// more threads than there are elements.
//
//   in1, in2  input vectors, read at indices [0, len)
//   out       output vector, written at indices [0, len)
//   len       number of elements to process
__global__ void vecAdd(float * in1, float * in2, float * out, int len) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= len) {
    return;  // surplus thread in the last block: nothing to do
  }
  out[idx] = in1[idx] + in2[idx];
}
// Host driver: adds two input vectors on the GPU using four CUDA streams.
//
// The input is cut into up to four contiguous segments (the last one may be
// shorter, and some may be empty for very small inputs). Segment s is copied
// to the device, summed by vecAdd, and copied back, all on stream s. Work is
// issued stage by stage across the streams (all host-to-device copies, then
// all kernels, then all device-to-host copies), and every stream is
// synchronized before the device buffers are released and the result is
// handed to wbSolution.
//
// Returns 0 on success, or -1 if a CUDA call fails (via wbCheck) or the host
// output buffer cannot be allocated.
int main(int argc, char **argv) {
  const int kNumStreams = 4;
  const int kBlockSize  = 256;

  wbArg_t args;
  int inputLength;
  float *h_A, *h_B, *h_C;

  cudaStream_t streams[kNumStreams];
  float *d_A[kNumStreams];
  float *d_B[kNumStreams];
  float *d_C[kNumStreams];
  int segOffset[kNumStreams];  // first element of each segment in the host arrays
  int segCount[kNumStreams];   // number of elements in each segment (may be 0)

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
  h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
  h_C = (float *) malloc(inputLength * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  // Treat a non-positive length as "no work" so the segmentation below never
  // produces negative counts.
  const int n = inputLength > 0 ? inputLength : 0;

  if (n > 0 && !h_C) {
    wbLog(ERROR, "Failed to allocate host buffer ", "h_C");
    return -1;
  }

  // Ceil-divide so that the n % kNumStreams tail elements are covered instead
  // of being silently dropped; offsets are accumulated rather than multiplied
  // so they can never exceed n.
  const int segLen = n / kNumStreams + (n % kNumStreams != 0 ? 1 : 0);
  for (int s = 0; s < kNumStreams; ++s) {
    const int offset    = (s == 0) ? 0 : segOffset[s - 1] + segCount[s - 1];
    const int remaining = n - offset;
    segOffset[s] = offset;
    segCount[s]  = remaining < segLen ? remaining : segLen;
  }

  // One stream plus one (A, B, C) buffer triple per segment. Buffers are sized
  // to the segment, not to the whole vector; empty segments get no allocation.
  for (int s = 0; s < kNumStreams; ++s) {
    d_A[s] = 0;
    d_B[s] = 0;
    d_C[s] = 0;
    wbCheck(cudaStreamCreate(&streams[s]));
    if (segCount[s] > 0) {
      // size_t arithmetic: an int byte count would overflow for large inputs.
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMalloc((void **) &d_A[s], bytes));
      wbCheck(cudaMalloc((void **) &d_B[s], bytes));
      wbCheck(cudaMalloc((void **) &d_C[s], bytes));
    }
  }

  // NOTE(review): h_C comes from malloc and is therefore pageable; h_A/h_B
  // come from wbImport and are presumably pageable too. The CUDA runtime may
  // not overlap async copies that involve pageable host memory; pinning the
  // host buffers would be needed for full copy/compute overlap -- confirm
  // whether that is wanted here.

  // Stage 1: host -> device copies, each segment on its own stream.
  for (int s = 0; s < kNumStreams; ++s) {
    if (segCount[s] > 0) {
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMemcpyAsync(d_A[s], h_A + segOffset[s], bytes,
                              cudaMemcpyHostToDevice, streams[s]));
      wbCheck(cudaMemcpyAsync(d_B[s], h_B + segOffset[s], bytes,
                              cudaMemcpyHostToDevice, streams[s]));
    }
  }

  // Stage 2: one kernel per segment, with the grid and length sized to that
  // segment only, so the kernel never reads device memory that was not copied.
  for (int s = 0; s < kNumStreams; ++s) {
    if (segCount[s] > 0) {
      const dim3 grid((segCount[s] - 1) / kBlockSize + 1, 1, 1);
      const dim3 block(kBlockSize, 1, 1);
      vecAdd<<<grid, block, 0, streams[s]>>>(d_A[s], d_B[s], d_C[s], segCount[s]);
      wbCheck(cudaGetLastError());  // surfaces launch-configuration errors
    }
  }

  // Stage 3: device -> host copies of each segment's result.
  for (int s = 0; s < kNumStreams; ++s) {
    if (segCount[s] > 0) {
      const size_t bytes = (size_t) segCount[s] * sizeof(float);
      wbCheck(cudaMemcpyAsync(h_C + segOffset[s], d_C[s], bytes,
                              cudaMemcpyDeviceToHost, streams[s]));
    }
  }

  // Wait for every stream before freeing device memory or reading h_C; this
  // is also where asynchronous execution errors are reported.
  for (int s = 0; s < kNumStreams; ++s) {
    wbCheck(cudaStreamSynchronize(streams[s]));
  }

  // cudaFree on a null pointer is a no-op, so empty segments need no guard.
  for (int s = 0; s < kNumStreams; ++s) {
    wbCheck(cudaFree(d_A[s]));
    wbCheck(cudaFree(d_B[s]));
    wbCheck(cudaFree(d_C[s]));
    wbCheck(cudaStreamDestroy(streams[s]));
  }

  wbSolution(args, h_C, inputLength);

  free(h_A);
  free(h_B);
  free(h_C);
  return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement