Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// MP Scan
// Given a list (lst) of length n, output its inclusive prefix sum:
// {lst[0], lst[0] + lst[1], ..., lst[0] + lst[1] + ... + lst[n-1]}
- #include <wb.h>
- #define BLOCK_SIZE 512 //@@ You can change this
// Error-checking wrapper for CUDA runtime calls: logs the failing statement
// text and the CUDA error string via wbLog, then returns -1 from the
// enclosing function. do/while(0) makes it expand as a single statement.
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
// Completes the hierarchical scan: adds the scanned total of all preceding
// blocks (blockSums[blockIdx.x - 1], already an inclusive scan) to both
// elements this thread covers.  Block 0 has no preceding blocks and is left
// untouched.  Expected launch: same grid/block shape as scan() — one thread
// per two output elements.
__global__ void addfinalscan(float* blockSums, float* scannedOutput, int len){
    int first  = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    int second = first + blockDim.x;
    if (blockIdx.x == 0)
        return;  // no __syncthreads below, so an early exit is safe
    float offset = blockSums[blockIdx.x - 1];
    if (first < len)
        scannedOutput[first] += offset;
    if (second < len)
        scannedOutput[second] += offset;
}
// In-place inclusive Brent-Kung scan of the per-block partial sums.
// Launched as a SINGLE block of BLOCK_SIZE threads, so it can scan at most
// 2*BLOCK_SIZE values.  Precondition: blockDim.x == BLOCK_SIZE (the shared
// tile is sized by the macro, not by blockDim).
__global__ void finalscan(float *blockSums, int len){
    __shared__ float buf[2 * BLOCK_SIZE];
    const int tid  = threadIdx.x;
    const int base = blockIdx.x * blockDim.x * 2 + tid;

    // Stage two elements per thread; zero-pad past-the-end slots so the tree
    // walk never reads uninitialized shared memory.
    buf[tid] = (base < len) ? blockSums[base] : 0.0f;
    buf[blockDim.x + tid] =
        (blockDim.x + base < len) ? blockSums[blockDim.x + base] : 0.0f;

    // Up-sweep (reduction tree).
    for (int stride = 1; stride < 2 * BLOCK_SIZE; stride <<= 1) {
        __syncthreads();
        int pos = (tid + 1) * stride * 2 - 1;
        if (pos < 2 * BLOCK_SIZE && pos - stride >= 0)
            buf[pos] += buf[pos - stride];
    }

    // Down-sweep: push partial sums back down the tree.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
        __syncthreads();
        int pos = (tid + 1) * 2 * stride - 1;
        if (pos + stride < 2 * BLOCK_SIZE)
            buf[pos + stride] += buf[pos];
    }

    __syncthreads();
    // Write the scanned values back over the input array.
    if (base < len)
        blockSums[base] = buf[tid];
    if (base + blockDim.x < len)
        blockSums[base + blockDim.x] = buf[tid + blockDim.x];
}
// Phase-1 inclusive Brent-Kung scan: each block scans its own tile of
// 2*BLOCK_SIZE input elements into `output`, and publishes the tile's total
// to blockSums[blockIdx.x] for the later hierarchical fix-up.
// Precondition: blockDim.x == BLOCK_SIZE (shared tile is sized by the macro).
__global__ void scan(float *input, float* blockSums, float *output, int len) {
    __shared__ float T[2 * BLOCK_SIZE];
    int t = threadIdx.x;
    int idx = (blockIdx.x * blockDim.x * 2) + t;

    // Load two elements per thread, zero-padding past-the-end slots so the
    // tree traversal never reads uninitialized shared memory.
    T[t] = (idx < len) ? input[idx] : 0.0f;
    T[blockDim.x + t] = (blockDim.x + idx < len) ? input[blockDim.x + idx] : 0.0f;

    // Up-sweep (reduction tree).  index >= 2*stride - 1 >= stride, so the
    // original `(index - stride) >= 0` guard was provably redundant.
    int stride = 1;
    while (stride < 2 * BLOCK_SIZE) {
        __syncthreads();
        int index = (t + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            T[index] += T[index - stride];
        stride *= 2;
    }

    // Down-sweep: distribute partial sums back down the tree.
    stride = BLOCK_SIZE / 2;
    while (stride > 0) {
        __syncthreads();
        int index = (t + 1) * 2 * stride - 1;
        if (index + stride < 2 * BLOCK_SIZE)
            T[index + stride] += T[index];
        stride /= 2;
    }

    __syncthreads();
    if (idx < len)
        output[idx] = T[t];
    if (idx + blockDim.x < len)
        output[idx + blockDim.x] = T[t + blockDim.x];

    // BUG FIX: the original guard was `idx == blockDim.x - 1`, which holds
    // only in block 0 (for blockIdx.x >= 1, idx >= 2*blockDim.x), so
    // blockSums[1..] were never written and held cudaMalloc garbage that
    // addfinalscan() then added into the output.  Every block must publish
    // its tile total (last slot of the scanned tile) exactly once.
    if (t == blockDim.x - 1)
        blockSums[blockIdx.x] = T[2 * BLOCK_SIZE - 1];
}
int main(int argc, char **argv) {
    wbArg_t args;
    float *hostInput;   // The input 1D list
    float *hostOutput;  // The output list
    float *deviceInput;
    float *deviceOutput;
    int numElements;    // number of elements in the list
    float *blockSums;   // device scratch: one partial sum per scan block

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput = (float *)malloc(numElements * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");
    wbLog(TRACE, "The number of input elements in the input is ",
          numElements);

    // Each scan block covers 2*BLOCK_SIZE elements.  finalscan runs as one
    // block, so this two-level hierarchy supports at most 2*BLOCK_SIZE
    // partial sums, i.e. 4*BLOCK_SIZE*BLOCK_SIZE input elements.
    unsigned int blockScanSize = ceil(numElements / (BLOCK_SIZE * 2.0));

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&blockSums, blockScanSize * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Clearing output memory.");
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbTime_stop(GPU, "Clearing output memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                       cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    // One thread per two elements, BLOCK_SIZE threads per block.
    dim3 DimGrid(blockScanSize, 1, 1);
    dim3 DimBlock(BLOCK_SIZE, 1, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    // Phase 1: per-block inclusive scan; each block also emits its total.
    scan<<<DimGrid, DimBlock>>>(deviceInput, blockSums, deviceOutput, numElements);
    wbCheck(cudaGetLastError());        // catch bad launch configurations
    wbCheck(cudaDeviceSynchronize());   // catch async execution faults

    // Phase 2: scan the per-block totals (single block).
    dim3 DimGridblockSum(1, 1, 1);
    finalscan<<<DimGridblockSum, DimBlock>>>(blockSums, blockScanSize);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    // Phase 3: add each preceding blocks' total back into the block's tile.
    addfinalscan<<<DimGrid, DimBlock>>>(blockSums, deviceOutput, numElements);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    // Debug dump, moved OUT of the timed copy region above so host-side
    // printf no longer skews the "Copying output memory to the CPU" number.
    for (int i = 0; i < numElements; i++) {
        printf("% d %f %f\n", i, hostInput[i], hostOutput[i]);
    }
    float sum = 0;
    for (int j = 0; j < numElements; j++) {
        sum += hostInput[j];
    }
    printf("Expected: %f\n", sum);

    wbTime_start(GPU, "Freeing GPU Memory");
    wbCheck(cudaFree(deviceInput));
    wbCheck(cudaFree(deviceOutput));
    wbCheck(cudaFree(blockSums));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, numElements);

    free(hostInput);
    free(hostOutput);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement