Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // MP Scan
- // Given a list (lst) of length n
- // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
- // +
- // lst[n-1]}
- #include <wb.h>
- #define BLOCK_SIZE 1024 //@@ You can change this
- #define SEC_SIZE 2*BLOCK_SIZE //section size is how many elements it takes in once
- #define wbCheck(stmt) \
- do { \
- cudaError_t err = stmt; \
- if (err != cudaSuccess) { \
- wbLog(ERROR, "Failed to run stmt ", #stmt); \
- wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
- return -1; \
- } \
- } while (0)
__global__ void add(float *input, float *aux, int len){
  // Completes the hierarchical scan: every block except block 0 offsets its
  // section of `input` by the scanned sum of all preceding sections, which
  // the auxiliary pass left in aux[blockIdx.x - 1].
  // Layout: each block covers 2 * blockDim.x elements; each thread updates
  // two of them, BLOCK_SIZE apart (assumes blockDim.x == BLOCK_SIZE, as
  // launched from main). Modifies `input` in place.
  if (blockIdx.x == 0)
    return; // first section needs no offset

  const float offset = aux[blockIdx.x - 1];
  const int first = 2 * blockIdx.x * blockDim.x + threadIdx.x;
  const int second = first + BLOCK_SIZE;

  if (first < len)
    input[first] += offset;
  if (second < len)
    input[second] += offset;
}
- //use flag to tell GPU not to create an auxillary array when scanning auxillary array itself
//use flag to tell GPU not to create an auxillary array when scanning auxillary array itself
__global__ void scan(float *input, float *output, float *aux, int flag, int len) {
  // Brent-Kung inclusive prefix sum. Each block scans one section of
  // 2 * blockDim.x elements of `input` into `output`.
  //
  // Parameters:
  //   input/output - device arrays; `len` is the number of VALID elements
  //                  (positions past `len` are treated as 0 and not stored).
  //   aux          - when `flag` is non-zero, the section total is written to
  //                  aux[blockIdx.x] so a second scan pass over `aux` can
  //                  produce per-block offsets. Ignored (may be NULL) when
  //                  `flag` is 0.
  //
  // Precondition: blockDim.x <= BLOCK_SIZE (shared XY holds SEC_SIZE floats).
  __shared__ float XY[SEC_SIZE];
  const int sectionSize = 2 * blockDim.x; // elements this block actually scans
  int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
  int t = threadIdx.x;
  // Zero-fill so a partially filled last section still scans correctly.
  XY[t] = 0;
  XY[t + blockDim.x] = 0;
  if (i < len)
    XY[t] = input[i];
  // BUGFIX: the guard previously tested `i + BLOCK_SIZE < len` while the
  // index used blockDim.x; the two disagree whenever the kernel is launched
  // with fewer than BLOCK_SIZE threads (the auxiliary-array pass), causing
  // out-of-bounds accesses. Use blockDim.x consistently.
  if (i + blockDim.x < len)
    XY[t + blockDim.x] = input[i + blockDim.x];
  // Reduction (up-sweep) phase: build partial sums at power-of-two strides.
  // Bounded by sectionSize (== SEC_SIZE when blockDim.x == BLOCK_SIZE) so a
  // smaller launch never touches shared slots beyond its own section.
  for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
    __syncthreads(); // barrier is outside any divergent branch
    int index = (t + 1) * 2 * stride - 1;
    if (index < sectionSize) {
      XY[index] += XY[index - stride];
    }
  }
  // Distribution (down-sweep) phase: propagate the partial sums back down.
  for (int stride = sectionSize / 4; stride > 0; stride /= 2) {
    __syncthreads();
    int index = (t + 1) * stride * 2 - 1;
    if (index + stride < sectionSize) {
      XY[index + stride] += XY[index];
    }
  }
  __syncthreads();
  // Write back only the valid prefix of the section.
  if (i < len)
    output[i] = XY[t];
  if (i + blockDim.x < len)
    output[i + blockDim.x] = XY[t + blockDim.x];
  // First pass only: last thread publishes the section total for the
  // hierarchical (auxiliary) scan.
  if (flag && t == blockDim.x - 1)
    aux[blockIdx.x] = XY[sectionSize - 1];
}
int main(int argc, char **argv) {
  // Host driver: imports the list, runs the three-step hierarchical scan
  // (scan sections -> scan section totals -> add offsets), and checks the
  // result against the expected solution.
  wbArg_t args;
  float *hostInput;  // The input 1D list
  float *hostOutput; // The output list
  float *deviceInput;
  float *deviceOutput;
  float *aux;        // per-section totals, scanned in place into offsets
  int numElements;   // number of elements in the list
  args = wbArg_read(argc, argv);
  wbTime_start(Generic, "Importing data and creating memory on host");
  hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
  hostOutput = (float *)malloc(numElements * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");
  // Number of scan sections (= blocks); each covers 2*BLOCK_SIZE elements.
  int num = (int)ceil(numElements / (2.0 * BLOCK_SIZE));
  wbLog(TRACE, "The number of input elements in the input is ",
        numElements);
  wbTime_start(GPU, "Allocating GPU memory.");
  wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
  wbCheck(cudaMalloc((void **)&aux, num * sizeof(float))); // auxiliary array
  wbTime_stop(GPU, "Allocating GPU memory.");
  wbTime_start(GPU, "Clearing output memory.");
  wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
  wbTime_stop(GPU, "Clearing output memory.");
  wbTime_start(GPU, "Copying input memory to the GPU.");
  wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");
  // Launch configuration: `num` blocks of BLOCK_SIZE threads, each block
  // scanning a section of 2*BLOCK_SIZE elements.
  dim3 dimGrid(num);
  dim3 dimBlock(BLOCK_SIZE);
  // BUGFIX: launch the auxiliary scan with BLOCK_SIZE threads (not
  // ceil(num/2)) so blockDim.x matches the BLOCK_SIZE used in the kernel's
  // bounds guards. A single block of BLOCK_SIZE threads covers up to
  // 2*BLOCK_SIZE section totals — same capacity limit as before
  // (numElements <= 4 * BLOCK_SIZE^2; larger inputs would need a third level).
  dim3 auxDimGrid(1);
  dim3 auxDimBlock(BLOCK_SIZE);
  wbTime_start(Compute, "Performing CUDA computation");
  // Step 1: scan each section and emit per-section totals into aux.
  scan<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, aux, 1, numElements);
  wbCheck(cudaGetLastError()); // catch bad launch configuration
  wbCheck(cudaDeviceSynchronize());
  // Step 2: scan the section totals in place.
  // BUGFIX: pass the auxiliary array's real length `num` as len — previously
  // numElements was passed, which made the kernel read and write past the
  // end of the num-float aux allocation.
  scan<<<auxDimGrid, auxDimBlock>>>(aux, aux, NULL, 0, num);
  wbCheck(cudaGetLastError());
  wbCheck(cudaDeviceSynchronize());
  // Step 3: add each block's offset (scanned total of preceding sections).
  add<<<dimGrid, dimBlock>>>(deviceOutput, aux, numElements);
  wbCheck(cudaGetLastError());
  wbCheck(cudaDeviceSynchronize());
  wbTime_stop(Compute, "Performing CUDA computation");
  wbTime_start(Copy, "Copying output memory to the CPU");
  wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float),
                     cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");
  wbTime_start(GPU, "Freeing GPU Memory");
  cudaFree(deviceInput);
  cudaFree(deviceOutput);
  cudaFree(aux);
  wbTime_stop(GPU, "Freeing GPU Memory");
  wbSolution(args, hostOutput, numElements);
  free(hostInput);
  free(hostOutput);
  return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement