Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // MP Scan
- // Given a list (lst) of length n
- // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ...
- // +
- // lst[n-1]}
- #include <wb.h>
- #include <iostream>
// Threads per scan block; each block processes a tile of 2*BLOCK_SIZE elements.
#define BLOCK_SIZE 512 //@@ You can change this
// NOTE(review): MAX_BLOCKS is not referenced anywhere in this file.
#define MAX_BLOCKS 65535 // Maximum number of blocks per grid dimension
// Wraps a CUDA runtime call: on failure, logs the statement and the CUDA
// error string, then returns -1 from the enclosing function.
// (Comments are kept above the macro: a // comment on a continued line
// would swallow the line-continuation backslash.)
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
/////////////////////// KERNEL 1 ////////////////////////////
// Per-tile inclusive prefix sum (Brent-Kung work-efficient scan).
// Each block scans a tile of 2*BLOCK_SIZE consecutive elements of `input`
// (two elements per thread) and writes the tile's inclusive scan to the
// same positions of `output`.  The last thread additionally publishes the
// tile's total (last element of the inclusive scan) into
// AuxSum[blockIdx.x] so a second pass can propagate inter-tile offsets.
// Preconditions: blockDim.x == BLOCK_SIZE (the shared-memory size and the
// stride bounds below are compile-time constants tied to BLOCK_SIZE);
// gridDim.x == ceil(len / (2.0*BLOCK_SIZE)).
__global__ void blockScan(float *input, float *output, int len, float *AuxSum) {
    // Shared tile, scanned in place: 2 elements per thread.
    __shared__ float partialSum[2*BLOCK_SIZE];
    unsigned int t = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * blockDim.x; // first global index of this tile
    // Load the tile, zero-padding past the end of the input so the ragged
    // final tile scans correctly.
    if (start + t < len) {
        partialSum[t] = input[start + t];
    } else {
        partialSum[t] = 0.0f;
    }
    if (start + blockDim.x + t < len) {
        partialSum[blockDim.x + t] = input[start + blockDim.x + t];
    } else {
        partialSum[blockDim.x + t] = 0.0f;
    }
    __syncthreads();
    // Up-sweep (reduction) phase: build partial sums at doubling strides.
    int stride = 1;
    while (stride < 2*BLOCK_SIZE) {
        __syncthreads();
        int index = (threadIdx.x + 1) * stride * 2 - 1;
        if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
            partialSum[index] += partialSum[index - stride];
        }
        stride = stride*2;
    }
    __syncthreads();
    // Down-sweep (distribution) phase: halving strides complete the
    // inclusive scan for every position.
    stride = BLOCK_SIZE/2;
    while (stride > 0) {
        __syncthreads();
        int index = (threadIdx.x + 1) * stride * 2 - 1;
        if ((index + stride) < 2*BLOCK_SIZE){
            partialSum[index+stride] += partialSum[index];
        }
        stride = stride/2;
    }
    __syncthreads();
    // Store the scanned tile, again guarding the ragged final tile.
    if (start + t < len){
        output[start + t] = partialSum[t];
    }
    if (start + blockDim.x + t < len) {
        output[start + blockDim.x + t] = partialSum[blockDim.x + t];
    }
    // One thread per block publishes the tile total for the second-level
    // scan over block sums (kernel 2).
    if (threadIdx.x == blockDim.x -1) {
        AuxSum[blockIdx.x] = partialSum[2*blockDim.x - 1];
    }
}
/////////////////////// KERNEL 2 ////////////////////////////
// In-place inclusive Brent-Kung scan over the per-tile totals produced by
// blockScan.  Algorithmically identical to blockScan minus the totals
// output; see that kernel for the phase-by-phase commentary.
// Precondition: blockDim.x == BLOCK_SIZE.
// NOTE(review): each block only combines results within its own window of
// 2*BLOCK_SIZE elements.  If launched with gridDim.x > 1 (i.e.
// len > 2*BLOCK_SIZE) the windows are never merged, so the overall scan
// is only correct when a single block suffices — confirm callers bound
// the input accordingly.
__global__ void auxScan(float *AuxSum, int len) {
    __shared__ float partialSum[2*BLOCK_SIZE];
    unsigned int t = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * blockDim.x;
    // Load two elements per thread, zero-padding past the end.
    if (start + t < len) {
        partialSum[t] = AuxSum[start + t];
    } else {
        partialSum[t] = 0.0f;
    }
    if (start + blockDim.x + t < len) {
        partialSum[blockDim.x + t] = AuxSum[start + blockDim.x + t];
    } else {
        partialSum[blockDim.x + t] = 0.0f;
    }
    __syncthreads();
    // Up-sweep (reduction) phase.
    int stride = 1;
    while (stride < 2*BLOCK_SIZE) {
        __syncthreads();
        int index = (threadIdx.x + 1) * stride * 2 - 1;
        if (index < 2*BLOCK_SIZE && (index - stride) >= 0){
            partialSum[index] += partialSum[index - stride];
        }
        stride = stride*2;
    }
    __syncthreads();
    // Down-sweep (distribution) phase.
    stride = BLOCK_SIZE/2;
    while (stride > 0) {
        __syncthreads();
        int index = (threadIdx.x + 1) * stride * 2 - 1;
        if ((index + stride) < 2*BLOCK_SIZE){
            partialSum[index+stride] += partialSum[index];
        }
        stride = stride/2;
    }
    __syncthreads();
    // Write the scanned totals back in place.
    if (start + t < len) {
        AuxSum[start + t] = partialSum[t];
    }
    if (start + t + blockDim.x < len) {
        AuxSum[start + blockDim.x + t] = partialSum[t + blockDim.x];
    }
}
/////////////////////// KERNEL 3 ////////////////////////////
// Propagates the scanned tile totals back onto the per-tile scan results,
// turning the per-tile inclusive scans in `input` into a global inclusive
// scan in `output`.  Tile 0 is already correct and is copied through
// unchanged; tile k (k > 0) is shifted by AuxSum[k-1], the inclusive sum
// of all preceding tiles.
// Expects the same launch layout as blockScan: blockDim.x == BLOCK_SIZE,
// one block per tile of 2*BLOCK_SIZE elements.
__global__ void sum(float *input, float *output, float *AuxSum, int len) {
    unsigned int tid  = threadIdx.x;
    unsigned int base = 2 * blockDim.x * blockIdx.x;
    // Each thread handles two elements, blockDim.x apart.
    unsigned int lo = base + tid;
    unsigned int hi = lo + blockDim.x;
    if (blockIdx.x == 0) {
        // First tile: no preceding tiles, so no offset to add.
        if (lo < len) output[lo] = input[lo];
        if (hi < len) output[hi] = input[hi];
    } else {
        // Offset contributed by every tile before this one.
        float offset = AuxSum[blockIdx.x - 1];
        if (lo < len) output[lo] = input[lo] + offset;
        if (hi < len) output[hi] = input[hi] + offset;
    }
}
// Host driver: reads the input list, runs the three-kernel hierarchical
// scan (per-tile scan -> scan of tile totals -> offset propagation), and
// hands the result to the grader.
// Returns 0 on success, -1 on any allocation or CUDA error (via wbCheck).
int main(int argc, char **argv) {
    wbArg_t args;
    float *hostInput;    // input 1D list read from the dataset
    float *hostOutput1;  // final scan copied back from the GPU
    float *deviceInput;
    float *deviceOutput;  // per-tile scan results (kernel 1 output)
    float *deviceOutput1; // final global scan (kernel 3 output)
    float *deviceAuxSum;  // one tile total per scan block
    int numElements;      // number of elements in the list

    args = wbArg_read(argc, argv);

    // Import data and create memory on host.
    hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements);
    hostOutput1 = (float *)malloc(numElements * sizeof(float));
    if (hostOutput1 == NULL) {
        wbLog(ERROR, "Failed to allocate host output buffer");
        return -1;
    }

    // One auxiliary partial sum per tile of 2*BLOCK_SIZE elements.
    int len_aux = ceil(1.0*numElements/(2*BLOCK_SIZE));

    // auxScan only combines results within a single block, so the tile
    // totals must fit in one tile; reject inputs beyond that limit rather
    // than silently producing a wrong scan.
    if (len_aux > 2 * BLOCK_SIZE) {
        wbLog(ERROR, "Input too large for the single-level auxiliary scan");
        free(hostOutput1);
        free(hostInput);
        return -1;
    }

    // Allocate GPU memory.
    wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceOutput1, numElements * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceAuxSum, len_aux * sizeof(float)));

    // Clear output memory.
    wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float)));
    wbCheck(cudaMemset(deviceOutput1, 0, numElements * sizeof(float)));
    wbCheck(cudaMemset(deviceAuxSum, 0, len_aux * sizeof(float)));

    // Copy input memory to the GPU.
    wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), cudaMemcpyHostToDevice));

    //@@ Kernel 1: per-tile inclusive scans + per-tile totals.
    dim3 DimGrid(ceil(1.0*numElements/(2*BLOCK_SIZE)), 1, 1);
    dim3 DimBlock(BLOCK_SIZE, 1, 1);
    blockScan<<<DimGrid, DimBlock>>>(deviceInput, deviceOutput, numElements, deviceAuxSum);
    wbCheck(cudaGetLastError());
    // Synchronize through wbCheck so asynchronous kernel faults are not
    // silently dropped.
    wbCheck(cudaDeviceSynchronize());

    //@@ Kernel 2: scan the tile totals in place (single block suffices,
    // guaranteed by the len_aux bound checked above).
    dim3 AuxGrid(ceil(1.0*len_aux/(2*BLOCK_SIZE)), 1, 1);
    auxScan<<<AuxGrid, DimBlock>>>(deviceAuxSum, len_aux);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    //@@ Kernel 3: add the scanned tile totals onto the per-tile scans.
    // Reads deviceOutput directly — kernel 3 writes to a separate buffer,
    // so no intermediate device-to-device copy is needed.
    sum<<<DimGrid, DimBlock>>>(deviceOutput, deviceOutput1, deviceAuxSum, numElements);
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize());

    // Copy the final result back to the CPU.
    wbCheck(cudaMemcpy(hostOutput1, deviceOutput1, numElements * sizeof(float), cudaMemcpyDeviceToHost));

    //@@ Free GPU memory.
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    cudaFree(deviceOutput1);
    cudaFree(deviceAuxSum);

    wbSolution(args, hostOutput1, numElements);

    free(hostInput);
    free(hostOutput1);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement