Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Aborts with a diagnostic when a CUDA runtime call does not return cudaSuccess.
#define SCAN_CUDA_CHECK(call)                                                  \
    do {                                                                       \
        cudaError_t scanErr_ = (call);                                         \
        if (scanErr_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(scanErr_));                             \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

/*
 * Host driver for a double-buffered inclusive sum scan.
 *
 * Fills a host array of Input_Size floats with 1.0f, copies it to the device,
 * and launches sumScan once per power-of-two offset below Input_Size, swapping
 * the two device buffers after every pass. The passes are timed with CUDA
 * events; the elapsed time and the last element of the scan are printed.
 *
 * NOTE(review): sumScan is called before its definition in this file, so a
 * prototype is assumed to be declared above main — confirm.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails. Any failing
 * CUDA call terminates the process through SCAN_CUDA_CHECK.
 */
int main(){
    const unsigned int Block_Size = 32;   //Threads per block
    const unsigned int Input_Size = 2048; //How many numbers (must be > 0)
    const size_t bytes = Input_Size * sizeof(float);

    //Seed the host RNG. No rand() call consumes it while the input below is constant.
    srand(time(NULL));

    //Grid size initialisation: ceil-divide so the grid covers every element
    //without an extra block when Input_Size is a multiple of Block_Size.
    const unsigned int Max_Grid_Dim = 65535;
    const unsigned int numBlocks = (Input_Size + Block_Size - 1) / Block_Size;
    const unsigned int X_Dim = (numBlocks > Max_Grid_Dim) ? Max_Grid_Dim : numBlocks;
    //Rows of X_Dim blocks needed to reach numBlocks. Dividing by X_Dim (not 65536)
    //guarantees X_Dim * Y_Dim >= numBlocks for every size.
    const unsigned int Y_Dim = (numBlocks + X_Dim - 1) / X_Dim;
    dim3 gridSize(X_Dim, Y_Dim);

    //Allocate host arrays
    float* h_input = (float*) malloc(bytes);
    float* h_output = (float*) malloc(bytes);
    if (h_input == NULL || h_output == NULL){
        fprintf(stderr, "Host allocation failed.\n");
        free(h_input);
        free(h_output);
        return EXIT_FAILURE;
    }

    //Allocate device arrays
    float* d_input = NULL;
    float* d_output = NULL;
    SCAN_CUDA_CHECK(cudaMalloc(&d_input, bytes));
    SCAN_CUDA_CHECK(cudaMalloc(&d_output, bytes));

    cudaEvent_t start, stop;
    SCAN_CUDA_CHECK(cudaEventCreate(&start));
    SCAN_CUDA_CHECK(cudaEventCreate(&stop));

    printf("Generating random numbers.\n");
    //Fill h_input. Every element is 1, so the last scan element equals Input_Size.
    for (unsigned int i = 0; i < Input_Size; i++){
        h_input[i] = 1.0f;
    }

    printf("Copying memory.\n");
    //Copy memory to device
    SCAN_CUDA_CHECK(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice));
    SCAN_CUDA_CHECK(cudaDeviceSynchronize());

    printf("Performing scan.\n");
    SCAN_CUDA_CHECK(cudaEventRecord(start));
    //One pass per power of 2 below Input_Size. Launches on the same stream run in
    //order, so no host-side synchronisation is needed between passes.
    for (unsigned int offset = 1; offset < Input_Size; offset <<= 1){
        sumScan <<<gridSize, Block_Size >>>(d_input, d_output, offset, Input_Size); //Perform single iteration of scan
        SCAN_CUDA_CHECK(cudaGetLastError()); //Catch invalid launch configurations
        //Swap pointers for double buffering: d_input now holds the newest result
        float* temp = d_input;
        d_input = d_output;
        d_output = temp;
    }
    SCAN_CUDA_CHECK(cudaEventRecord(stop));
    SCAN_CUDA_CHECK(cudaEventSynchronize(stop)); //Wait until every pass has finished

    float milliseconds = 0;
    SCAN_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Elapsed time was: %f milliseconds\n", milliseconds);

    //Copy memory back. After the final swap the finished scan lives in d_input.
    SCAN_CUDA_CHECK(cudaMemcpy(h_output, d_input, bytes, cudaMemcpyDeviceToHost));
    printf("Calculation complete. Final result: %f", h_output[Input_Size - 1]);
    fflush(stdout); //Make sure the result is visible before blocking on input

    //Free allocated memory and events
    SCAN_CUDA_CHECK(cudaFree(d_input));
    SCAN_CUDA_CHECK(cudaFree(d_output));
    SCAN_CUDA_CHECK(cudaEventDestroy(start));
    SCAN_CUDA_CHECK(cudaEventDestroy(stop));
    free(h_input);
    free(h_output);

    getchar();//Pause at end of program
    return 0;
}
/*
 * One pass of a double-buffered sum scan.
 *
 * Each thread handles one element: it writes input[position] to
 * output[position], adding input[position - offset] when position >= offset.
 *
 * Launch layout: 1D blocks in a 2D grid. Threads are flattened row by row
 * (blockIdx.y selects a row of gridDim.x blocks). Threads whose flattened
 * position is >= max do nothing, so the grid may overshoot the data.
 *
 * input  - values produced by the previous pass (read only here)
 * output - receives this pass's values
 * offset - distance to the element that is added in
 * max    - number of elements in input/output
 */
__global__ void sumScan(float* input, float* output, unsigned int offset, unsigned int max){
    //Calculate 1D position from the 2D grid of 1D blocks
    const unsigned int threadsPerRow = gridDim.x * blockDim.x;
    const unsigned int position = threadsPerRow * blockIdx.y + blockDim.x * blockIdx.x + threadIdx.x;

    //Out of bounds: nothing to do
    if (position >= max){
        return;
    }

    //Start from this thread's own element
    float value = input[position];

    //Add offset element if it exists
    if (position >= offset){
        value += input[position - offset];
    }

    output[position] = value;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement