Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2019
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.02 KB | None | 0 0
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>

  // Forward declaration: the kernel is defined after main() in this file.
  __global__ void sumScan(float* input, float* output, unsigned int offset, unsigned int max);

  // Returns true when `status` is cudaSuccess; otherwise reports the failed step on stderr.
  static bool cudaOk(cudaError_t status, const char* what){
      if (status == cudaSuccess){
          return true;
      }
      fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
      return false;
  }

  // Host driver: fills an array with ones, runs repeated sumScan passes with
  // doubling offsets over two ping-pong device buffers, times the passes with
  // CUDA events and prints the last element of the result.
  // Returns 0 on success and 1 if any allocation or CUDA call failed.
  int main(){
      const unsigned int Block_Size = 32; //Size of block
      const unsigned int Input_Size = 2048; //How many numbers
      const unsigned int Max_Grid_X = 65535; //Cap applied to the grid's x dimension
      const size_t bytes = Input_Size * sizeof(float);

      srand((unsigned int) time(NULL));//Initialize random number generator (rand() is not called below)

      //Grid size initialisation: ceil-divide so every element gets a thread without
      //launching a spare block when Input_Size is a multiple of Block_Size.
      const unsigned int numBlocks = (Input_Size + Block_Size - 1) / Block_Size;
      const unsigned int X_Dim = (numBlocks > Max_Grid_X) ? Max_Grid_X : numBlocks;
      //Ceil-divide by the real row width so X_Dim * Y_Dim >= numBlocks for every numBlocks.
      const unsigned int Y_Dim = (numBlocks + X_Dim - 1) / X_Dim;
      dim3 gridSize(X_Dim, Y_Dim);

      //Everything starts out null so the single cleanup path below is always safe.
      cudaEvent_t start = NULL;
      cudaEvent_t stop = NULL;
      float* d_input = NULL;
      float* d_output = NULL;
      float* h_input = (float*) malloc(bytes);
      float* h_output = (float*) malloc(bytes);

      bool ok = (h_input != NULL) && (h_output != NULL);
      if (!ok){
          fprintf(stderr, "Host allocation failed.\n");
      }

      ok = ok && cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
      ok = ok && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
      ok = ok && cudaOk(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)");
      ok = ok && cudaOk(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)");

      if (ok){
          printf("Generating random numbers.\n");

          //Fill h_input with ones; given the sumScan passes below, the last
          //output element is then expected to equal Input_Size.
          for (unsigned int i = 0; i < Input_Size; i++){
              h_input[i] = 1.0f;
          }

          printf("Copying memory.\n");

          //Copy memory to device (blocking copy, so no extra synchronize is needed)
          ok = cudaOk(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice), "host-to-device copy");
      }

      if (ok){
          printf("Performing scan.\n");
          ok = cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");
      }

      //For all powers of 2 up to (but not including) Input_Size.
      //Launches on the same stream run in issue order, so no per-pass device
      //synchronize is needed inside the timed region.
      for (unsigned int offset = 1; ok && offset < Input_Size; offset <<= 1){
          sumScan <<<gridSize, Block_Size >>>(d_input, d_output, offset, Input_Size); //Perform single iteration of scan
          ok = cudaOk(cudaGetLastError(), "sumScan launch");

          //Swap pointers for double buffering: d_input now names the newest data.
          float* temp = d_input;
          d_input = d_output;
          d_output = temp;
      }

      float milliseconds = 0;
      ok = ok && cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)");
      //Waiting on the stop event also surfaces errors raised while the kernels ran.
      ok = ok && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
      ok = ok && cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

      //Copy memory back. After the final swap the finished scan lives in d_input
      //(when the loop ran zero times, d_input still holds the untouched input,
      //which is already the scan of a one-element array).
      ok = ok && cudaOk(cudaMemcpy(h_output, d_input, bytes, cudaMemcpyDeviceToHost), "device-to-host copy");

      if (ok){
          printf("Elapsed time was: %f milliseconds\n", milliseconds);
          printf("Calculation complete. Final result: %f", h_output[Input_Size - 1]);
      }

      //Free allocated memory (cudaFree and free both accept null pointers)
      cudaFree(d_input);
      cudaFree(d_output);
      free(h_input);
      free(h_output);
      if (start != NULL){
          cudaEventDestroy(start);
      }
      if (stop != NULL){
          cudaEventDestroy(stop);
      }

      getchar();//Pause at end of program

      return ok ? 0 : 1;
  }
  87.  
  88.  
  // Single scan pass: for every index i < max,
  //   output[i] = input[i] + input[i - offset]   when i >= offset
  //   output[i] = input[i]                       otherwise.
  // Elements at index >= max are neither read nor written.
  // Indexing uses blockIdx.x, blockIdx.y and threadIdx.x only, i.e. it expects
  // 1D blocks arranged in a 1D or 2D grid.
  __global__ void sumScan(float* input, float* output, unsigned int offset, unsigned int max){
      // Number of threads in one row (fixed blockIdx.y) of the grid.
      const unsigned int threadsPerGridRow = gridDim.x * blockDim.x;

      // Flatten (blockIdx.y, blockIdx.x, threadIdx.x) into one element index.
      const unsigned int position = threadsPerGridRow * blockIdx.y + blockDim.x * blockIdx.x + threadIdx.x;

      // Threads that fall past the end of the data have nothing to do.
      if (position >= max){
          return;
      }

      // Build the result locally, then store it once.
      float value = input[position];
      if (position >= offset){
          value += input[position - offset];
      }
      output[position] = value;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement