Guest User

Untitled

a guest
Sep 14th, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.27 KB | None | 0 0
  1. Very poor memory access performance with CUDA
  2. #include "cuda_runtime.h"
  3. #include <stdio.h>
  4.  
  5. #define d_MaxParallelRows 320 // threads doing work per launch: kernel bound on the thread index, also sizes the grid and the per-launch row advance
  6. #define d_MinTreatedRow 5 // first row handed to the first kernel launch
  7. #define d_MaxTreatedRow 915 // last row processed (inclusive bound in both the kernel and the host loop)
  8. #define d_RowsResolution 1 // row step between consecutive threads; also scales the per-launch row advance
  9. #define k_ThreadsPerBlock 64 // block size used for every launch
  10.  
  11. __global__ void myKernel(int Xi_FirstTreatedRow)
  12. {
  13. int l_ThreadIndex = blockDim.x * blockIdx.x + threadIdx.x;
  14. if (l_ThreadIndex >= d_MaxParallelRows)
  15. return;
  16. int l_Row = Xi_FirstTreatedRow + (l_ThreadIndex * d_RowsResolution);
  17. if (l_Row <= d_MaxTreatedRow) {
  18.  
  19. //float l_SrcIntegral[1700];
  20. float* l_SrcIntegral = (float*)malloc(1700 * sizeof(float));
  21.  
  22. for (int x=185; x<1407; x++) {
  23. for (int i=0; i<1700; i++)
  24. l_SrcIntegral[i] = i;
  25. }
  26.  
  27. free(l_SrcIntegral);
  28. }
  29. }
  30.  
  31. int main()
  32. {
  33. cudaError_t cudaStatus;
  34.  
  35. cudaStatus = cudaSetDevice(0);
  36.  
  37. int l_ThreadsPerBlock = k_ThreadsPerBlock;
  38. int l_BlocksPerGrid = (d_MaxParallelRows + l_ThreadsPerBlock - 1) / l_ThreadsPerBlock;
  39.  
  40. int l_FirstRow = d_MinTreatedRow;
  41. while (l_FirstRow <= d_MaxTreatedRow) {
  42. printf("CUDA: FirstRow=%dn", l_FirstRow);
  43. fflush(stdout);
  44.  
  45. myKernel<<<l_BlocksPerGrid, l_ThreadsPerBlock>>>(l_FirstRow);
  46.  
  47. cudaDeviceSynchronize();
  48.  
  49. l_FirstRow += (d_MaxParallelRows * d_RowsResolution);
  50. }
  51.  
  52. printf("CUDA: Donen");
  53.  
  54. return 0;
  55. }
Add Comment
Please, Sign In to add comment