daily pastebin goal
24%
SHARE
TWEET

Untitled

a guest Sep 14th, 2018 49 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. // Very poor memory access performance with CUDA
  2. #include "cuda_runtime.h"
  3. #include <stdio.h>
  4.  
  5. #define d_MaxParallelRows 320
  6. #define d_MinTreatedRow   5
  7. #define d_MaxTreatedRow   915
  8. #define d_RowsResolution  1
  9. #define k_ThreadsPerBlock 64
  10.  
  /*
   * Per-row scratch-buffer kernel.
   *
   * Each thread maps its flat 1D global index to one row:
   *   row = Xi_FirstTreatedRow + index * d_RowsResolution
   * Threads whose index is >= d_MaxParallelRows, or whose row is beyond
   * d_MaxTreatedRow, exit without doing any work.
   *
   * An active thread allocates a private buffer of 1700 floats with the
   * in-kernel malloc(), fills it with 0..1699 once per x in [185, 1407),
   * and frees it again. Nothing is written to memory visible to the host.
   *
   * Launch layout: 1D grid of 1D blocks covering at least d_MaxParallelRows
   * threads. No shared memory is used.
   *
   * Xi_FirstTreatedRow: row handled by the thread with global index 0.
   */
  __global__ void myKernel(int Xi_FirstTreatedRow)
  {
    const int l_ThreadIndex = blockDim.x * blockIdx.x + threadIdx.x;
    if (l_ThreadIndex >= d_MaxParallelRows)
      return;

    const int l_Row = Xi_FirstTreatedRow + (l_ThreadIndex * d_RowsResolution);
    if (l_Row > d_MaxTreatedRow)
      return;

    const int l_BufferLength = 1700;

    //float l_SrcIntegral[1700];
    float* l_SrcIntegral = (float*)malloc(l_BufferLength * sizeof(float));

    // In-kernel malloc() returns NULL when the device heap cannot satisfy the
    // request; writing through that pointer would fault the whole kernel.
    if (l_SrcIntegral == NULL)
      return;

    for (int x = 185; x < 1407; x++) {
      for (int i = 0; i < l_BufferLength; i++)
        l_SrcIntegral[i] = (float)i;
    }

    free(l_SrcIntegral);
  }
  30.  
  /*
   * Host driver.
   *
   * Selects device 0, then walks the rows d_MinTreatedRow..d_MaxTreatedRow in
   * batches of d_MaxParallelRows * d_RowsResolution rows. For every batch it
   * prints the first row, launches myKernel with enough blocks of
   * k_ThreadsPerBlock threads to cover d_MaxParallelRows threads, and waits
   * for the kernel to finish before starting the next batch.
   *
   * Returns 0 on success, 1 if any CUDA call or kernel launch reports an error.
   */
  int main()
  {
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "CUDA: cudaSetDevice failed: %s\n", cudaGetErrorString(cudaStatus));
      return 1;
    }

    int l_ThreadsPerBlock = k_ThreadsPerBlock;
    // Ceiling division so a partial last block is still launched.
    int l_BlocksPerGrid = (d_MaxParallelRows + l_ThreadsPerBlock - 1) / l_ThreadsPerBlock;

    int l_FirstRow = d_MinTreatedRow;
    while (l_FirstRow <= d_MaxTreatedRow) {
      printf("CUDA: FirstRow=%d\n", l_FirstRow);
      fflush(stdout);

      myKernel<<<l_BlocksPerGrid, l_ThreadsPerBlock>>>(l_FirstRow);

      // A kernel launch returns nothing; configuration errors are reported
      // through cudaGetLastError().
      cudaStatus = cudaGetLastError();
      if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA: kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
      }

      // Errors raised while the kernel executes surface at the next
      // synchronizing call.
      cudaStatus = cudaDeviceSynchronize();
      if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA: kernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
      }

      l_FirstRow += (d_MaxParallelRows * d_RowsResolution);
    }

    printf("CUDA: Done\n");

    return 0;
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top