Advertisement
mmilakov

Untitled

Feb 16th, 2015
941
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.55 KB | None | 0 0
  1. #include <iostream>
  2. #include <vector>
  3.  
  4. __global__ void kernel2(float * buf)
  5. {
  6.     int tid = blockIdx.x * blockDim.x + threadIdx.x;
  7.  
  8.     float a[5];
  9.     #pragma unroll
  10.     for(int i = 0; i < 5; ++i)
  11.         a[i] = buf[tid + i];
  12.  
  13.     float sum = 0.0F;
  14.     #pragma unroll
  15.     for(int i = 0; i < 5; ++i)
  16.         sum += a[i];
  17.  
  18.     buf[tid] = sum;
  19. }
  20.  
  21. __global__ void kernel3(float * buf, int start_index)
  22. {
  23.     int tid = blockIdx.x * blockDim.x + threadIdx.x;
  24.  
  25.     float a[6];
  26.     #pragma unroll
  27.     for(int i = 0; i < 6; ++i)
  28.         a[i] = buf[tid + i];
  29.  
  30.     float sum = 0.0F;
  31.     #pragma unroll
  32.     for(int i = 0; i < 5; ++i)
  33.         sum += a[start_index + i];
  34.  
  35.     buf[tid] = sum;
  36. }
  37.  
  38. #define ARRAY_SIZE_BASE 32
  39. #define ARRAY_SIZE (ARRAY_SIZE_BASE+3)
  40. #define ITERATIONS 400
  41.  
  42. __global__ void kernel4(float * __restrict buf, const int * __restrict indexbuf)
  43. {
  44.     float a[ARRAY_SIZE];
  45.     int tid = blockIdx.x * blockDim.x + threadIdx.x;
  46.  
  47.     int current_ind = tid & 31;
  48.     int index = indexbuf[current_ind];
  49.  
  50.     #pragma unroll
  51.     for(int i = 0; i < ARRAY_SIZE; ++i)
  52.         a[i] = __ldg(buf + i);
  53.  
  54.     float sum = 0.0F;
  55.     #pragma unroll 10
  56.     for(int i = 0; i < ITERATIONS; ++i)
  57.     {
  58.         index = (index + 1) & 31;
  59.         float x = a[index];
  60.         float y = a[index + 1];
  61.         float z = a[index + 2];
  62.         float w = a[index + 3];
  63.         sum = x + y + z + w;
  64.         a[index] = sum;
  65.     }
  66.  
  67.     buf[tid] = sum;
  68. }
  69.  
  70. #define THREADBLOCK_SIZE 64
  71.  
  72. __device__ __forceinline__ int no_bank_conflict_index(int thread_id, int logical_index)
  73. {
  74.     return logical_index * THREADBLOCK_SIZE + thread_id;
  75. }
  76. __global__ void kernel5(float * __restrict buf, const int * __restrict indexbuf)
  77. {
  78.     __shared__ float A[ARRAY_SIZE * THREADBLOCK_SIZE];
  79.     int tid = blockIdx.x * blockDim.x + threadIdx.x;
  80.  
  81.     int current_ind = tid & 31;
  82.     int index = indexbuf[current_ind];
  83.  
  84.     #pragma unroll
  85.     for(int i = 0; i < ARRAY_SIZE; ++i)
  86.         A[no_bank_conflict_index(threadIdx.x, i)] = __ldg(buf + i);
  87.  
  88.     float sum = 0.0F;
  89.     #pragma unroll 10
  90.     for(int i = 0; i < ITERATIONS; ++i)
  91.     {
  92.         index = (index + 1) & 31;
  93.         float x = A[no_bank_conflict_index(threadIdx.x, index)];
  94.         float y = A[no_bank_conflict_index(threadIdx.x, index + 1)];
  95.         float z = A[no_bank_conflict_index(threadIdx.x, index + 2)];
  96.         float w = A[no_bank_conflict_index(threadIdx.x, index + 3)];
  97.         sum = x + y + z + w;
  98.         A[no_bank_conflict_index(threadIdx.x, index)] = sum;
  99.     }
  100.  
  101.     buf[tid] = sum;
  102. }
  103.  
  104. #define cuda_safe_call(stmt) {cudaError_t err = stmt; if (err != cudaSuccess) {std::cerr << "CUDA error " << err << ": " << cudaGetErrorString(err) << std::endl; exit(-1);}}
  105.  
  106. int main(int argc, char * argv[])
  107. {
  108.     cuda_safe_call(cudaSetDevice(1));
  109.  
  110.     int grid_size = 4096;
  111.     int access_pattern = 0; // 0 - uniform, 1 - 32 replay
  112.  
  113.     float * p_d;
  114.     int * p_i;
  115.     cuda_safe_call(cudaMalloc((void**)&p_d, (THREADBLOCK_SIZE * grid_size + 32) * sizeof(float)));
  116.     cuda_safe_call(cudaMalloc((void**)&p_i, (32 + ITERATIONS) * sizeof(int)));
  117.  
  118.     std::vector<int> indexes(32 + ITERATIONS);
  119.     if (access_pattern == 0)
  120.     {
  121.         for(int i = 0; i < (32 + ITERATIONS); ++i)
  122.             indexes[i] = 0;
  123.     }
  124.     else
  125.     {
  126.         for(int i = 0; i < (32 + ITERATIONS); ++i)
  127.             indexes[i] = i % 32;
  128.     }
  129.     cuda_safe_call(cudaMemcpy(p_i, &indexes[0], indexes.size() * sizeof(int), cudaMemcpyHostToDevice));
  130.  
  131.     //kernel2<<<grid_size,THREADBLOCK_SIZE>>>(p_d);
  132.     //kernel3<<<grid_size,THREADBLOCK_SIZE>>>(p_d, 1);
  133.     cudaFuncSetCacheConfig(kernel4, cudaFuncCachePreferL1);
  134.     kernel4<<<grid_size,THREADBLOCK_SIZE>>>(p_d, p_i);
  135.     //kernel5<<<grid_size,THREADBLOCK_SIZE>>>(p_d, p_i);
  136.  
  137.     cuda_safe_call(cudaFree(p_d));
  138.     cuda_safe_call(cudaFree(p_i));
  139.     cuda_safe_call(cudaDeviceReset());
  140.  
  141.     return 0;
  142. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement