Guest User

CUDA bandwidth test

a guest
Sep 24th, 2015
237
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 3.68 KB | None | 0 0
  1. #include <stdio.h>
  2. #include <time.h>
  3. #include <stdlib.h>
  4. #include <stdint.h>
  5.  
  6. #include "cuda_runtime.h"
  7. #include "curand_kernel.h"
  8. #include "device_launch_parameters.h"
  9.  
  10. #define GRID_SIZE  8192
  11. #define BLOCK_SIZE 256
  12. #define MEGABYTE (1024 * 1024)
  13. #define THREADS_PER_HASH 8
  14. #define ITERATIONS 16
  15. #define FNV_PRIME   0x01000193
  16.  
  17. #define fnv(x,y) ((x) * FNV_PRIME ^(y))
  18. #define random() (rand() * rand())
  19.  
  20. #define CUDA_SAFE_CALL(call)                                          \
  21. do {                                                                  \
  22.     cudaError_t err = call;                                           \
  23.     if (cudaSuccess != err) {                                         \
  24.         fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
  25.                  __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
  26.         exit(EXIT_FAILURE);                                           \
  27.         }                                                             \
  28. } while (0)
  29.  
  30. typedef union
  31. {
  32.     uint32_t uint32s[128 / sizeof(uint32_t)];
  33.     uint4    uint4s[128 / sizeof(uint4)];
  34. } hash128_t;
  35.  
  36. __constant__ hash128_t * d_buffer;
  37. __constant__ unsigned int d_buffer_size;
  38.  
  39. __device__ uint32_t fnv_reduce(uint4 v)
  40. {
  41.     return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
  42. }
  43.  
  44. __global__ void test(int search, volatile unsigned int * num_results)
  45. {
  46.     const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
  47.  
  48.     const int thread_id = threadIdx.x &  (THREADS_PER_HASH - 1);
  49.     const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
  50.  
  51.     unsigned int s = gid;
  52.     unsigned int r;
  53.  
  54.     for (int i = 0; i < THREADS_PER_HASH; i++) {
  55.        
  56.         for (int j = 0; j < ITERATIONS; j++) {
  57.             unsigned int index = __shfl(s, start_lane + i)  % d_buffer_size;
  58.             uint4 v = d_buffer[index].uint4s[thread_id];
  59.             s = fnv_reduce(v);
  60.         }
  61.        
  62.         s = __shfl(s, start_lane + i);
  63.         if (i == thread_id) {
  64.             r = s;
  65.         }
  66.     }
  67.  
  68.     if (search == r) {
  69.         atomicInc(const_cast<unsigned int *>(num_results), UINT_MAX);
  70.         __threadfence_system();
  71.     }
  72. }
  73.  
  74. int main(int argc, char *argv[])
  75. {
  76.     unsigned int buffer_size;
  77.  
  78.     if (argc == 2)
  79.         buffer_size = atoi(argv[1]) * MEGABYTE;
  80.     else
  81.         buffer_size = 1024 * MEGABYTE;
  82.  
  83.     unsigned int * buffer = (unsigned int *)malloc(buffer_size);
  84.    
  85.     printf("Creating buffer of size %u bytes...\n", buffer_size);
  86.     srand(time(NULL));
  87.    
  88.     for (unsigned int i = 0; i < buffer_size / 4; i++) {
  89.         buffer[i] = random();
  90.     }
  91.    
  92.     hash128_t * h_buffer;
  93.     volatile unsigned int * num_results;
  94.     unsigned int h_buffer_size = buffer_size / sizeof(hash128_t);
  95.    
  96.  
  97.     CUDA_SAFE_CALL(cudaSetDevice(0));
  98.     CUDA_SAFE_CALL(cudaMallocHost((void**)&num_results, sizeof(unsigned int)));
  99.     CUDA_SAFE_CALL(cudaMalloc((void**)&h_buffer, buffer_size));
  100.     CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(h_buffer), buffer, buffer_size, cudaMemcpyHostToDevice));
  101.     CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer, &h_buffer, sizeof(hash128_t *)));
  102.     CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer_size, &h_buffer_size, sizeof(unsigned int)));
  103.  
  104.     num_results[0] = 0;
  105.  
  106.     unsigned int target;
  107.     target = random();
  108.  
  109.     cudaEvent_t start, stop;
  110.     CUDA_SAFE_CALL(cudaEventCreate(&start));
  111.     CUDA_SAFE_CALL(cudaEventCreate(&stop));
  112.     cudaEventRecord(start, nullptr);
  113.     int count = 16;
  114.     for (int i = 0; i < count; i++) {
  115.         test << <GRID_SIZE, BLOCK_SIZE >> >(target, num_results);
  116.         CUDA_SAFE_CALL(cudaGetLastError());
  117.         CUDA_SAFE_CALL(cudaDeviceSynchronize());
  118.     }
  119.     cudaEventRecord(stop, nullptr);
  120.     cudaFree(h_buffer);
  121.  
  122.     float duration;
  123.     cudaEventElapsedTime(&duration, start, stop);
  124.     printf("%f GB/s\n", (1000.0f / duration) * count * sizeof(uint4) * THREADS_PER_HASH * ITERATIONS * GRID_SIZE * BLOCK_SIZE / static_cast<float>(1 << 30));
  125.    
  126.     return num_results[0];
  127. }
Advertisement
Add Comment
Please, Sign In to add comment