Advertisement
Guest User

CUB::DEVICESCAN

a guest
Oct 9th, 2015
219
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 3.05 KB | None | 0 0
  1. #define CUB_STDERR
  2. #include <stdio.h>
  3. #include "cub/util_allocator.cuh"
  4. #include "cub/device/device_scan.cuh"
  5. #include "test/test_util.h"
  6. #include <sys/time.h>
  7. using namespace cub;
  8.  
  9. bool                    g_verbose = false;  // Whether to display input/output to console (not referenced elsewhere in the visible code)
  10. CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory; main() obtains and releases every device buffer through it. NOTE(review): meaning of the `true` argument is not shown here - confirm against cub/util_allocator.cuh
  11. typedef int mytype;  // Element type of the host and device arrays that are scanned and compared
  12.  
  /**
   * CPU reference for the inclusive scan (running sum).
   *
   * For every i in [0, n), writes h_in[0] + ... + h_in[i] to h_cpu[i].
   * Nothing is written when n <= 0.
   *
   * h_in   input values (at least n elements)
   * h_cpu  output buffer for the running sums (at least n elements)
   * n      number of elements to process
   */
  static void solve(mytype *h_in, mytype *h_cpu, int n)
  {
      mytype running = 0;
      const mytype *src = h_in;
      mytype *dst = h_cpu;

      // Walk both arrays in lock-step: read one input, fold it into the
      // running total, then store the total at the matching output slot.
      for (int remaining = n; remaining > 0; --remaining) {
          running += *src++;
          *dst++ = running;
      }
  }
  /**
   * Element-wise comparison of two arrays of length n.
   *
   * Returns 0 when all n elements are equal; otherwise returns the 1-based
   * position (index + 1) of the first element that differs.
   *
   * h_cpu  first array (the CPU reference in main())
   * h_o    second array (the result copied back from the device in main())
   * n      number of elements to compare
   */
  static int compare(mytype *h_cpu, mytype *h_o, int n)
  {
      int first_bad = 0;  // 0 means "no mismatch seen so far"

      // Stop as soon as a mismatch has been recorded.
      for (int idx = 0; idx < n && first_bad == 0; ++idx) {
          if (h_cpu[idx] != h_o[idx])
              first_bad = idx + 1;
      }
      return first_bad;
  }
  34.  
  /**
   * Main: times cub::DeviceScan::InclusiveSum on num_items elements and checks
   * the device result against the CPU reference computed by solve().
   *
   * stdout: the element count, followed - only when the device result matches
   * the reference - by the wall-clock time of all repetitions in seconds.
   * stderr: a diagnostic on host allocation failure or on a result mismatch
   * (CUDA/CUB errors are reported and abort the program via CubDebugExit).
   *
   * Returns 0 on success, 1 on host allocation failure or result mismatch.
   */
  int main(int argc, char** argv)
  {
      (void)argc;
      (void)argv;

      CubDebugExit(cudaSetDevice(0));

      struct timeval start, end;
      const int num_items = 1073741824;
      const int repetitions = 5;

      // The byte count must be computed in size_t: with a 4-byte mytype it is
      // 2^32, which does not fit in an int.
      const size_t size = (size_t)num_items * sizeof(mytype);

      // Allocate host arrays
      mytype *h_in  = (mytype *)malloc(size);
      mytype *h_out = (mytype *)malloc(size);
      mytype *h_cpu = (mytype *)malloc(size);
      if (!h_in || !h_out || !h_cpu) {
          fprintf(stderr, "Host allocation of 3 x %llu bytes failed\n",
                  (unsigned long long)size);
          free(h_in);   // free(NULL) is a no-op
          free(h_out);
          free(h_cpu);
          return 1;
      }

      // Initialize problem and solution
      for (int i = 0; i < num_items; i++) {
          h_in[i] = i;
          h_out[i] = 0;
          h_cpu[i] = 0;
      }

      // NOTE(review): with h_in[i] = i and a 32-bit signed mytype the running
      // sum exceeds INT_MAX after roughly 65 thousand elements, so the CPU
      // reference relies on signed wrap-around (undefined behaviour in C++).
      // Consider an unsigned element type if exact portability matters.
      solve(h_in, h_cpu, num_items);

      // Allocate problem device arrays
      mytype *d_in = NULL;
      CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, size));

      // Initialize device input
      CubDebugExit(cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice));

      // Allocate device output array
      mytype *d_out = NULL;
      CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, size));

      // Allocate temporary storage: the first call, made with a NULL buffer,
      // only reports the required size in temp_storage_bytes.
      void   *d_temp_storage = NULL;
      size_t temp_storage_bytes = 0;
      CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
      CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

      // Run
      gettimeofday(&start, NULL);
      for (int rep = 0; rep < repetitions; rep++)
          CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
      // Wait for all queued device work so the host timer covers the whole
      // execution; this also surfaces asynchronous execution errors.
      CubDebugExit(cudaDeviceSynchronize());
      gettimeofday(&end, NULL);
      const double elapsed = end.tv_sec + end.tv_usec / 1000000.0
                           - start.tv_sec - start.tv_usec / 1000000.0;

      // Copy the result back and verify it against the CPU reference
      CubDebugExit(cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost));
      const int cmp = compare(h_cpu, h_out, num_items);

      printf("%d\t", num_items);
      if (!cmp)
          printf("\t%7.4fs \n", elapsed);
      else
          fprintf(stderr, "Result mismatch at element %d\n", cmp - 1);
      printf("\n");

      // Cleanup: host buffers came from malloc(), so release them with free()
      free(h_in);
      free(h_out);
      free(h_cpu);
      if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
      if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
      if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

      printf("\n\n");

      return cmp ? 1 : 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement