Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define CUB_STDERR
- #include <stdio.h>
- #include "cub/util_allocator.cuh"
- #include "cub/device/device_scan.cuh"
- #include "test/test_util.h"
- #include <sys/time.h>
- using namespace cub;
- bool g_verbose = false; // Whether to display input/output to console
- CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
- typedef int mytype;
- /**
- * Solve inclusive-scan problem
- */
- static void solve(mytype *h_in, mytype *h_cpu, int n)
- {
- mytype inclusive = 0;
- for (int i = 0; i < n; ++i) {
- inclusive += h_in[i];
- h_cpu[i] = inclusive;
- }
- }
- static int compare(mytype *h_cpu, mytype *h_o, int n)
- {
- for (int i = 0; i < n; i++) {
- if (h_cpu[i] != h_o[i]) {
- return i + 1;
- }
- }
- return 0;
- }
- /**
- * Main
- */
- int main(int argc, char** argv)
- {
- cudaSetDevice(0);
- struct timeval start, end;
- int num_items = 1073741824;
- const int repetitions = 5
- mytype *h_in, *h_out, *h_cpu;
- const int size = num_items * sizeof(mytype);
- // Allocate host arrays
- h_in = (mytype *)malloc(size);
- h_out = (mytype *)malloc(size);
- h_cpu = (mytype *)malloc(size);
- // Initialize problem and solution
- for (int i = 0; i < num_items; i++) {
- h_in[i] = i;
- h_out[i] = 0;
- h_cpu[i] = 0;
- }
- solve(h_in, h_cpu, num_items);
- // Allocate problem device arrays
- mytype *d_in = NULL;
- CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
- // Initialize device input
- CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
- // Allocate device output array
- mytype *d_out = NULL;
- CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
- // Allocate temporary storage
- void *d_temp_storage = NULL;
- size_t temp_storage_bytes = 0;
- CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
- CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
- // Run
- gettimeofday(&start, NULL);
- for (long i = 0; i < repetitions; i++)
- DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- cudaThreadSynchronize();
- gettimeofday(&end, NULL);
- double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
- cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
- int cmp = compare(h_cpu, h_out, num_items);
- printf("%d\t", num_items);
- if (!cmp)
- printf("\t%7.4fs \n", ctime);
- printf("\n");
- if (h_in) delete[] h_in;
- if (h_out) delete[] h_out;
- if (h_cpu) delete[] h_cpu;
- if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
- if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
- if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
- printf("\n\n");
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement