Advertisement
Guest User

Untitled

a guest
Jun 15th, 2013
238
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.57 KB | None | 0 0
  1. // main.h file
  2. #ifndef ODINN_CUDA_MAIN_H
  3. #define ODINN_CUDA_MAIN_H
  4.  
  5. #define ARR_SIZE 100
  6. #define ITER_AMOUNT 1
  7.  
  8. typedef enum cudaError cudaError_t;
  9.  
  // Abort the program with a diagnostic when a CUDA runtime call has failed.
  //
  //   err  - status code returned by a CUDA runtime API call.
  //   file - source file of the call site (normally __FILE__).
  //   line - source line of the call site (normally __LINE__).
  //
  // Returns normally only when err is cudaSuccess. Otherwise the runtime's
  // error string is written to stderr and the process exits with EXIT_FAILURE.
  static void HandleError(cudaError_t err, const char *file, int line) {
      // cudaSuccess is the runtime API's success value; CUDA_SUCCESS belongs
      // to the driver API's CUresult enum and is a different type.
      if (err != cudaSuccess) {
          fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
          exit(EXIT_FAILURE);
      }
  }

  // Check the status of a CUDA runtime call and report the call site on failure.
  #define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))
  18.  
  // Store the current clock() stamp in var. clock() reports processor time
  // used by the program, not wall-clock time.
  #define GET_CURRENT_CLOCKS(var) ((var) = clock())

  // Store in result the number of seconds (as a double) between two clock()
  // stamps. Every argument is parenthesised so that arbitrary expressions
  // (for example a conditional) can be passed without changing precedence.
  #define GET_CLOCK_INTERVAL_SEC(start, end, result) \
      ((result) = (((double)(end) - (double)(start)) / (double)CLOCKS_PER_SEC))
  21.  
  // Build a thread-block dimension from the requested sizes, limiting each
  // component to the device maximum stored in the file-level `prop`.
  // y and z default to 1: a dim3 with a zero component is not a valid launch
  // configuration, so a 1-D request must still have extent 1 in y and z.
  __host__ dim3 requestBlockSize(int x, int y=1, int z=1);

  // Build a grid dimension from the requested block counts.
  // y and z default to 1 for the same reason as in requestBlockSize.
  __host__ dim3 requestNumBlocks(int x, int y=1, int z=1);

  // Allocate three host arrays (malloc) and three device arrays (cudaMalloc)
  // of ARR_SIZE unsigned ints each, returning them through the out-pointers,
  // and fill *a_host and *b_host with random values in [0, ARR_SIZE).
  __host__ void allocateVectors(unsigned int **a_host, unsigned int **b_host, unsigned int **c_host, unsigned int **a_device, unsigned int **b_device, unsigned int **c_device);

  // Kernel: result[i] = a[i] + b[i] for indices i below n. Only the x
  // components of the launch configuration are used for indexing.
  __global__ void addVectors(unsigned int* a, unsigned int* b, unsigned int* result, int n);

  // Release the three host arrays with free() and the three device arrays
  // with cudaFree().
  __host__ void cleanUp(unsigned int *a_host, unsigned int *b_host, unsigned int *c_host, unsigned int *a_device, unsigned int *b_device, unsigned int *c_device);
  27.  
  28. #endif
  29. // End of main.h file
  30.  
  31. // kernel.cu file
  32. #include <cuda.h>
  33. #include <stdio.h>
  34. #include <stdio.h>
  35. #include <stdlib.h>
  36. #include <time.h>
  37. #include <math.h>
  38.  
  39. #include "main.h"
  40.  
  41. static cudaDeviceProp prop;
  42.  
  // Entry point. Adds two random vectors of ARR_SIZE elements on device 0,
  // times ITER_AMOUNT kernel launches and prints the first element of the
  // operands and of the result. Any CUDA failure terminates the program via
  // HANDLE_ERROR.
  int main(void) {
      // Start lazy init now so the first cudaMalloc will run faster.
      HANDLE_ERROR(cudaSetDevice(0));
      HANDLE_ERROR(cudaFree(0));

      unsigned int *a_host, *b_host, *c_host;
      unsigned int *a_device, *b_device, *c_device;
      double delta_in_sec;
      size_t size = sizeof(unsigned int) * ARR_SIZE;
      clock_t start_clock, end_clock;

      HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));

      // y and z are passed explicitly: a dim3 with a zero component is
      // rejected at launch time.
      dim3 block_size = requestBlockSize(1024, 1, 1);

      // Ceiling division, so a last, partially filled block covers the tail
      // of the array when ARR_SIZE is not a multiple of the block size.
      int blocks_requested = (int)((ARR_SIZE + block_size.x - 1) / block_size.x);
      dim3 n_blocks = requestNumBlocks(blocks_requested > 0 ? blocks_requested : 1, 1, 1);

      fprintf(stdout, "Allocating vectors ...\n");
      allocateVectors(&a_host, &b_host, &c_host, &a_device, &b_device, &c_device);

      fprintf(stdout, "Copying to device ...\n");
      HANDLE_ERROR(cudaMemcpy(a_device, a_host, size, cudaMemcpyHostToDevice));
      HANDLE_ERROR(cudaMemcpy(b_device, b_host, size, cudaMemcpyHostToDevice));

      fprintf(stdout, "Running kernel ...\n");
      GET_CURRENT_CLOCKS(start_clock);

      for (int i = 0; i < ITER_AMOUNT; i++) {
          addVectors<<<n_blocks, block_size>>>(a_device, b_device, c_device, ARR_SIZE);
          // A launch returns no status itself; configuration errors are only
          // visible through cudaGetLastError().
          HANDLE_ERROR(cudaGetLastError());
      }

      // Launches are asynchronous: wait for the device so the measured
      // interval covers kernel execution and not just the launch calls.
      HANDLE_ERROR(cudaDeviceSynchronize());

      GET_CURRENT_CLOCKS(end_clock);
      GET_CLOCK_INTERVAL_SEC(start_clock, end_clock, delta_in_sec);

      fprintf(stdout, "Runtime of kernel %d times on arrays in length %d took %f seconds\n"
          "Copying results back to host ...\n", ITER_AMOUNT, ARR_SIZE, delta_in_sec);

      HANDLE_ERROR(cudaMemcpy(c_host, c_device, size, cudaMemcpyDeviceToHost));
      // Show the first element as a quick sanity check of the addition.
      fprintf(stdout, "%u + %u = %u\n", a_host[0], b_host[0], c_host[0]);

      fprintf(stdout, "Cleaning up ...\n");
      cleanUp(a_host, b_host, c_host, a_device, b_device, c_device);

      fprintf(stdout, "Done!\n");
      return 0;
  }
  88.  
  // Clamp one requested block dimension to the range [1, limit].
  static int clampBlockDim(int requested, int limit) {
      // A limit below 1 (e.g. `prop` has not been filled in yet) still has to
      // yield a usable extent of 1.
      if (limit < 1)
          limit = 1;
      if (requested < 1)
          return 1;
      return requested <= limit ? requested : limit;
  }

  // Build a thread-block dimension from the requested sizes.
  //
  //   x, y, z - requested number of threads along each axis.
  //
  // Each component is limited to prop.maxThreadsDim[] for its axis and is
  // never smaller than 1, because a dim3 with a zero component is an invalid
  // launch configuration. Expects `prop` to have been filled in by
  // cudaGetDeviceProperties beforehand.
  // NOTE(review): the product x*y*z is not checked against
  // prop.maxThreadsPerBlock - confirm callers only raise one axis.
  __host__ dim3 requestBlockSize(int x, int y, int z) {
      dim3 blocksize(
          clampBlockDim(x, prop.maxThreadsDim[0]),
          clampBlockDim(y, prop.maxThreadsDim[1]),
          clampBlockDim(z, prop.maxThreadsDim[2])
      );

      return blocksize;
  }
  98.  
  // Build a grid dimension from the requested block counts.
  //
  //   x, y, z - requested number of blocks along each axis.
  //
  // Components below 1 are raised to 1: a grid with a zero component is an
  // invalid launch configuration, and a negative int would wrap to a huge
  // unsigned value inside dim3. Upper device limits are not applied here.
  __host__ dim3 requestNumBlocks(int x, int y, int z) {
      dim3 numblocks(
          x > 0 ? x : 1,
          y > 0 ? y : 1,
          z > 0 ? z : 1
      );

      return numblocks;
  }
  104.  
  // Allocate and initialise the vectors used by the addition.
  //
  //   a_host, b_host, c_host       - receive host arrays obtained with malloc.
  //   a_device, b_device, c_device - receive device arrays obtained with cudaMalloc.
  //
  // Every array holds ARR_SIZE unsigned ints. *a_host and *b_host are filled
  // with pseudo-random values in [0, ARR_SIZE); *c_host and the device arrays
  // are left uninitialised. A failed host allocation prints a message to
  // stderr and exits with EXIT_FAILURE; a failed device allocation terminates
  // through HANDLE_ERROR.
  __host__ void allocateVectors(unsigned int **a_host, unsigned int **b_host, unsigned int **c_host, unsigned int **a_device, unsigned int **b_device, unsigned int **c_device) {
      size_t size = sizeof(unsigned int) * ARR_SIZE;

      *a_host = (unsigned int *)malloc(size);
      *b_host = (unsigned int *)malloc(size);
      *c_host = (unsigned int *)malloc(size);

      // malloc reports failure by returning NULL; writing through it below
      // would be undefined behaviour, so stop here instead.
      if (*a_host == NULL || *b_host == NULL || *c_host == NULL) {
          fprintf(stderr, "Failed to allocate %lu bytes of host memory\n", (unsigned long)size);
          exit(EXIT_FAILURE);
      }

      HANDLE_ERROR(cudaMalloc((void **)a_device, size));
      HANDLE_ERROR(cudaMalloc((void **)b_device, size));
      HANDLE_ERROR(cudaMalloc((void **)c_device, size));

      // Seed from the current time so each run adds different vectors.
      srand((unsigned int)time(NULL));

      for (int i = 0; i < ARR_SIZE; i++) {
          (*a_host)[i] = rand() % ARR_SIZE;
          (*b_host)[i] = rand() % ARR_SIZE;
      }
  }
  123.  
  // Element-wise vector addition: result[i] = a[i] + b[i] for 0 <= i < n.
  //
  //   a, b   - input arrays of at least n elements (device pointers).
  //   result - output array of at least n elements (device pointer).
  //   n      - number of elements; values <= 0 make the kernel a no-op.
  //
  // Launch layout: 1-D indexing, only the x components of the grid and block
  // are used. Each thread starts at its global x index and advances by the
  // total number of x threads in the grid, so every element is processed even
  // when the grid is smaller than n. Uses no shared memory.
  __global__ void addVectors(unsigned int* a, unsigned int* b, unsigned int* result, int n) {
      if (n <= 0)
          return;

      // size_t arithmetic keeps blockIdx.x * blockDim.x from overflowing int.
      const size_t count  = (size_t)n;
      const size_t stride = (size_t)gridDim.x * blockDim.x;

      for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += stride)
          result[idx] = a[idx] + b[idx];
  }
  130.  
  // Release all six buffers.
  //
  //   a_host, b_host, c_host       - host arrays, released with free().
  //   a_device, b_device, c_device - device arrays, released with cudaFree().
  //
  // The host arrays are released first. A failing cudaFree terminates the
  // program through HANDLE_ERROR.
  __host__ void cleanUp(unsigned int *a_host, unsigned int *b_host, unsigned int *c_host, unsigned int *a_device, unsigned int *b_device, unsigned int *c_device) {
      // Host side: walk the three pointers in a, b, c order.
      unsigned int *host_buffers[] = { a_host, b_host, c_host };
      const size_t host_count = sizeof(host_buffers) / sizeof(host_buffers[0]);
      for (size_t k = 0; k < host_count; ++k) {
          free(host_buffers[k]);
      }

      // Device side: one checked call per buffer so a failure is reported
      // with the line of the buffer that caused it.
      HANDLE_ERROR( cudaFree(a_device) );
      HANDLE_ERROR( cudaFree(b_device) );
      HANDLE_ERROR( cudaFree(c_device) );
  }
  140. // End of kernel.cu file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement