Advertisement
Guest User

Untitled

a guest
Feb 16th, 2014
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.67 KB | None | 0 0
  1. $ cat t351.cu
  2. #include <stdio.h>
  3. #define BLOCK_WIDTH 20
  4. #define BLOCK_HEIGHT 20
  5. #define NUM_BLOCKS_X 16
  6. #define NUM_BLOCKS_Y 16
  7. #define NUM_BLOCKS (NUM_BLOCKS_X*NUM_BLOCKS_Y)
  8. #define HANDLE_ERROR(x) x
  9. #define TOL 0.0001
  10.  
  11. #define cudaCheckErrors(msg) \
  12. do { \
  13. cudaError_t __err = cudaGetLastError(); \
  14. if (__err != cudaSuccess) { \
  15. fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
  16. msg, cudaGetErrorString(__err), \
  17. __FILE__, __LINE__); \
  18. fprintf(stderr, "*** FAILED - ABORTING\n"); \
  19. exit(1); \
  20. } \
  21. } while (0)
  22.  
  // Copies the first old_len floats of old_data into new_data, one float per
  // thread. Each thread derives a flat index from its 2-D position inside the
  // block and the block's x position in the grid. new_len is accepted but not
  // used by this kernel.
  __global__ void copy_kernel(float *old_data, float *new_data, int old_len, int new_len){
      const unsigned int in_block = threadIdx.x + (threadIdx.y * blockDim.x);
      const unsigned int block_origin = blockDim.x * blockDim.y * blockIdx.x;
      const int idx = in_block + block_origin;

      // Threads whose index falls past the source range have nothing to copy.
      if (idx >= old_len)
          return;
      new_data[idx] = old_data[idx];
  }
  29.  
  // Writes the value `data` into the first `len` floats of `array`, one float
  // per thread. The flat index combines the thread's 2-D position inside its
  // block with the block's x position in the grid.
  __global__ void test_kernel(float *array, float data, int len){
      const unsigned int in_block = threadIdx.x + (threadIdx.y * blockDim.x);
      const unsigned int block_origin = blockDim.x * blockDim.y * blockIdx.x;
      const int idx = in_block + block_origin;

      // Threads past the end of the array do nothing.
      if (idx >= len)
          return;
      array[idx] = data;
  }
  34.  
  35. void grow_array(float **ptr, int length, int length_new)
  36. {
  37. float *ptr_new;
  38. $ cat t351.cu
  #include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
  40. #define BLOCK_WIDTH 20
  41. #define BLOCK_HEIGHT 20
  42. #define NUM_BLOCKS_X 16
  43. #define NUM_BLOCKS_Y 16
  44. #define NUM_BLOCKS (NUM_BLOCKS_X*NUM_BLOCKS_Y)
  // Evaluates a CUDA runtime call once and aborts with a diagnostic when the
  // call does not return cudaSuccess. Intended to be used as a statement:
  //   HANDLE_ERROR(cudaMalloc(&p, bytes));
  // The diagnostic shows the text of the failing call, the CUDA error string
  // and the file/line of the invocation.
  #define HANDLE_ERROR(x) \
      do { \
          cudaError_t handle_error_status_ = (x); \
          if (handle_error_status_ != cudaSuccess) { \
              fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                      #x, cudaGetErrorString(handle_error_status_), \
                      __FILE__, __LINE__); \
              fprintf(stderr, "*** FAILED - ABORTING\n"); \
              exit(1); \
          } \
      } while (0)
  46. #define TOL 0.0001
  47.  
  // Aborts the program when the CUDA runtime reports a pending error.
  //
  // msg - caller-supplied label printed together with the CUDA error string
  //       and the file/line of the macro invocation.
  //
  // The error is fetched with cudaGetLastError(), so it covers whichever
  // runtime call or kernel launch most recently failed, not one specific call.
  // The local variable avoids a leading double underscore because such
  // identifiers are reserved for the implementation.
  #define cudaCheckErrors(msg) \
      do { \
          cudaError_t cuda_check_status_ = cudaGetLastError(); \
          if (cuda_check_status_ != cudaSuccess) { \
              fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                      msg, cudaGetErrorString(cuda_check_status_), \
                      __FILE__, __LINE__); \
              fprintf(stderr, "*** FAILED - ABORTING\n"); \
              exit(1); \
          } \
      } while (0)
  59.  
  // Copies the leading blocks of old_data into new_data, one float per thread.
  //
  // Launch layout: 1-D grid of 2-D thread blocks. Each thread handles the
  // float at flat index
  //   threadIdx.x + threadIdx.y*blockDim.x + blockDim.x*blockDim.y*blockIdx.x
  //
  // old_len - size of old_data, counted in blocks of blockDim.x*blockDim.y floats
  // new_len - size of new_data, counted in the same unit
  //
  // Only elements that lie inside both arrays are copied, so a grid larger
  // than either array does not touch memory past its stated length.
  __global__ void copy_kernel(float *old_data, float *new_data, int old_len, int new_len){
      const int shared_blocks = (old_len < new_len) ? old_len : new_len;
      if (shared_blocks <= 0)
          return;   // nothing to copy (also rejects negative lengths)

      // 64-bit index math so that very large grids cannot wrap around.
      const size_t floats_per_block = (size_t)blockDim.x * blockDim.y;
      const size_t idx = threadIdx.x
                       + (size_t)threadIdx.y * blockDim.x
                       + floats_per_block * blockIdx.x;

      if (idx < (size_t)shared_blocks * floats_per_block)
          new_data[idx] = old_data[idx];
  }
  66.  
  // Writes the value `data` into the first `len` floats of `array`, one float
  // per thread. The flat index combines the thread's 2-D position inside its
  // block with the block's x position in the grid.
  __global__ void test_kernel(float *array, float data, int len){
      const unsigned int in_block = threadIdx.x + (threadIdx.y * blockDim.x);
      const unsigned int block_origin = blockDim.x * blockDim.y * blockIdx.x;
      const int idx = in_block + block_origin;

      // Threads past the end of the array do nothing.
      if (idx >= len)
          return;
      array[idx] = data;
  }
  71.  
  // Replaces the device array *ptr with a newly allocated one sized for
  // length_new blocks and copies the existing contents across.
  //
  // ptr        - in/out: address of the device pointer. On return it refers to
  //              the new allocation; the old allocation has been freed.
  // length     - number of blocks held by the old array.
  // length_new - number of blocks the new array must hold (expected to be
  //              non-negative).
  //
  // One block is (BLOCK_WIDTH-4)*(BLOCK_HEIGHT-4) floats. Blocks of the new
  // array beyond the copied range are left uninitialized. Any CUDA failure
  // aborts the program through cudaCheckErrors, and the copy is confirmed
  // complete before the old array is released.
  void grow_array(float **ptr, int length, int length_new)
  {
      float *ptr_new = NULL;

      // size_t arithmetic: the byte count can exceed the range of int.
      const size_t floats_per_block = (size_t)(BLOCK_WIDTH - 4) * (BLOCK_HEIGHT - 4);
      const size_t bytes = (size_t)length_new * floats_per_block * sizeof(float);

      HANDLE_ERROR(cudaMalloc(&ptr_new, bytes));
      cudaCheckErrors("grow_array: cudaMalloc fail");

      // Only blocks present in both arrays carry data to copy. A grid of zero
      // blocks is an invalid launch configuration, so the launch is skipped.
      const int blocks_to_copy = (length < length_new) ? length : length_new;
      if (blocks_to_copy > 0) {
          dim3 threads(BLOCK_WIDTH - 4, BLOCK_HEIGHT - 4);
          dim3 blocks(blocks_to_copy);
          copy_kernel<<<blocks, threads>>>(*ptr, ptr_new, length, length_new);
          cudaCheckErrors("grow_array: copy_kernel launch fail");

          // Wait here so an execution error is reported while the source
          // array still exists.
          HANDLE_ERROR(cudaDeviceSynchronize());
          cudaCheckErrors("grow_array: copy_kernel execution fail");
      }

      HANDLE_ERROR(cudaFree(*ptr));
      cudaCheckErrors("grow_array: cudaFree fail");
      *ptr = ptr_new;
  }
  88.  
  // Grows the device array *h_dev when more than half of its blocks are in use.
  //
  // blocks_available - in/out: capacity of *h_dev in blocks. Updated to the new
  //                    capacity whenever the array is grown, so the caller's
  //                    bookkeeping stays in step with the allocation.
  // blocks_used      - number of blocks currently occupied.
  // h_dev            - address of the device pointer, forwarded to grow_array.
  void memory_manager(int &blocks_available, int blocks_used, float** h_dev)
  {
      // Integer form of blocks_used / blocks_available > 0.5. It avoids the
      // division, so a capacity of zero is handled without producing inf/NaN.
      const bool over_half = 2LL * blocks_used > (long long)blocks_available;
      if (!over_half)
          return;

      // Grow by a factor of 1.5 (rounded down), but always by at least one
      // block so that tiny capacities (0 or 1) still make progress.
      int new_length = blocks_available + blocks_available / 2;
      if (new_length <= blocks_available)
          new_length = blocks_available + 1;

      grow_array(h_dev, blocks_available, new_length);
      blocks_available = new_length;
  }
  98.  
  // Checks that the first `count` floats of `data` are within TOL of
  // `expected`. On the first mismatch it prints a message tagged with `pass`
  // and returns 1; returns 0 when every value matches.
  static int check_values(const float *data, int count, float expected, int pass)
  {
      for (int i = 0; i < count; i++) {
          // fabsf keeps the comparison in floating point; a bare abs() can
          // resolve to the integer overload and truncate the difference to 0.
          if (fabsf(data[i] - expected) > TOL) {
              printf("mismatch %d at %d, %f\n", pass, i, data[i]);
              return 1;
          }
      }
      return 0;
  }

  // Test driver: fills a device array with 0.1, grows it through
  // memory_manager, verifies the old contents survived, then fills the larger
  // array with 0.2 and verifies that as well.
  // Returns 0 and prints "Success" when both checks pass, 1 otherwise.
  int main(){

      float *my_data = NULL, *host_data = NULL;
      const int floats_per_block = (BLOCK_WIDTH-4) * (BLOCK_HEIGHT-4);
      const int dsize = floats_per_block * NUM_BLOCKS;

      cudaMalloc((void **)&my_data, dsize*sizeof(float));
      cudaMemset(my_data, 0, dsize*sizeof(float));
      cudaCheckErrors("cudaMalloc/cudaMemset fail");

      dim3 threads(BLOCK_WIDTH-4,BLOCK_HEIGHT-4);
      dim3 blocks(NUM_BLOCKS);
      test_kernel<<<blocks, threads>>>(my_data, 0.1f, dsize);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel 1 fail");

      // More than half of the blocks are reported as used, so this call
      // replaces my_data with a larger allocation.
      int test_blocks = NUM_BLOCKS;
      memory_manager(test_blocks, NUM_BLOCKS, &my_data);
      cudaCheckErrors("memory manager fail");

      const int new_block_count = (NUM_BLOCKS_X+1)*(NUM_BLOCKS_Y+1);
      dim3 new_blocks(new_block_count);
      const int new_size = new_block_count * floats_per_block;

      host_data = (float *)malloc(new_size * sizeof(float));
      if (host_data == NULL) {
          fprintf(stderr, "host malloc fail\n");
          cudaFree(my_data);
          return 1;
      }

      // Pass 1: the data written before the grow must still be there.
      cudaMemcpy(host_data, my_data, dsize*sizeof(float), cudaMemcpyDeviceToHost);
      cudaCheckErrors("cudaMemcpy 1 fail");
      int status = check_values(host_data, dsize, 0.1f, 1);

      if (status == 0) {
          // Pass 2: the grown array must be writable across the larger range.
          test_kernel<<<new_blocks, threads>>>(my_data, 0.2f, new_size);
          cudaDeviceSynchronize();
          cudaCheckErrors("kernel 2 fail");
          cudaMemcpy(host_data, my_data, new_size*sizeof(float), cudaMemcpyDeviceToHost);
          cudaCheckErrors("cudaMemcpy 2 fail");
          status = check_values(host_data, new_size, 0.2f, 2);
      }

      if (status == 0)
          printf("Success\n");

      // Release host and device buffers on every path, including mismatches.
      free(host_data);
      cudaFree(my_data);
      cudaCheckErrors("cudaFree fail");
      return status;
  }
  132. $ nvcc -O3 -arch=sm_20 -o t351 t351.cu
  133. $ cuda-memcheck ./t351
  134. ========= CUDA-MEMCHECK
  135. Success
  136. ========= ERROR SUMMARY: 0 errors
  137. $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement