Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- $ cat t351.cu
- #include <stdio.h>
- // Tile geometry: kernels are launched with (BLOCK_WIDTH-4) x (BLOCK_HEIGHT-4) threads per block.
- #define BLOCK_WIDTH 20
- #define BLOCK_HEIGHT 20
- // Number of blocks in the initial allocation (NUM_BLOCKS_X * NUM_BLOCKS_Y).
- #define NUM_BLOCKS_X 16
- #define NUM_BLOCKS_Y 16
- #define NUM_BLOCKS (NUM_BLOCKS_X*NUM_BLOCKS_Y)
- // Absolute tolerance used when comparing floats copied back from the device.
- #define TOL 0.0001
- // Evaluates a CUDA runtime call exactly once and aborts with a diagnostic if it
- // did not return cudaSuccess. Previously this expanded to the bare call, so
- // every wrapped failure was silently ignored.
- #define HANDLE_ERROR(x) \
-     do { \
-         cudaError_t handle_err_ = (x); \
-         if (handle_err_ != cudaSuccess) { \
-             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
-                     #x, cudaGetErrorString(handle_err_), \
-                     __FILE__, __LINE__); \
-             fprintf(stderr, "*** FAILED - ABORTING\n"); \
-             exit(1); \
-         } \
-     } while (0)
- // Reads (and clears) the last CUDA runtime error; if one is pending, prints
- // msg together with the CUDA error string and source location, then exits.
- // Place it after a runtime call or after a kernel launch + synchronize.
- #define cudaCheckErrors(msg) \
-     do { \
-         cudaError_t __err = cudaGetLastError(); \
-         if (__err != cudaSuccess) { \
-             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
-                     msg, cudaGetErrorString(__err), \
-                     __FILE__, __LINE__); \
-             fprintf(stderr, "*** FAILED - ABORTING\n"); \
-             exit(1); \
-         } \
-     } while (0)
- // Copies the old buffer into the new buffer, one float per thread.
- // Launch layout: 1D grid of 2D blocks; block b owns the blockDim.x*blockDim.y
- // consecutive floats starting at b*blockDim.x*blockDim.y.
- // old_len / new_len are the buffer sizes measured in blocks (that is how
- // grow_array passes them), not in elements.
- __global__ void copy_kernel(float *old_data, float *new_data, int old_len, int new_len){
-     const size_t threads_per_block = (size_t)blockDim.x*blockDim.y;
-     const size_t idx = (size_t)blockIdx.x*threads_per_block + threadIdx.y*blockDim.x + threadIdx.x;
-     // Only blocks present in BOTH buffers are copied: this keeps reads inside
-     // the old buffer and writes inside the new one. Comparing block indices
-     // (instead of idx against an element count) also avoids the earlier bug of
-     // treating a block count as an element count.
-     const int copy_blocks = (old_len < new_len) ? old_len : new_len;
-     if ((int)blockIdx.x < copy_blocks)
-         new_data[idx] = old_data[idx];
- }
- // Fills array[0 .. len-1] with the value data, one element per thread.
- // Launch layout: 1D grid of 2D blocks; threads beyond len do nothing.
- __global__ void test_kernel(float *array, float data, int len){
-     const unsigned int lane = threadIdx.x + (threadIdx.y*blockDim.x);
-     const unsigned int block_base = blockDim.x*blockDim.y*blockIdx.x;
-     const int idx = lane + block_base;
-     if (idx >= len) return;   // tail threads past the end of the array
-     array[idx] = data;
- }
- void grow_array(float **ptr, int length, int length_new)
- {
- float *ptr_new;
- $ cat t351.cu
- #include <math.h>
- #include <stdio.h>
- #include <stdlib.h>
- // Tile geometry: kernels are launched with (BLOCK_WIDTH-4) x (BLOCK_HEIGHT-4) threads per block.
- #define BLOCK_WIDTH 20
- #define BLOCK_HEIGHT 20
- // Number of blocks in the initial allocation (NUM_BLOCKS_X * NUM_BLOCKS_Y).
- #define NUM_BLOCKS_X 16
- #define NUM_BLOCKS_Y 16
- #define NUM_BLOCKS (NUM_BLOCKS_X*NUM_BLOCKS_Y)
- // Absolute tolerance used when comparing floats copied back from the device.
- #define TOL 0.0001
- // Evaluates a CUDA runtime call exactly once and aborts with a diagnostic if it
- // did not return cudaSuccess. Previously this expanded to the bare call, so
- // every wrapped failure (e.g. in grow_array) was silently ignored.
- #define HANDLE_ERROR(x) \
-     do { \
-         cudaError_t handle_err_ = (x); \
-         if (handle_err_ != cudaSuccess) { \
-             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
-                     #x, cudaGetErrorString(handle_err_), \
-                     __FILE__, __LINE__); \
-             fprintf(stderr, "*** FAILED - ABORTING\n"); \
-             exit(1); \
-         } \
-     } while (0)
- // Reads (and clears) the last CUDA runtime error; if one is pending, prints
- // msg together with the CUDA error string and source location, then exits.
- // Place it after a runtime call or after a kernel launch + synchronize.
- #define cudaCheckErrors(msg) \
-     do { \
-         cudaError_t __err = cudaGetLastError(); \
-         if (__err != cudaSuccess) { \
-             fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
-                     msg, cudaGetErrorString(__err), \
-                     __FILE__, __LINE__); \
-             fprintf(stderr, "*** FAILED - ABORTING\n"); \
-             exit(1); \
-         } \
-     } while (0)
- // Copies the old buffer into the new buffer, one float per thread.
- // Launch layout: 1D grid of 2D blocks; block b owns the blockDim.x*blockDim.y
- // consecutive floats starting at b*blockDim.x*blockDim.y.
- // old_len / new_len are the buffer sizes measured in blocks (as passed by
- // grow_array), not in elements.
- __global__ void copy_kernel(float *old_data, float *new_data, int old_len, int new_len){
-     const size_t threads_per_block = (size_t)blockDim.x*blockDim.y;
-     const size_t idx = (size_t)blockIdx.x*threads_per_block + threadIdx.y*blockDim.x + threadIdx.x;
-     // Only blocks present in BOTH buffers are copied. For new_len >= old_len
-     // this is the same set of elements as idx < old_len*threads_per_block, but
-     // it also protects the destination when new_len < old_len and avoids the
-     // int overflow of the element-count product.
-     const int copy_blocks = (old_len < new_len) ? old_len : new_len;
-     if ((int)blockIdx.x < copy_blocks)
-         new_data[idx] = old_data[idx];
- }
- // Writes data into each of the first len elements of array.
- // Expects a 1D grid of 2D blocks; every thread handles at most one element.
- __global__ void test_kernel(float *array, float data, int len){
-     // Flatten (threadIdx.x, threadIdx.y, blockIdx.x) into a single element index.
-     const unsigned int within_block = (threadIdx.y*blockDim.x) + threadIdx.x;
-     const unsigned int before_block = blockIdx.x*blockDim.x*blockDim.y;
-     const int idx = within_block + before_block;
-     const bool in_range = idx < len;
-     if (in_range) {
-         array[idx] = data;
-     }
- }
- // Replaces the device buffer *ptr with a new one sized for length_new blocks
- // and copies the existing contents into it.
- //   ptr        - in/out: device pointer to the current buffer; updated to the new buffer.
- //   length     - current buffer size, in blocks of (BLOCK_WIDTH-4)*(BLOCK_HEIGHT-4) floats.
- //   length_new - requested buffer size, in the same block units.
- // The part of the new buffer beyond the copied data is zero-filled.
- // Any CUDA failure aborts the program via cudaCheckErrors, before the old
- // buffer is released.
- void grow_array(float **ptr, int length, int length_new)
- {
-     float *ptr_new = NULL;
-     // size_t arithmetic so the byte count cannot overflow int for large buffers.
-     size_t width = (size_t)length_new * (BLOCK_WIDTH - 4);
-     size_t height = (BLOCK_HEIGHT - 4);
-     size_t bytes = width * height * sizeof(float);
-     HANDLE_ERROR(cudaMalloc(&ptr_new , bytes));
-     cudaCheckErrors("grow_array: cudaMalloc fail");
-     // Give the newly added tail a defined value instead of leaving it uninitialized.
-     HANDLE_ERROR(cudaMemset(ptr_new, 0, bytes));
-     cudaCheckErrors("grow_array: cudaMemset fail");
-     //this is the copy kernel
-     // Only blocks that exist in both buffers carry data, so launch just those.
-     int copy_blocks = (length < length_new) ? length : length_new;
-     if (copy_blocks > 0)
-     {
-         dim3 threads(BLOCK_WIDTH-4,BLOCK_HEIGHT-4);
-         dim3 blocks(copy_blocks);
-         copy_kernel<<<blocks,threads>>>(*ptr,ptr_new, length, length_new);
-         // Surface launch and execution errors before the source buffer is freed.
-         cudaDeviceSynchronize();
-         cudaCheckErrors("grow_array: copy_kernel fail");
-     }
-     float *old_ptr = *ptr;
-     HANDLE_ERROR( cudaFree( old_ptr ) );
-     cudaCheckErrors("grow_array: cudaFree fail");
-     *ptr = ptr_new;
- }
- // Grows the device buffer *h_dev by roughly 1.5x once more than half of its
- // blocks are in use.
- //   blocks_available - in/out: current capacity in blocks; updated to the new
- //                      capacity when the buffer is grown.
- //   blocks_used      - number of blocks currently in use.
- //   h_dev            - in/out: device buffer pointer, replaced by grow_array on growth.
- void memory_manager(int &blocks_available, int blocks_used, float** h_dev)
- {
-     // Nothing to scale from; also avoids dividing by zero below.
-     if (blocks_available <= 0)
-         return;
-     double ratio = (double)blocks_used/(double)blocks_available;
-     if (ratio > 0.5)
-     {
-         int new_length = (int)(1.5f * blocks_available);
-         // 1.5x truncates back to the same size for a 1-block buffer; always grow by at least one.
-         if (new_length <= blocks_available)
-             new_length = blocks_available + 1;
-         grow_array(h_dev , blocks_available, new_length);
-         // Report the new capacity to the caller; the reference parameter was
-         // previously never written, so repeated calls kept using the old size.
-         blocks_available = new_length;
-     }
- }
- // Self-test: fills a NUM_BLOCKS-block device buffer with 0.1, lets
- // memory_manager grow it, verifies the old contents survived, then fills the
- // grown buffer with 0.2 over (NUM_BLOCKS_X+1)*(NUM_BLOCKS_Y+1) blocks and
- // verifies that too. Returns 0 and prints "Success" when both checks pass,
- // 1 on the first mismatch or host allocation failure.
- int main(){
-     float *my_data = NULL, *host_data = NULL;
-     int dsize = (BLOCK_WIDTH-4) * (BLOCK_HEIGHT-4) * NUM_BLOCKS;
-     cudaMalloc((void **)&my_data, dsize*sizeof(float));
-     cudaMemset(my_data, 0, dsize*sizeof(float));
-     cudaCheckErrors("cudaMalloc/cudaMemset fail");
-     dim3 threads(BLOCK_WIDTH-4,BLOCK_HEIGHT-4);
-     dim3 blocks(NUM_BLOCKS);
-     test_kernel<<<blocks, threads>>>(my_data,0.1f,dsize);
-     cudaDeviceSynchronize();
-     cudaCheckErrors("kernel 1 fail");
-     int test_blocks = NUM_BLOCKS;
-     memory_manager(test_blocks, NUM_BLOCKS, &my_data);
-     cudaCheckErrors("memory manager fail");
-     dim3 new_blocks((NUM_BLOCKS_X+1)*(NUM_BLOCKS_Y+1));
-     int new_size = ((NUM_BLOCKS_X+1)*(NUM_BLOCKS_Y+1)) * (BLOCK_HEIGHT-4) * (BLOCK_WIDTH-4);
-     host_data = (float *)malloc(new_size * sizeof(float));
-     if (host_data == NULL) {
-         fprintf(stderr, "host malloc fail\n");
-         cudaFree(my_data);
-         return 1;
-     }
-     int status = 0;
-     // Check 1: the data written before the grow must still be there.
-     cudaMemcpy(host_data, my_data, dsize*sizeof(float), cudaMemcpyDeviceToHost);
-     cudaCheckErrors("cudaMemcpy 1 fail");
-     for (int i = 0 ; i < dsize; i++)
-         // fabsf, not abs: if abs() binds to the integer overload the difference
-         // is truncated to 0 and the comparison can never fail.
-         if (fabsf(host_data[i] - 0.1f) > TOL) {printf("mismatch 1 at %d, %f\n", i, host_data[i]); status = 1; break;}
-     if (status == 0) {
-         // Check 2: the grown buffer must be writable over its larger extent.
-         test_kernel<<<new_blocks, threads>>>(my_data,0.2f, new_size);
-         cudaDeviceSynchronize();
-         cudaCheckErrors("kernel 2 fail");
-         cudaMemcpy(host_data, my_data, new_size*sizeof(float), cudaMemcpyDeviceToHost);
-         cudaCheckErrors("cudaMemcpy 2 fail");
-         for (int i = 0 ; i < new_size; i++)
-             if (fabsf(host_data[i] - 0.2f) > TOL) {printf("mismatch 2 at %d, %f\n", i, host_data[i]); status = 1; break;}
-     }
-     if (status == 0)
-         printf("Success\n");
-     // Release host and device buffers on every exit path.
-     free(host_data);
-     cudaFree(my_data);
-     return status;
- }
- $ nvcc -O3 -arch=sm_20 -o t351 t351.cu
- $ cuda-memcheck ./t351
- ========= CUDA-MEMCHECK
- Success
- ========= ERROR SUMMARY: 0 errors
- $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement