#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void thread_local_var_test_kernal1(int** worker_array, int* var, int size)
{
    const int
        blockId = blockIdx.y * gridDim.x + blockIdx.x,
        idx = blockId * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        int* worker = worker_array[threadIdx.x]; // <------- I want worker array to work without race-condition

        var[threadIdx.x]++; // race condition
        //atomicAdd(&var[threadIdx.x], 1); // OK
    }
}

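// --- Sketch, not part of the original paste: one possible race-free variant. ---
// Assumptions: worker_array holds one private buffer per *global* thread (see the
// allocation sketch after the test function below), so indexing it with the global
// idx means no two threads share a worker buffer; the per-lane counter
// var[threadIdx.x] is updated with atomicAdd, as the commented line above already
// suggests. The name thread_local_var_test_kernal2 is made up for this example.
__global__ void thread_local_var_test_kernal2(int** worker_array, int* var, int size)
{
    const int
        blockId = blockIdx.y * gridDim.x + blockIdx.x,
        idx = blockId * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        int* worker = worker_array[idx]; // private to this thread: no race
        worker[0] = idx;                 // example write to the private buffer

        atomicAdd(&var[threadIdx.x], 1); // this slot is shared by one thread of
                                         // every block, so the increment is atomic
    }
}
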
static void thread_local_var_test_kernal_test1(int size)
{
    int threadsPerBlock = block_dim; // block_dim: block size constant assumed to be defined elsewhere
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    int* dev_var = 0;
    cudaMalloc(&dev_var, threadsPerBlock * sizeof(int));
    cudaMemset(dev_var, 0, threadsPerBlock * sizeof(int));

    int** dev_worker_array = 0;
    cudaMalloc(&dev_worker_array, threadsPerBlock * sizeof(int*));

    thread_local_var_test_kernal1 <<<blocksPerGrid, threadsPerBlock>>>(dev_worker_array, dev_var, size);
    cudaDeviceSynchronize();

    printf("================================ CUDA kernel is completed ================================\n");
    int* host_var = (int*)malloc(threadsPerBlock * sizeof(int));
    cudaMemcpy(host_var, dev_var, threadsPerBlock * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_var);
    cudaFree(dev_worker_array);
    int sum_var = 0;
    for (int i = 0; i < threadsPerBlock; i++)
    {
        printf("%d, ", host_var[i]);
        sum_var += host_var[i];
    }
    printf("CUDA sum = %d (%d is expected).\n", sum_var, size);
    (sum_var == size) ? printf("Good.\n") : printf("Bad.\n");
    free(host_var);
}
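
// --- Sketch, not part of the original paste: how dev_worker_array could be filled
// with per-thread buffers before the launch (the test above allocates the pointer
// array but never initializes its entries). The helper name, worker_size, and the
// one-buffer-per-global-thread layout are assumptions for illustration only. ---
static void worker_array_alloc_sketch(int size, int worker_size)
{
    // One device buffer per global thread; the host-side table host_worker_ptrs
    // keeps the device pointers so they can be freed later.
    int** host_worker_ptrs = (int**)malloc(size * sizeof(int*));
    for (int i = 0; i < size; i++)
        cudaMalloc(&host_worker_ptrs[i], worker_size * sizeof(int));

    // Copy the pointer table itself to the device so a kernel can index it by idx.
    int** dev_worker_array = 0;
    cudaMalloc(&dev_worker_array, size * sizeof(int*));
    cudaMemcpy(dev_worker_array, host_worker_ptrs, size * sizeof(int*),
               cudaMemcpyHostToDevice);

    // ... launch a kernel that takes dev_worker_array here ...

    // Cleanup: each per-thread buffer, the device pointer table, the host table.
    for (int i = 0; i < size; i++)
        cudaFree(host_worker_ptrs[i]);
    cudaFree(dev_worker_array);
    free(host_worker_ptrs);
}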