Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Kernel: each in-range thread bumps the counter slot matching its lane
// (threadIdx.x) within the block. Expects a 2D grid of 1D blocks; `var` must
// hold at least blockDim.x ints and `worker_array` at least blockDim.x slots.
__global__ void thread_local_var_test_kernal1(int** worker_array, int* var, int size)
{
    // Flatten the 2D grid into a linear global thread index.
    const int
        blockId = blockIdx.y * gridDim.x + blockIdx.x,
        idx = blockId * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        // Per-lane slot pointer; reserved for per-thread state (currently unused).
        int* worker = worker_array[threadIdx.x];
        (void)worker; // suppress unused-variable warning
        // Threads from DIFFERENT blocks share the same threadIdx.x slot, so a
        // plain var[threadIdx.x]++ is a lost-update race; the read-modify-write
        // must be atomic for the slot totals to sum to `size`.
        atomicAdd(&var[threadIdx.x], 1);
    }
}
// Host driver: launches the counter kernel over `size` logical threads and
// verifies that the blockDim.x per-slot counters sum back to `size`.
// Uses the file-level `block_dim` constant for the block size.
static void thread_local_var_test_kernal_test1(int size)
{
    int threadsPerBlock = block_dim;
    // Ceil-divide so every one of the `size` elements is covered.
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    // One counter per lane in a block, zero-initialized on the device.
    int* dev_var = 0;
    cudaMalloc(&dev_var, threadsPerBlock * sizeof(int));
    cudaMemset(dev_var, 0, threadsPerBlock * sizeof(int));

    int** dev_worker_array = 0;
    cudaMalloc(&dev_worker_array, threadsPerBlock * sizeof(int*));
    // The kernel reads these slots; don't hand it uninitialized memory.
    cudaMemset(dev_worker_array, 0, threadsPerBlock * sizeof(int*));

    thread_local_var_test_kernal1 <<<blocksPerGrid, threadsPerBlock>>>(dev_worker_array, dev_var, size);
    // Launch-configuration errors surface via cudaGetLastError(), not the launch itself.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    // In-kernel faults surface at the next synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
    printf("================================ CUDA kernal is completed ================================\n");

    // Copy the per-slot counters back and release device memory.
    int* host_var = (int*)malloc(threadsPerBlock * sizeof(int));
    cudaMemcpy(host_var, dev_var, threadsPerBlock * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_var);
    cudaFree(dev_worker_array); // was leaked in the original

    // Every in-range thread incremented exactly one slot, so the sum must equal size.
    int sum_var = 0;
    for (int i = 0; i < threadsPerBlock; i++)
    {
        printf("%d, ", host_var[i]);
        sum_var += host_var[i];
    }
    printf("CUDA sum = %d (%d is expected).\n", sum_var, size);
    (sum_var == size) ? printf("Good.\n") : printf("Bad.\n");
    free(host_var);
}
Add Comment
Please, Sign In to add comment