Advertisement
Guest User

Untitled

a guest
May 18th, 2021
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.89 KB | None | 0 0
  1. #include <assert.h>
  2. #include <stdio.h>
  3. #include <math.h>
  4. #include <stdint.h>
  5. #include <string.h>
  6. #include <stdlib.h>
  7. #include <cuda_runtime.h>
  8.  
  9. #define MAX_NUM_SPOTS_IN_WINDOW 80
  10. #define WARP_SIZE 32
  11. #define ASSERT assert
  12.  
  13. __device__ static inline uint32_t atomicInc_(uint32_t* ptr) {
  14. uint32_t value;
  15. do {
  16. value = *ptr;
  17. } while(atomicCAS(ptr, value, value + 1) != value);
  18. return value;
  19. }
  20.  
  21.  
  22. __device__ static inline void UnpackCoordinates(uint32_t coord, int16_t* row, int16_t* col) {
  23. *row = (int16_t)(uint16_t)coord;
  24. coord >>= 16u;
  25. *col = (int16_t)(uint16_t)coord;
  26. }
  27.  
  28.  
  29. __global__ void Kernel(
  30. const uint16_t* blockNumSpotsBuffer,
  31. const uint32_t* blockSpotCoordsBuffer,
  32. size_t blockSpotCoordsBufferElementPitch
  33. ) {
  34. typedef unsigned int uint;
  35.  
  36. if(blockIdx.x!=30 || blockIdx.y!=1) return;
  37.  
  38. ASSERT(blockDim.x % WARP_SIZE == 0);
  39. ASSERT(blockDim.y == 1);
  40.  
  41. uint numWarps = blockDim.x / WARP_SIZE;
  42. uint idxWarp = threadIdx.x / WARP_SIZE;
  43. int idxThreadInWarp = threadIdx.x % WARP_SIZE;
  44.  
  45. struct Spot {
  46. int16_t row;
  47. int16_t col;
  48. volatile unsigned int numSamples;
  49. float signalSamples[MAX_NUM_SPOTS_IN_WINDOW];
  50. };
  51.  
  52. __shared__ uint numLocalSpots;
  53. __shared__ Spot localSpots[3 * 32];
  54.  
  55. numLocalSpots = 0;
  56.  
  57. __syncthreads();
  58.  
  59. ASSERT(numWarps >= 3);
  60. int blocks[3] = {174, 222, 270};
  61. if(idxWarp < 3) {
  62. uint spotBlockIdx = blocks[idxWarp];
  63.  
  64. uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
  65. ASSERT(numSpots < WARP_SIZE);
  66.  
  67. size_t inOffset = (spotBlockIdx * blockSpotCoordsBufferElementPitch) + idxThreadInWarp;
  68.  
  69. uint outOffset;
  70. if(idxThreadInWarp == 0) outOffset = atomicAdd(&numLocalSpots, numSpots);
  71. outOffset = __shfl_sync(0xffffffff, outOffset, 0, 32);
  72.  
  73. if(idxThreadInWarp < numSpots) {
  74. Spot* outSpot = &localSpots[outOffset + idxThreadInWarp];
  75. outSpot->numSamples = 0;
  76.  
  77. uint32_t coord = blockSpotCoordsBuffer[inOffset];
  78. UnpackCoordinates(coord, &outSpot->row, &outSpot->col);
  79. }
  80. }
  81.  
  82. __syncthreads();
  83.  
  84. int sblocks[] = { 29,30,31,77,78,79,125,126,127,173,174,175,221,222,223,269,270,271,317,318,319,365,366,367,413,414,415 };
  85. if(idxWarp == 0) for(int block = 0; block < sizeof(sblocks)/sizeof(int); ++block) {
  86. uint spotBlockIdx = sblocks[block];
  87.  
  88. //__syncwarp(); // <-- adding this seems to solve it
  89.  
  90. uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
  91. uint idxThreadInWarp = threadIdx.x % WARP_SIZE;
  92. if(idxThreadInWarp >= numSpots) continue;
  93.  
  94. size_t inOffset = (spotBlockIdx * blockSpotCoordsBufferElementPitch) + idxThreadInWarp;
  95.  
  96. uint32_t coord = blockSpotCoordsBuffer[inOffset];
  97. int16_t row, col;
  98. UnpackCoordinates(coord, &row, &col);
  99.  
  100. for(int idxLocalSpot = 0; idxLocalSpot < numLocalSpots; ++idxLocalSpot) {
  101. Spot* localSpot = &localSpots[idxLocalSpot];
  102.  
  103. if(localSpot->row == 0 && localSpot->col == 0) continue;
  104. if((abs(localSpot->row - row) >= 5) && (abs(localSpot->col - col) >= 5)) continue;
  105.  
  106. int index = atomicInc_block((unsigned int*)&localSpot->numSamples, 0xffffffff);
  107. //int index = atomicInc_((unsigned int*)&localSpot->numSamples); // <-- using this instead seems to solve it
  108.  
  109. if(localSpot->row == 37 && localSpot->col == 977)
  110. printf("%02d(%d:%p)\n", index, threadIdx.x, &localSpot->numSamples); // <-- sometimes indices are skipped or duplicated. different threads can get same index for same &localSpot->numSamples
  111.  
  112. if(index >= MAX_NUM_SPOTS_IN_WINDOW) continue;
  113. localSpot->signalSamples[index] = coord;
  114. }
  115. }
  116. }
  117.  
  118.  
  119. typedef struct {
  120. int idxBlock;
  121. uint16_t numSpots;
  122. uint32_t spotCoords[10];
  123. } BlockInfo;
  124.  
  125. static BlockInfo blockInfos[] = {
  126. { 29, 1, {61734922, 0, 0, 0, 0, 0, 0, 0, 0, 0} },
  127. { 30, 1, {64094217, 0, 0, 0, 0, 0, 0, 0, 0, 0} },
  128. { 31, 2, {65863691, 66519049, 0, 0, 0, 0, 0, 0, 0, 0} },
  129. { 77, 9, {61407248, 61145110, 62521363, 60882956, 62128143, 62849047, 60817425, 61800469, 62717965, 0} },
  130. { 78, 6, {64880658, 64487438, 64225300, 63111185, 63438860, 63569941, 0, 0, 0, 0} },
  131. { 79, 7, {65536017, 65929237, 65077261, 66846733, 66584595, 66191375, 65273879, 0, 0, 0} },
  132. { 125, 7, {61866015, 62324771, 62586909, 61276192, 60882972, 62193689, 61538330, 0, 0, 0} },
  133. { 126, 7, {64290846, 62980129, 64684067, 63897626, 63635488, 63307804, 64618520, 0, 0, 0} },
  134. { 127, 8, {65667099, 66715678, 66060319, 66322457, 67043362, 65339424, 66977816, 65011740, 0, 0} },
  135. { 173, 5, {60948519, 61341739, 61997097, 61603876, 62783533, 0, 0, 0, 0, 0} },
  136. { 174, 8, {63045676, 64421928, 64815149, 64028709, 63176751, 63701034, 64159790, 63373350, 0, 0} },
  137. { 175, 7, {66125865, 65732645, 66519086, 65077287, 65470507, 66781224, 66453540, 0, 0, 0} },
  138. { 221, 6, {61079600, 62849079, 62324790, 61800497, 61603898, 62652475, 0, 0, 0, 0} },
  139. { 222, 4, {63504436, 64946231, 64553011, 64225337, 0, 0, 0, 0, 0, 0} },
  140. { 223, 8, {66912306, 65601590, 65339451, 66256948, 65929273, 66650168, 65208369, 65863728, 0, 0} },
  141. { 269, 5, {61276222, 60948548, 60948540, 62062663, 61866045, 0, 0, 0, 0, 0} },
  142. { 270, 4, {63438916, 64618557, 64356419, 62980159, 0, 0, 0, 0, 0, 0} },
  143. { 271, 7, {66322494, 66715715, 65011777, 65404997, 67043388, 66060356, 65667136, 0, 0, 0} },
  144. { 317, 6, {61079630, 61800525, 62849104, 62455883, 62193745, 61341779, 0, 0, 0, 0} },
  145. { 318, 6, {64749640, 63504462, 63897683, 64487501, 64815185, 63111242, 0, 0, 0, 0} },
  146. { 319, 8, {66846797, 65142859, 65536080, 66453576, 66191438, 65929299, 65798218, 66584658, 0, 0} },
  147. { 365, 6, {61210712, 61866071, 61669469, 60948575, 60817493, 62521429, 0, 0, 0, 0} },
  148. { 366, 7, {63635545, 63963228, 64684123, 62914650, 64225367, 63307870, 63176788, 0, 0, 0} },
  149. { 367, 4, {66256984, 66977878, 65077343, 66650205, 0, 0, 0, 0, 0, 0} },
  150. { 413, 8, {62652512, 62390373, 61735015, 61276259, 62128235, 61014121, 62718057, 61997153, 0, 0} },
  151. { 414, 7, {64749669, 63438952, 63045732, 64421985, 64094311, 63701091, 64487531, 0, 0, 0} },
  152. { 415, 5, {66781287, 67043426, 65405028, 65798248, 65142889, 0, 0, 0, 0, 0} }
  153. };
  154.  
  155. int main() {
  156. int numBlocks = 500;
  157. int maxNumSpots = 10;
  158. uint16_t* blockNumSpots = (uint16_t*)calloc(numBlocks, sizeof(uint16_t));
  159. uint32_t* spotCoords = (uint32_t*)calloc(maxNumSpots*numBlocks, sizeof(uint32_t));
  160.  
  161. for(int idxBlock = 0; idxBlock < sizeof(blockInfos) / sizeof(BlockInfo); ++idxBlock) {
  162. const BlockInfo* info = &blockInfos[idxBlock];
  163. blockNumSpots[info->idxBlock] = info->numSpots;
  164. memcpy(spotCoords + maxNumSpots*info->idxBlock, info->spotCoords, info->numSpots * sizeof(uint32_t));
  165. }
  166.  
  167. uint16_t* blockNumSpots_d = NULL;
  168. uint32_t* spotCoords_d = NULL;
  169. size_t spotCoordsPitch = 0;
  170. cudaMalloc(&blockNumSpots_d, numBlocks * sizeof(uint16_t));
  171. cudaMallocPitch(&spotCoords_d, &spotCoordsPitch, maxNumSpots, numBlocks);
  172. cudaMemcpy(blockNumSpots_d, blockNumSpots, numBlocks * sizeof(uint16_t), cudaMemcpyHostToDevice);
  173. cudaMemcpy2D(spotCoords_d, spotCoordsPitch, spotCoords, maxNumSpots*sizeof(uint32_t), maxNumSpots*sizeof(uint32_t), numBlocks, cudaMemcpyHostToDevice);
  174.  
  175. free(blockNumSpots);
  176. free(spotCoords);
  177.  
  178. cudaDeviceSynchronize();
  179.  
  180.  
  181. int2 numSpotBlocksPerBlock = make_int2(1, 3);
  182. int numWarps = 4;
  183.  
  184. dim3 blocksInGrid = dim3(48, 15);
  185. dim3 threadsInBlock = dim3(32 * numWarps);
  186.  
  187. Kernel<<<blocksInGrid, threadsInBlock>>>(
  188. blockNumSpots_d,
  189. spotCoords_d,
  190. spotCoordsPitch / sizeof(uint32_t)
  191. );
  192.  
  193. cudaDeviceSynchronize();
  194.  
  195. cudaFree(blockNumSpots_d);
  196. cudaFree(spotCoords_d);
  197.  
  198. return 0;
  199. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement