Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <assert.h>
- #include <stdio.h>
- #include <math.h>
- #include <stdint.h>
- #include <string.h>
- #include <stdlib.h>
- #include <cuda_runtime.h>
- #define MAX_NUM_SPOTS_IN_WINDOW 80
- #define WARP_SIZE 32
- #define ASSERT assert
- __device__ static inline uint32_t atomicInc_(uint32_t* ptr) {
- uint32_t value;
- do {
- value = *ptr;
- } while(atomicCAS(ptr, value, value + 1) != value);
- return value;
- }
- __device__ static inline void UnpackCoordinates(uint32_t coord, int16_t* row, int16_t* col) {
- *row = (int16_t)(uint16_t)coord;
- coord >>= 16u;
- *col = (int16_t)(uint16_t)coord;
- }
- __global__ void Kernel(
- const uint16_t* blockNumSpotsBuffer,
- const uint32_t* blockSpotCoordsBuffer,
- size_t blockSpotCoordsBufferElementPitch
- ) {
- typedef unsigned int uint;
- if(blockIdx.x!=30 || blockIdx.y!=1) return;
- ASSERT(blockDim.x % WARP_SIZE == 0);
- ASSERT(blockDim.y == 1);
- uint numWarps = blockDim.x / WARP_SIZE;
- uint idxWarp = threadIdx.x / WARP_SIZE;
- int idxThreadInWarp = threadIdx.x % WARP_SIZE;
- struct Spot {
- int16_t row;
- int16_t col;
- volatile unsigned int numSamples;
- float signalSamples[MAX_NUM_SPOTS_IN_WINDOW];
- };
- __shared__ uint numLocalSpots;
- __shared__ Spot localSpots[3 * 32];
- numLocalSpots = 0;
- __syncthreads();
- ASSERT(numWarps >= 3);
- int blocks[3] = {174, 222, 270};
- if(idxWarp < 3) {
- uint spotBlockIdx = blocks[idxWarp];
- uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
- ASSERT(numSpots < WARP_SIZE);
- size_t inOffset = (spotBlockIdx * blockSpotCoordsBufferElementPitch) + idxThreadInWarp;
- uint outOffset;
- if(idxThreadInWarp == 0) outOffset = atomicAdd(&numLocalSpots, numSpots);
- outOffset = __shfl_sync(0xffffffff, outOffset, 0, 32);
- if(idxThreadInWarp < numSpots) {
- Spot* outSpot = &localSpots[outOffset + idxThreadInWarp];
- outSpot->numSamples = 0;
- uint32_t coord = blockSpotCoordsBuffer[inOffset];
- UnpackCoordinates(coord, &outSpot->row, &outSpot->col);
- }
- }
- __syncthreads();
- int sblocks[] = { 29,30,31,77,78,79,125,126,127,173,174,175,221,222,223,269,270,271,317,318,319,365,366,367,413,414,415 };
- if(idxWarp == 0) for(int block = 0; block < sizeof(sblocks)/sizeof(int); ++block) {
- uint spotBlockIdx = sblocks[block];
- //__syncwarp(); // <-- adding this seems to solve it
- uint numSpots = blockNumSpotsBuffer[spotBlockIdx];
- uint idxThreadInWarp = threadIdx.x % WARP_SIZE;
- if(idxThreadInWarp >= numSpots) continue;
- size_t inOffset = (spotBlockIdx * blockSpotCoordsBufferElementPitch) + idxThreadInWarp;
- uint32_t coord = blockSpotCoordsBuffer[inOffset];
- int16_t row, col;
- UnpackCoordinates(coord, &row, &col);
- for(int idxLocalSpot = 0; idxLocalSpot < numLocalSpots; ++idxLocalSpot) {
- Spot* localSpot = &localSpots[idxLocalSpot];
- if(localSpot->row == 0 && localSpot->col == 0) continue;
- if((abs(localSpot->row - row) >= 5) && (abs(localSpot->col - col) >= 5)) continue;
- int index = atomicInc_block((unsigned int*)&localSpot->numSamples, 0xffffffff);
- //int index = atomicInc_((unsigned int*)&localSpot->numSamples); // <-- using this instead seems to solve it
- if(localSpot->row == 37 && localSpot->col == 977)
- printf("%02d(%d:%p)\n", index, threadIdx.x, &localSpot->numSamples); // <-- sometimes indices are skipped or duplicated. different threads can get same index for same &localSpot->numSamples
- if(index >= MAX_NUM_SPOTS_IN_WINDOW) continue;
- localSpot->signalSamples[index] = coord;
- }
- }
- }
- typedef struct {
- int idxBlock;
- uint16_t numSpots;
- uint32_t spotCoords[10];
- } BlockInfo;
- static BlockInfo blockInfos[] = {
- { 29, 1, {61734922, 0, 0, 0, 0, 0, 0, 0, 0, 0} },
- { 30, 1, {64094217, 0, 0, 0, 0, 0, 0, 0, 0, 0} },
- { 31, 2, {65863691, 66519049, 0, 0, 0, 0, 0, 0, 0, 0} },
- { 77, 9, {61407248, 61145110, 62521363, 60882956, 62128143, 62849047, 60817425, 61800469, 62717965, 0} },
- { 78, 6, {64880658, 64487438, 64225300, 63111185, 63438860, 63569941, 0, 0, 0, 0} },
- { 79, 7, {65536017, 65929237, 65077261, 66846733, 66584595, 66191375, 65273879, 0, 0, 0} },
- { 125, 7, {61866015, 62324771, 62586909, 61276192, 60882972, 62193689, 61538330, 0, 0, 0} },
- { 126, 7, {64290846, 62980129, 64684067, 63897626, 63635488, 63307804, 64618520, 0, 0, 0} },
- { 127, 8, {65667099, 66715678, 66060319, 66322457, 67043362, 65339424, 66977816, 65011740, 0, 0} },
- { 173, 5, {60948519, 61341739, 61997097, 61603876, 62783533, 0, 0, 0, 0, 0} },
- { 174, 8, {63045676, 64421928, 64815149, 64028709, 63176751, 63701034, 64159790, 63373350, 0, 0} },
- { 175, 7, {66125865, 65732645, 66519086, 65077287, 65470507, 66781224, 66453540, 0, 0, 0} },
- { 221, 6, {61079600, 62849079, 62324790, 61800497, 61603898, 62652475, 0, 0, 0, 0} },
- { 222, 4, {63504436, 64946231, 64553011, 64225337, 0, 0, 0, 0, 0, 0} },
- { 223, 8, {66912306, 65601590, 65339451, 66256948, 65929273, 66650168, 65208369, 65863728, 0, 0} },
- { 269, 5, {61276222, 60948548, 60948540, 62062663, 61866045, 0, 0, 0, 0, 0} },
- { 270, 4, {63438916, 64618557, 64356419, 62980159, 0, 0, 0, 0, 0, 0} },
- { 271, 7, {66322494, 66715715, 65011777, 65404997, 67043388, 66060356, 65667136, 0, 0, 0} },
- { 317, 6, {61079630, 61800525, 62849104, 62455883, 62193745, 61341779, 0, 0, 0, 0} },
- { 318, 6, {64749640, 63504462, 63897683, 64487501, 64815185, 63111242, 0, 0, 0, 0} },
- { 319, 8, {66846797, 65142859, 65536080, 66453576, 66191438, 65929299, 65798218, 66584658, 0, 0} },
- { 365, 6, {61210712, 61866071, 61669469, 60948575, 60817493, 62521429, 0, 0, 0, 0} },
- { 366, 7, {63635545, 63963228, 64684123, 62914650, 64225367, 63307870, 63176788, 0, 0, 0} },
- { 367, 4, {66256984, 66977878, 65077343, 66650205, 0, 0, 0, 0, 0, 0} },
- { 413, 8, {62652512, 62390373, 61735015, 61276259, 62128235, 61014121, 62718057, 61997153, 0, 0} },
- { 414, 7, {64749669, 63438952, 63045732, 64421985, 64094311, 63701091, 64487531, 0, 0, 0} },
- { 415, 5, {66781287, 67043426, 65405028, 65798248, 65142889, 0, 0, 0, 0, 0} }
- };
- int main() {
- int numBlocks = 500;
- int maxNumSpots = 10;
- uint16_t* blockNumSpots = (uint16_t*)calloc(numBlocks, sizeof(uint16_t));
- uint32_t* spotCoords = (uint32_t*)calloc(maxNumSpots*numBlocks, sizeof(uint32_t));
- for(int idxBlock = 0; idxBlock < sizeof(blockInfos) / sizeof(BlockInfo); ++idxBlock) {
- const BlockInfo* info = &blockInfos[idxBlock];
- blockNumSpots[info->idxBlock] = info->numSpots;
- memcpy(spotCoords + maxNumSpots*info->idxBlock, info->spotCoords, info->numSpots * sizeof(uint32_t));
- }
- uint16_t* blockNumSpots_d = NULL;
- uint32_t* spotCoords_d = NULL;
- size_t spotCoordsPitch = 0;
- cudaMalloc(&blockNumSpots_d, numBlocks * sizeof(uint16_t));
- cudaMallocPitch(&spotCoords_d, &spotCoordsPitch, maxNumSpots, numBlocks);
- cudaMemcpy(blockNumSpots_d, blockNumSpots, numBlocks * sizeof(uint16_t), cudaMemcpyHostToDevice);
- cudaMemcpy2D(spotCoords_d, spotCoordsPitch, spotCoords, maxNumSpots*sizeof(uint32_t), maxNumSpots*sizeof(uint32_t), numBlocks, cudaMemcpyHostToDevice);
- free(blockNumSpots);
- free(spotCoords);
- cudaDeviceSynchronize();
- int2 numSpotBlocksPerBlock = make_int2(1, 3);
- int numWarps = 4;
- dim3 blocksInGrid = dim3(48, 15);
- dim3 threadsInBlock = dim3(32 * numWarps);
- Kernel<<<blocksInGrid, threadsInBlock>>>(
- blockNumSpots_d,
- spotCoords_d,
- spotCoordsPitch / sizeof(uint32_t)
- );
- cudaDeviceSynchronize();
- cudaFree(blockNumSpots_d);
- cudaFree(spotCoords_d);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement