Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Per-block sum reduction with interleaved addressing.
 *
 * Each block loads 2 * blockDim.x consecutive floats from `a` into shared
 * memory (two adjacent elements per thread), reduces them in place, and
 * thread 0 writes the block's total to c[blockIdx.x].
 *
 * Launch layout: only the x dimension of the grid and block is used.
 * Shared memory: static, 2 * BLOCK_SIZE floats.
 *
 * Preconditions (not checked here):
 *   - blockDim.x <= BLOCK_SIZE, otherwise the shared array is overrun.
 *   - `a` holds at least 2 * blockDim.x * gridDim.x elements; there is no
 *     bounds check on the global loads.
 *   - `c` holds at least gridDim.x elements.
 *
 * blockDim.x does not have to be a power of two.
 *
 * @tparam BLOCK_SIZE  compile-time upper bound on blockDim.x
 * @param  c           output, one partial sum per block
 * @param  a           input values
 */
template <int BLOCK_SIZE>
__global__ void sumKernelStr2(float *c, float *a) {
    __shared__ float sdata[BLOCK_SIZE * 2];

    // Number of values this block reduces, and this thread's first slot.
    const unsigned int count = 2 * blockDim.x;
    const unsigned int first = 2 * threadIdx.x;

    // Widen before multiplying so large grids do not wrap a 32-bit index.
    const size_t base = static_cast<size_t>(blockIdx.x) * count + first;

    sdata[first]     = a[base];
    sdata[first + 1] = a[base + 1];
    __syncthreads();

    // At each step the slots at multiples of 2*odstep absorb their neighbour
    // odstep to the right.
    for (unsigned int odstep = 1; odstep < count; odstep *= 2) {
        const unsigned int index = odstep * first;
        // Guard on the *source* slot: when count is not a power of two the
        // last destination may have no partner, and reading it would step
        // outside the loaded range.
        if (index + odstep < count) {
            sdata[index] += sdata[index + odstep];
        }
        // Outside the branch so every thread in the block reaches the barrier.
        __syncthreads();
    }

    if (threadIdx.x == 0) c[blockIdx.x] = sdata[0];
}
/**
 * Per-block sum reduction with sequential addressing.
 *
 * Each block loads 2 * blockDim.x consecutive floats from `a` into shared
 * memory (thread t loads elements t and t + blockDim.x of the block's
 * segment), folds the upper part of the live range onto the lower part
 * until one value remains, and thread 0 writes it to c[blockIdx.x].
 *
 * Launch layout: only the x dimension of the grid and block is used.
 * Shared memory: static, 2 * BLOCK_SIZE floats.
 *
 * Preconditions (not checked here):
 *   - blockDim.x <= BLOCK_SIZE, otherwise the shared array is overrun.
 *   - `a` holds at least 2 * blockDim.x * gridDim.x elements; there is no
 *     bounds check on the global loads.
 *   - `c` holds at least gridDim.x elements.
 *
 * blockDim.x does not have to be a power of two.
 *
 * @tparam BLOCK_SIZE  compile-time upper bound on blockDim.x
 * @param  c           output, one partial sum per block
 * @param  a           input values
 */
template <int BLOCK_SIZE>
__global__ void sumKernelStr3(float *c, float *a) {
    __shared__ float sdata[BLOCK_SIZE * 2];

    const unsigned int tid = threadIdx.x;

    // Widen before multiplying so large grids do not wrap a 32-bit index.
    const size_t i = static_cast<size_t>(blockIdx.x) * 2 * blockDim.x + tid;

    sdata[tid]              = a[i];
    sdata[tid + blockDim.x] = a[i + blockDim.x];
    __syncthreads();

    // sdata[0 .. active) holds the partial sums still to be combined.
    // Each pass folds the upper part onto the lower part. Rounding the fold
    // distance up keeps the middle element of an odd-sized range alive
    // instead of dropping it, so any block size is reduced correctly. For
    // power-of-two sizes the fold distances are blockDim.x, blockDim.x/2,
    // ..., 1.
    for (unsigned int active = 2 * blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) sdata[tid] += sdata[tid + half];
        // Outside the branch so every thread in the block reaches the barrier.
        __syncthreads();
        active = half;
    }

    if (tid == 0) c[blockIdx.x] = sdata[0];
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement