// Untitled


/**
 * Per-block sum reduction in shared memory, pairing elements at a doubling
 * stride (1, 2, 4, ...).
 *
 * Each block copies 2 * blockDim.x consecutive floats from `a` into shared
 * memory (every thread loads two adjacent elements), folds them together
 * in place, and thread 0 stores the block's partial sum in c[blockIdx.x].
 *
 * Launch requirements:
 *   - Only the .x components of the grid/block are used (1D launch).
 *   - blockDim.x <= BLOCK_SIZE: the shared buffer holds 2 * BLOCK_SIZE floats.
 *   - `a` must hold at least 2 * blockDim.x * gridDim.x floats; the global
 *     loads have no tail guard.
 *   - `c` must hold at least gridDim.x floats.
 *
 * blockDim.x does not need to be a power of two.
 *
 * @tparam BLOCK_SIZE  compile-time upper bound on blockDim.x
 * @param  c           output: one partial sum per block
 * @param  a           input values
 */
template <int BLOCK_SIZE>
__global__ void sumKernelStr2(float *c, float *a) {
    __shared__ float sdata[BLOCK_SIZE * 2];

    // Number of elements this block reduces.
    const unsigned int count = 2 * blockDim.x;
    // Each thread owns the adjacent pair (tid, tid + 1).
    const unsigned int tid = 2 * threadIdx.x;
    // 64-bit arithmetic so the block offset cannot wrap for very large inputs.
    const unsigned long long i =
        static_cast<unsigned long long>(blockIdx.x) * count + tid;

    sdata[tid] = a[i];
    sdata[tid + 1] = a[i + 1];
    __syncthreads();

    for (unsigned int odstep = 1; odstep < count; odstep *= 2) {
        const unsigned int index = odstep * tid;
        // Guard the partner slot, not just `index`: when count is not a power
        // of two, index + odstep can fall past the end of the loaded data.
        if (index + odstep < count) {
            sdata[index] += sdata[index + odstep];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    if (tid == 0) c[blockIdx.x] = sdata[0];
}

/**
 * Per-block sum reduction in shared memory, folding the upper half of the
 * live range onto the lower half at each step.
 *
 * Each block copies 2 * blockDim.x consecutive floats from `a` into shared
 * memory (thread t loads elements t and t + blockDim.x of the block's
 * slice), reduces them in place, and thread 0 stores the block's partial
 * sum in c[blockIdx.x].
 *
 * Launch requirements:
 *   - Only the .x components of the grid/block are used (1D launch).
 *   - blockDim.x <= BLOCK_SIZE: the shared buffer holds 2 * BLOCK_SIZE floats.
 *   - `a` must hold at least 2 * blockDim.x * gridDim.x floats; the global
 *     loads have no tail guard.
 *   - `c` must hold at least gridDim.x floats.
 *
 * blockDim.x does not need to be a power of two. For power-of-two block
 * sizes the pairing and addition order match a plain halving loop
 * (blockDim.x, blockDim.x / 2, ..., 1).
 *
 * @tparam BLOCK_SIZE  compile-time upper bound on blockDim.x
 * @param  c           output: one partial sum per block
 * @param  a           input values
 */
template <int BLOCK_SIZE>
__global__ void sumKernelStr3(float *c, float *a) {
    __shared__ float sdata[BLOCK_SIZE * 2];

    const unsigned int tid = threadIdx.x;
    // 64-bit arithmetic so the block offset cannot wrap for very large inputs.
    const unsigned long long i =
        static_cast<unsigned long long>(blockIdx.x) * 2 * blockDim.x + tid;

    sdata[tid] = a[i];
    sdata[tid + blockDim.x] = a[i + blockDim.x];
    __syncthreads();

    // `count` is the number of live partial sums in sdata[0 .. count).
    unsigned int count = 2 * blockDim.x;
    while (count > 1) {
        // Round the half up so that, for an odd count, the middle element
        // stays live for the next step instead of being dropped.
        const unsigned int odstep = (count + 1) / 2;
        // Writers touch [0, count - odstep) and readers touch [odstep, count);
        // the ranges are disjoint because count - odstep <= odstep.
        if (tid < count - odstep) sdata[tid] += sdata[tid + odstep];

        // `count` is uniform across the block, so all threads hit the barrier.
        __syncthreads();
        count = odstep;
    }

    if (tid == 0) c[blockIdx.x] = sdata[0];
}