Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Note: Compiled with
- // nvcc -arch=sm_20 sum_reduction.cu
- //
- // My NVIDIA devices: http://pastebin.com/8yTzXUuK
- #include <stdio.h>
- #define N 512
- #define num_t float
// Parallel summation: every block adds up its own contiguous slice of
// blockDim.x elements of d_data and stores that partial sum in
// d_sum[blockIdx.x]. The caller combines the per-block partial sums.
//
//   d_sum   output, one element per block (at least gridDim.x elements)
//   d_data  input, at least gridDim.x * blockDim.x elements; the kernel takes
//           no length argument, so the launch must not cover more than that
//
// Launch layout: 1-D grid of 1-D blocks, any block size. The scratch buffer
// is a fixed-size static shared array declared inside the kernel, so no
// dynamic shared-memory size has to be supplied at launch (supplying one is
// harmless).
__global__ void psum(num_t *d_sum, num_t *d_data)
{
    // Single block-local scratch buffer. The previous version declared the
    // buffer twice (a sized file-scope array and an unsized extern array),
    // which left it unclear whether the launch had to provide the storage.
    __shared__ num_t temp[256];
    const unsigned int cap = sizeof(temp) / sizeof(temp[0]);

    const unsigned int tid = threadIdx.x;
    const unsigned int base = blockIdx.x * blockDim.x;

    // Number of scratch slots in use. Blocks wider than the buffer are
    // folded onto the first `cap` threads instead of overflowing it.
    const unsigned int lanes = blockDim.x < cap ? blockDim.x : cap;

    if (tid < lanes) {
        num_t acc = d_data[base + tid];
        for (unsigned int k = tid + lanes; k < blockDim.x; k += lanes)
            acc += d_data[base + k];
        temp[tid] = acc;
    }

    // Tree reduction with ceil-halving: for power-of-two sizes this performs
    // exactly the additions of the classic halving loop, and for odd sizes
    // the unpaired middle element is carried over instead of being dropped.
    for (unsigned int active = lanes; active > 1; ) {
        const unsigned int half = (active + 1) >> 1;
        // The loop bounds depend only on blockDim.x, so every thread of the
        // block reaches this barrier the same number of times.
        __syncthreads();
        if (tid + half < active)
            temp[tid] += temp[tid + half];
        active = half;
    }

    // Thread 0 performed the last write to temp[0] itself, so no further
    // barrier is needed before publishing the block's result.
    if (tid == 0)
        d_sum[blockIdx.x] = temp[0];
}
// Fill the N-element host vector x with the ramp 0, 1, ..., N-1.
void init_vec(num_t *x)
{
    int k = 0;
    while (k < N) {
        x[k] = k;   // implicit int -> num_t conversion
        ++k;
    }
}
// Print a diagnostic when a CUDA runtime call failed.
// Returns true when `err` reports a failure, false on cudaSuccess.
static bool cuda_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return false;
    printf("CUDA ERROR while %s: %s\n", what, cudaGetErrorString(err));
    return true;
}

// Sum the vector 0..N-1 on the GPU (per-block partial sums produced by psum,
// combined on the host) and on the CPU, and print both results.
// Returns 0 on success, 103 when the kernel fails, 1 on any other failure.
int main(void)
{
    // Launch geometry derived from N. The block size is a power of two, which
    // is the safe choice for a halving tree reduction.
    const int threads = 128;
    const int blocks = (N + threads - 1) / threads;

    const size_t size0 = sizeof(num_t);
    const size_t size = N * size0;
    // The kernel reads blocks * threads elements without a length check, so
    // the device input is allocated to that padded size and zero-filled.
    const size_t padded_size = (size_t)blocks * threads * size0;
    // The kernel writes one partial sum per block.
    const size_t partial_size = (size_t)blocks * size0;

    // Host-side buffers
    num_t *a = NULL, *sum = NULL;
    // Device-side buffers
    num_t *dev_a = NULL, *dev_sum = NULL;
    num_t total = 0;
    num_t S = 0;
    int status = 1;

    a = (num_t*)malloc(size);
    sum = (num_t*)malloc(partial_size);
    if (a == NULL || sum == NULL) {
        printf("Host allocation failed\n");
        goto cleanup;
    }
    init_vec(a);

    // Copy values from the host to the device
    if (cuda_failed(cudaMalloc((void**)&dev_a, padded_size),
                    "allocating the device input"))
        goto cleanup;
    if (cuda_failed(cudaMalloc((void**)&dev_sum, partial_size),
                    "allocating the device partial sums"))
        goto cleanup;
    // All-zero bytes are 0.0f, so the padding tail contributes nothing.
    if (cuda_failed(cudaMemset(dev_a, 0, padded_size),
                    "clearing the device input"))
        goto cleanup;
    if (cuda_failed(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice),
                    "copying the input to the device"))
        goto cleanup;
    if (cuda_failed(cudaMemset(dev_sum, 0, partial_size),
                    "clearing the device partial sums"))
        goto cleanup;

    // The third launch argument supplies one scratch element per thread for a
    // kernel that uses dynamically sized shared memory.
    psum<<<blocks, threads, threads * size0>>>(dev_sum, dev_a);
    if (cuda_failed(cudaGetLastError(), "executing the kernel")) {
        status = 103;
        goto cleanup;
    }

    // This blocking copy waits for the kernel, so faults raised while the
    // kernel was running are reported here.
    if (cuda_failed(cudaMemcpy(sum, dev_sum, partial_size, cudaMemcpyDeviceToHost),
                    "copying the partial sums back")) {
        status = 103;
        goto cleanup;
    }

    // Combine the per-block partial sums into the final result.
    for (int b = 0; b < blocks; b++) {
        total += sum[b];
    }
    printf("sum (cuda) = %f\n", total);

    // CPU reference
    for (int j = 0; j < N; j++) {
        S += a[j];
    }
    printf("sum (real) = %f\n", S);
    status = 0;

cleanup:
    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(dev_sum);
    cudaFree(dev_a);
    free(sum);
    free(a);
    return status;
}
Advertisement
Add Comment
Please, Sign In to add comment