Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <cstdlib>
- #include <ctime>
- #include <cuda_runtime.h>
- using namespace std;
- void init(float*, int);
// Dot-product kernel: dc[0] receives sum(da[i] * db[i]) for i in [0, n).
// Launch contract: ONE block of n threads (so n <= 1024); n should be a
// power of two for the tree reduction to cover every element.
// dc is device scratch of length n; slots other than dc[0] hold partials.
__global__ void sum_reduce(float* da, float* db, float* dc, int n)
{
    int tid = threadIdx.x;
    // Each thread stores its elementwise product exactly once. The
    // original recomputed da[]*db[] on every reduction pass, which both
    // double-counted terms and threw away previously accumulated partials.
    dc[tid] = da[tid] * db[tid];
    __syncthreads();  // all products visible before the first fold
    // Tree reduction: active threads fold in the partial 'stride' slots
    // away. The bounds check prevents the out-of-range read the original
    // performed at da[tid + stride] when tid + stride >= n.
    for (int stride = 1; stride < n; stride *= 2) {
        if (tid % (2 * stride) == 0 && tid + stride < n)
            dc[tid] += dc[tid + stride];
        __syncthreads();  // barrier outside the divergent branch
    }
}
// Usage: prog <n>
// Computes the dot product of two random n-element vectors on the GPU
// (single-block sum_reduce kernel) and on the host, and prints both for
// comparison. Returns 0 on success, 1 on bad arguments or CUDA failure.
int main(int argc, char** argv) {
    // interpret command-line arguments
    if (argc != 2) {
        cerr << "**invalid number of arguments**" << endl;
        return 1;
    }
    int n = atoi(argv[1]);
    // The kernel runs as one block of n threads, so n is capped at 1024;
    // the original accepted any atoi() result and launched anyway, which
    // fails silently for n > 1024 or n <= 0.
    if (n <= 0 || n > 1024) {
        cerr << "**n must be between 1 and 1024**" << endl;
        return 1;
    }
    // Abort with a readable message on any CUDA runtime failure; the
    // original ignored every return code.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
            exit(1);
        }
    };
    srand((unsigned)time(NULL));
    // host vectors
    float* ha = new float[n];
    float* hb = new float[n];
    float* hc = new float[1];
    init(ha, n);
    init(hb, n);
    // device vectors (da[n], db[n], dc[n])
    float* da;
    float* db;
    float* dc;
    check(cudaMalloc((void**)&da, n * sizeof(float)));
    check(cudaMalloc((void**)&db, n * sizeof(float)));
    check(cudaMalloc((void**)&dc, n * sizeof(float)));
    // copy from the host to the device: ha -> da, hb -> db
    check(cudaMemcpy(da, ha, n * sizeof(float), cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, hb, n * sizeof(float), cudaMemcpyHostToDevice));
    sum_reduce<<<1, n>>>(da, db, dc, n);
    check(cudaGetLastError());        // launch-configuration errors
    check(cudaDeviceSynchronize());   // in-kernel execution errors
    // copy the result from the device to the host: dc[0] -> hc[0]
    check(cudaMemcpy(hc, dc, sizeof(float), cudaMemcpyDeviceToHost));
    float dx = hc[0];
    // dot product on the host as a reference value
    float hx = 0;
    for (int i = 0; i < n; i++)
        hx += ha[i] * hb[i];
    // compare the results (float rounding means they need not match exactly)
    cout << "Device = " << dx << " Host = " << hx << endl;
    // free device memory
    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    // free host memory
    delete [] ha;
    delete [] hb;
    delete [] hc;
    return 0;
}
// Fill a[0..n-1] with pseudo-random floats uniformly drawn from [0.0f, 1.0f]
// by scaling ::rand() into the unit interval. Caller seeds with srand().
void init(float* a, int n) {
    const float scale = 1.0f / RAND_MAX;
    for (int i = 0; i < n; ++i)
        a[i] = scale * ::rand();
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement