Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <cstdlib>
- #include <ctime>
- #include <cuda_runtime.h>
- using namespace std;
- void init(float*, int);
// Dot-product kernel: dc[0] receives sum(da[i] * db[i]) for i in [0, n).
// Launch contract: ONE block of n threads (so n <= 1024); n should be a
// power of two for the tree reduction to cover every element.
// dc is device scratch of length n; slots other than dc[0] hold partials.
__global__ void sum_reduce(float* da, float* db, float* dc, int n)
{
    int tid = threadIdx.x;
    // Each thread stores its elementwise product exactly once. The
    // original recomputed da[]*db[] on every reduction pass, which both
    // double-counted terms and threw away previously accumulated partials.
    dc[tid] = da[tid] * db[tid];
    __syncthreads();  // all products visible before the first fold
    // Tree reduction: active threads fold in the partial 'stride' slots
    // away. The bounds check prevents the out-of-range read the original
    // performed at da[tid + stride] when tid + stride >= n.
    for (int stride = 1; stride < n; stride *= 2) {
        if (tid % (2 * stride) == 0 && tid + stride < n)
            dc[tid] += dc[tid + stride];
        __syncthreads();  // barrier outside the divergent branch
    }
}
// Usage: prog <n>
// Computes the dot product of two random n-element vectors on the GPU
// (single-block sum_reduce kernel) and on the host, and prints both for
// comparison. Returns 0 on success, 1 on bad arguments or CUDA failure.
int main(int argc, char** argv) {
    // interpret command-line arguments
    if (argc != 2) {
        cerr << "**invalid number of arguments**" << endl;
        return 1;
    }
    int n = atoi(argv[1]);
    // The kernel runs as one block of n threads, so n is capped at 1024;
    // the original accepted any atoi() result and launched anyway, which
    // fails silently for n > 1024 or n <= 0.
    if (n <= 0 || n > 1024) {
        cerr << "**n must be between 1 and 1024**" << endl;
        return 1;
    }
    // Abort with a readable message on any CUDA runtime failure; the
    // original ignored every return code.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
            exit(1);
        }
    };
    srand((unsigned)time(NULL));
    // host vectors
    float* ha = new float[n];
    float* hb = new float[n];
    float* hc = new float[1];
    init(ha, n);
    init(hb, n);
    // device vectors (da[n], db[n], dc[n])
    float* da;
    float* db;
    float* dc;
    check(cudaMalloc((void**)&da, n * sizeof(float)));
    check(cudaMalloc((void**)&db, n * sizeof(float)));
    check(cudaMalloc((void**)&dc, n * sizeof(float)));
    // copy from the host to the device: ha -> da, hb -> db
    check(cudaMemcpy(da, ha, n * sizeof(float), cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, hb, n * sizeof(float), cudaMemcpyHostToDevice));
    sum_reduce<<<1, n>>>(da, db, dc, n);
    check(cudaGetLastError());        // launch-configuration errors
    check(cudaDeviceSynchronize());   // in-kernel execution errors
    // copy the result from the device to the host: dc[0] -> hc[0]
    check(cudaMemcpy(hc, dc, sizeof(float), cudaMemcpyDeviceToHost));
    float dx = hc[0];
    // dot product on the host as a reference value
    float hx = 0;
    for (int i = 0; i < n; i++)
        hx += ha[i] * hb[i];
    // compare the results (float rounding means they need not match exactly)
    cout << "Device = " << dx << " Host = " << hx << endl;
    // free device memory
    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    // free host memory
    delete [] ha;
    delete [] hb;
    delete [] hc;
    return 0;
}
// Fill a[0..n-1] with pseudo-random floats uniformly drawn from [0.0f, 1.0f]
// by scaling ::rand() into the unit interval. Caller seeds with srand().
void init(float* a, int n) {
    const float scale = 1.0f / RAND_MAX;
    for (int i = 0; i < n; ++i)
        a[i] = scale * ::rand();
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement