Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <omp.h>
/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
 * each thread walks the data with a stride equal to the total number of
 * launched threads; a launch that covers n elements one-to-one performs
 * exactly one iteration per thread.
 *
 * A, B and C are dereferenced inside the kernel and must each hold at least
 * n doubles. No shared memory is used. A non-positive n is a no-op.
 */
__global__ void vecAdd(double *A, double *B, double *C, int n){
    // 64-bit arithmetic so that blockIdx.x * blockDim.x cannot wrap around
    // (and then pass the bounds check) when the grid is rounded up past INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride){
        C[i] = A[i] + B[i];
    }
}
// Report a failed CUDA runtime call (file, line, error string) and exit.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// One GPU's share of the vectors: which device, which host range, and the
// device-side buffers that hold that range.
struct DeviceSlice {
    int device;      // CUDA device ordinal
    int offset;      // index of the slice's first element in the host arrays
    int count;       // number of elements in the slice (0 means "no work")
    double *d_a;     // device copy of h_a[offset .. offset+count)
    double *d_b;     // device copy of h_b[offset .. offset+count)
    double *d_c;     // device result for h_c[offset .. offset+count)
};

// Allocate the three device buffers of a slice on the slice's device.
static void allocSlice(DeviceSlice &s){
    s.d_a = s.d_b = s.d_c = NULL;
    if (s.count == 0){
        return;
    }
    const size_t slice_bytes = (size_t)s.count * sizeof(double);
    CUDA_CHECK(cudaSetDevice(s.device));
    CUDA_CHECK(cudaMalloc(&s.d_a, slice_bytes));
    CUDA_CHECK(cudaMalloc(&s.d_b, slice_bytes));
    CUDA_CHECK(cudaMalloc(&s.d_c, slice_bytes));
}

// Queue upload -> vecAdd -> download for one slice on its device's default
// stream. Returns without waiting for the work to finish.
static void enqueueSlice(const DeviceSlice &s, const double *h_a,
                         const double *h_b, double *h_c, int block_size){
    if (s.count == 0){
        return;   // a zero-block launch would be an invalid configuration
    }
    const size_t slice_bytes = (size_t)s.count * sizeof(double);
    const int grid_size = (s.count - 1) / block_size + 1;   // ceil-div

    CUDA_CHECK(cudaSetDevice(s.device));
    CUDA_CHECK(cudaMemcpyAsync(s.d_a, h_a + s.offset, slice_bytes,
                               cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(s.d_b, h_b + s.offset, slice_bytes,
                               cudaMemcpyHostToDevice));
    vecAdd<<<grid_size, block_size>>>(s.d_a, s.d_b, s.d_c, s.count);
    CUDA_CHECK(cudaGetLastError());   // surfaces launch-configuration errors
    CUDA_CHECK(cudaMemcpyAsync(h_c + s.offset, s.d_c, slice_bytes,
                               cudaMemcpyDeviceToHost));
}

// Release the device buffers of a slice.
static void freeSlice(const DeviceSlice &s){
    if (s.count == 0){
        return;
    }
    CUDA_CHECK(cudaSetDevice(s.device));
    CUDA_CHECK(cudaFree(s.d_a));
    CUDA_CHECK(cudaFree(s.d_b));
    CUDA_CHECK(cudaFree(s.d_c));
}

/*
 * Adds two vectors of length n (argv[1]) using two GPUs.
 *
 * h_a[i] = sin^2(i) and h_b[i] = cos^2(i) are built on the host, the index
 * range is split in two, each half is uploaded to its own device, summed by
 * vecAdd and downloaded into h_c. The result is compared against the host
 * sum ("Equal" / "Not equal") and the time measured by CUDA events on
 * device 0 is printed.
 *
 * Returns 0 on success, 1 on bad arguments, missing devices or host
 * allocation failure; any failing CUDA call terminates via CUDA_CHECK.
 */
int main(int argc, char **argv){
    if (argc != 2){
        std::cerr << "Wrong arguments" << std::endl;
        return 1;
    }

    // atoi yields 0 for non-numeric input; zero or negative lengths would
    // otherwise lead to out-of-bounds copies and bogus allocation sizes.
    const int n = atoi(argv[1]);
    if (n <= 0){
        std::cerr << "Vector length must be a positive integer" << std::endl;
        return 1;
    }

    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count < 2){
        std::cerr << "At least two CUDA devices are required" << std::endl;
        return 1;
    }

    const size_t bytes = (size_t)n * sizeof(double);
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL){
        std::cerr << "Host allocation failed" << std::endl;
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    // Page-lock the host buffers so the async copies can really be async.
    // Portable: the buffers are used by both devices, not only the current one.
    CUDA_CHECK(cudaHostRegister(h_a, bytes, cudaHostRegisterPortable));
    CUDA_CHECK(cudaHostRegister(h_b, bytes, cudaHostRegisterPortable));
    CUDA_CHECK(cudaHostRegister(h_c, bytes, cudaHostRegisterPortable));

    for (int i = 0; i < n; ++i){
        const double s = sin((double)i);
        const double c = cos((double)i);
        h_a[i] = s * s;
        h_b[i] = c * c;
    }

    // Device 0 takes the first ceil(n/2) elements, device 1 the remainder.
    // For odd n the remainder is one element shorter (and empty for n == 1);
    // giving both devices ceil(n/2) elements would run past the host arrays.
    const int half = (n - 1) / 2 + 1;
    DeviceSlice slices[2] = {
        {0, 0,    half,     NULL, NULL, NULL},
        {1, half, n - half, NULL, NULL, NULL}
    };
    for (int d = 0; d < 2; ++d){
        allocSlice(slices[d]);
    }

    // Timing events live on device 0 and are recorded on its default stream.
    CUDA_CHECK(cudaSetDevice(0));
    cudaEvent_t start_gpu, stop_gpu;
    CUDA_CHECK(cudaEventCreate(&start_gpu));
    CUDA_CHECK(cudaEventCreate(&stop_gpu));
    CUDA_CHECK(cudaEventRecord(start_gpu));

    const int block_size = 1024;
    for (int d = 0; d < 2; ++d){
        enqueueSlice(slices[d], h_a, h_b, h_c, block_size);
    }

    // Wait for both devices, finishing with device 0 current so the stop
    // event is recorded on the device that owns it.
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(stop_gpu));
    CUDA_CHECK(cudaEventSynchronize(stop_gpu));

    float delta = 0.0f;   // milliseconds, as reported by cudaEventElapsedTime
    CUDA_CHECK(cudaEventElapsedTime(&delta, start_gpu, stop_gpu));

    // Exact comparison is intentional: the kernel performs a single double
    // addition per element, the same operation the host does here.
    bool all_equal = true;
    for (int i = 0; i < n; ++i){
        if (h_c[i] != h_a[i] + h_b[i]){
            all_equal = false;
            break;
        }
    }
    std::cout << (all_equal ? "Equal" : "Not equal") << std::endl;

    printf("Elapsed time %lf\n", delta);

    CUDA_CHECK(cudaEventDestroy(start_gpu));
    CUDA_CHECK(cudaEventDestroy(stop_gpu));
    for (int d = 0; d < 2; ++d){
        freeSlice(slices[d]);
    }

    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaHostUnregister(h_a));
    CUDA_CHECK(cudaHostUnregister(h_b));
    CUDA_CHECK(cudaHostUnregister(h_c));
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement