Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Uses a grid-stride loop, so any <<<blocks, threads>>> configuration
// (even a single block) covers all n elements correctly.
__global__ void add(int* a, int* b, int* c, int n)
{
    int stride = blockDim.x * gridDim.x;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
// Fills the first n elements of a with `value` (default T(1)).
//
// NOTE(review): despite the name, this does NOT generate random numbers —
// it fills with a constant so the GPU result (1 + 1 == 2) is trivially
// verifiable by check_results. The new `value` parameter defaults to the
// old behavior, so existing two-argument callers are unaffected.
template<typename T>
void random_ints(T* a, int n, T value = T(1))
{
    for (int i = 0; i < n; ++i)
    {
        a[i] = value;
    }
}
// Verifies that every element of c equals 2 (the expected 1 + 1 sum).
// Prints "OK <n>" when all n elements match, otherwise "WA <i>" where i
// is the index of the first mismatching element.
void check_results(int* c, int n)
{
    bool passed = true;
    int idx = 0;
    while (idx < n)
    {
        if (c[idx] != 2)
        {
            passed = false;
            break;
        }
        ++idx;
    }
    std::cout << (passed ? "OK" : "WA") << " " << idx << std::endl;
}
// Adds two N-element int vectors on the GPU, splitting the work into
// StreamsCount chunks so H2D copies, kernel execution, and D2H copies of
// different chunks can overlap on separate CUDA streams.
//
// Fixes vs. the original:
//  * cudaMemcpyAsync was given an element count (`actialN`) where a BYTE
//    count is required, so only 1/sizeof(int) of each chunk was copied.
//  * Host buffers are now pinned (cudaMallocHost) — cudaMemcpyAsync from
//    pageable malloc() memory silently falls back to synchronous copies,
//    defeating the whole point of the streams.
//  * Grid clamp now uses maxGridSize[0] (x dimension), not [1] (y).
//  * Kernel launch syntax ">> >" corrected to ">>>".
//  * Every CUDA call is checked; launch errors are caught immediately via
//    cudaGetLastError() instead of once at the very end.
//  * Removed the redundant blocking cudaMemcpy of d_c AFTER the results
//    had already been validated.
void AddVectorsViaStreams()
{
    const int StreamsCount = 2;          // independent streams / data chunks
    const int N = 67108864;              // total elements (divisible by StreamsCount)
    const int M = 1024;                  // threads per block
    const size_t size = (size_t)N * sizeof(int);  // total bytes per vector

    // Report-and-exit helper so no CUDA error goes unnoticed.
    auto checkCuda = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
        {
            printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
            exit(-1);
        }
    };

    // Device copies of a, b, c.
    int *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc((void**)&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_c, size), "cudaMalloc d_c");

    // Host copies of a, b, c. Pinned memory is required for truly
    // asynchronous cudaMemcpyAsync transfers.
    int *a, *b, *c;
    checkCuda(cudaMallocHost((void**)&a, size), "cudaMallocHost a");
    checkCuda(cudaMallocHost((void**)&b, size), "cudaMallocHost b");
    checkCuda(cudaMallocHost((void**)&c, size), "cudaMallocHost c");
    random_ints(a, N);       // fills with 1s, so the expected sum is 2
    random_ints(b, N);
    std::fill(c, c + N, 0);  // matches the original calloc zeroing

    cudaDeviceProp devProp;
    checkCuda(cudaGetDeviceProperties(&devProp, 0), "cudaGetDeviceProperties");

    cudaStream_t streams[StreamsCount];
    for (int i = 0; i < StreamsCount; ++i)
    {
        checkCuda(cudaStreamCreate(&streams[i]), "cudaStreamCreate");
    }

    const int chunkN = N / StreamsCount;                     // elements per stream
    const size_t chunkBytes = (size_t)chunkN * sizeof(int);  // bytes per stream
    // Clamp to the device's x-dimension grid limit; the kernel's
    // grid-stride loop picks up the remainder if the clamp kicks in.
    const int blocksCount = std::min(devProp.maxGridSize[0], (chunkN + M - 1) / M);
    std::cout << "blocks count: " << blocksCount << std::endl;
    std::cout << "threads per block: " << M << std::endl;

    // Pipeline each chunk: H2D copies, kernel, D2H copy on its own stream.
    for (int i = 0; i < StreamsCount; ++i)
    {
        const int off = i * chunkN;
        checkCuda(cudaMemcpyAsync(d_a + off, a + off, chunkBytes,
                                  cudaMemcpyHostToDevice, streams[i]), "H2D a");
        checkCuda(cudaMemcpyAsync(d_b + off, b + off, chunkBytes,
                                  cudaMemcpyHostToDevice, streams[i]), "H2D b");
        add<<<blocksCount, M, 0, streams[i]>>>(d_a + off, d_b + off,
                                               d_c + off, chunkN);
        checkCuda(cudaGetLastError(), "add kernel launch");
        checkCuda(cudaMemcpyAsync(c + off, d_c + off, chunkBytes,
                                  cudaMemcpyDeviceToHost, streams[i]), "D2H c");
    }

    // Wait for all streams before reading c on the host.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    check_results(c, N);

    // Cleanup.
    for (int i = 0; i < StreamsCount; ++i)
    {
        checkCuda(cudaStreamDestroy(streams[i]), "cudaStreamDestroy");
    }
    cudaFreeHost(a); cudaFreeHost(b); cudaFreeHost(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
}
// Entry point: runs the multi-stream vector-addition demo.
int main(void)
{
    //AddVectors();      // single-stream variant (defined elsewhere)
    // MulMatrices();    // matrix-multiply demo (defined elsewhere)
    AddVectorsViaStreams();
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement