Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <cuda.h>
- #include <curand.h>
- #include <stdio.h>
- #include <iostream>
- #include <iomanip>
- #include <chrono>
- #include <intrin.h>
- #ifndef __CUDACC__
- #define __CUDACC__
- #endif
- #include <cuda_runtime_api.h>
- using namespace std;
- #pragma intrinsic(__rdtsc)
// Problem size in bytes: the host code treats the data as an N x M matrix of chars.
#define N 1000
// Dimensions used by the GPU kernel, which views the same buffers as 4-byte ints.
// The parentheses are required: without them `x / N_int` expands to `x / N / 2`
// and `x % N_int` expands to `x % N / 2`, which silently yields wrong indices.
#define N_int (N / 2)
#define M 2000
#define M_int (M / 2)
// Number of threads requested for the launch (one per byte of the matrix).
#define THREAD_COUNT (N * M)
// Integer ceil-division by the block size of 128 threads. `ceil(x / 128)` on
// integers truncates before rounding and produces a double, so it never rounds up.
#define BLOCK_COUTN ((THREAD_COUNT + 127) / 128)
// Evaluates a CUDA runtime call once; on failure prints the error code, location,
// message and the offending expression to stderr, then terminates with exit(1).
// Wrapped in do/while(0) so that `CHECK_ERROR(x);` behaves as a single statement
// (safe inside an unbraced if/else).
#define CHECK_ERROR( call ) \
do { \
    cudaError_t result = (call); \
    if ( cudaSuccess != result ) { \
        cerr << "CUDA error " << result << " in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString( result ) << " (" << #call << ")" << endl; \
        exit(1); \
    } \
} while (0)
// Terminates the program with a diagnostic when a cuRAND host API call fails.
static void checkCurand(curandStatus_t status, const char* what) {
    if (status != CURAND_STATUS_SUCCESS) {
        cerr << "cuRAND error " << status << " in " << what << endl;
        exit(1);
    }
}

// Generates N * M pseudo-random bytes on the device with the XORWOW generator
// (seed 0), copies them into the host buffer `a`, and returns the device buffer.
//
// a       - host buffer with room for at least N * M bytes.
// returns - device pointer holding the same random data; the caller owns it and
//           must release it with cudaFree().
//
// Any CUDA or cuRAND failure is reported on stderr and ends the process.
int* randomInit(char* a) {
    const size_t size = (size_t)N * M;
    // cuRAND produces 32-bit words; round up so a size that is not a multiple
    // of 4 still gets every byte initialised.
    const size_t words = (size + sizeof(unsigned int) - 1) / sizeof(unsigned int);

    curandGenerator_t generator;
    checkCurand(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW),
                "curandCreateGenerator");
    // Explicit integer seed (the original passed NULL, i.e. 0) keeps runs reproducible.
    checkCurand(curandSetPseudoRandomGeneratorSeed(generator, 0ULL),
                "curandSetPseudoRandomGeneratorSeed");

    int* d_a = NULL;
    CHECK_ERROR(cudaMalloc(&d_a, words * sizeof(unsigned int)));
    checkCurand(curandGenerate(generator, (unsigned int*)d_a, words),
                "curandGenerate");
    // Blocking copy: the host buffer is fully populated when this returns.
    CHECK_ERROR(cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost));

    checkCurand(curandDestroyGenerator(generator), "curandDestroyGenerator");
    return d_a;
}
// Prints the top-left 10 x 10 corner of `b` to stdout, one matrix row per line.
// `b` is indexed with a row stride of N bytes; every value is shown as an int
// in a 15-character field followed by " |".
void printPart(char* b) {
    const int previewSize = 10;
    for (int row = 0; row < previewSize; ++row) {
        const char* line = b + row * N;
        for (int col = 0; col < previewSize; ++col) {
            cout << setw(15) << static_cast<int>(line[col]) << " |";
        }
        cout << endl;
    }
}
// Host reference implementation of the byte permutation.
// Reads `a` as N rows of M bytes and writes `b` as M rows of N bytes so that
// b[row][col] = a[N - 1 - col][M - 1 - row].
// Both buffers must hold N * M bytes.
void CPU(char* a, char* b) {
    for (int row = 0; row < M; ++row) {
        char* dstRow = b + row * N;
        for (int col = 0; col < N; ++col) {
            // Same flat index as (N - 1 - col) * M + (M - 1 - row).
            const int src = (N - col) * M - row - 1;
            dstRow[col] = a[src];
        }
    }
}
// Permutes 4-byte words: viewing `a` as N_int rows of M_int ints and `b` as
// M_int rows of N_int ints, writes b[i][j] = a[N_int - 1 - j][M_int - 1 - i].
//
// Launch layout: 1D grid, 1D blocks with blockDim.x <= 128 (the static shared
// staging buffer holds 128 words = 512 bytes). The grid may be larger than
// needed; threads whose global index is >= N_int * M_int do no memory work.
//
// a - device input,  at least N_int * M_int ints.
// b - device output, at least N_int * M_int ints.
//
// NOTE(review): CPU() in this file permutes single bytes of an N x M matrix,
// while this kernel permutes whole 4-byte words of an (N/2) x (M/2) matrix, so
// the two results are not expected to match byte-for-byte — confirm which
// definition is intended.
__global__ void GPU(int* a, int* b) {
    // int-typed staging buffer: same 512 bytes as before, but correctly aligned
    // for 4-byte access (a char array reinterpreted as int has no such guarantee).
    __shared__ int temp[128];

    // Parenthesise locally so the arithmetic is right even if the macros
    // themselves expand to an unparenthesised `N / 2`.
    const int cols = (N_int);
    const int rows = (M_int);

    const int currentThread = blockIdx.x * blockDim.x + threadIdx.x;
    const bool active = currentThread < cols * rows;

    int i = 0;
    int j = 0;
    if (active) {
        i = currentThread / cols;
        j = currentThread % cols;
        temp[threadIdx.x] = a[(cols - j) * rows - i - 1];
    }

    // Every thread of the block must reach the barrier; returning early before
    // it (as the original did) is undefined behaviour in a partially active block.
    __syncthreads();

    if (active) {
        b[i * cols + j] = temp[threadIdx.x];
    }
}
// Returns true when the first N * M bytes of `b1` and `b2` are identical,
// false as soon as a differing byte is found.
bool equal(char* b1, char* b2) {
    const char* lhs = b1;
    const char* rhs = b2;
    const char* const lhsEnd = b1 + N * M;
    while (lhs != lhsEnd) {
        if (*lhs != *rhs) {
            return false;
        }
        ++lhs;
        ++rhs;
    }
    return true;
}
// Benchmark driver: fills a buffer with random bytes, runs the CPU reference and
// the GPU kernel on it, prints both timings plus a 10 x 10 preview of each
// result, and reports whether the two results are byte-identical.
// Returns 0 on completion, 1 if a host allocation fails; CUDA errors terminate
// the process through CHECK_ERROR.
int main()
{
    char* a = (char*)malloc(N * M);
    char* b = (char*)malloc(N * M);
    char* gpuResult = (char*)malloc(N * M);
    if (a == NULL || b == NULL || gpuResult == NULL) {
        cerr << "Host allocation failed" << endl;
        free(a);
        free(b);
        free(gpuResult);
        return 1;
    }

    cout << setw(30) << left << "Threads count: " << THREAD_COUNT << endl;
    cout << setw(30) << left << "Blocks count: " << BLOCK_COUTN << endl;

    // randomInit fills `a` on the host and hands back the matching device buffer.
    int* d_a = randomInit(a);
    int* d_b = NULL;
    CHECK_ERROR(cudaMalloc(&d_b, N * M));

    cout << sizeof(a[0]) << endl;
    cout << sizeof(d_a[0]) << endl;

    cudaEvent_t start, stop;
    CHECK_ERROR(cudaEventCreate(&start));
    CHECK_ERROR(cudaEventCreate(&stop));

    // Wall-clock timing of the CPU reference; steady_clock avoids assuming a
    // particular CPU frequency when converting ticks to milliseconds.
    const chrono::steady_clock::time_point cpuBegin = chrono::steady_clock::now();
    CPU(a, b);
    const chrono::steady_clock::time_point cpuEnd = chrono::steady_clock::now();
    const float cpuMs = chrono::duration<float, milli>(cpuEnd - cpuBegin).count();
    cout << setw(30) << left << "CPU time: " << cpuMs << " ms" << endl;

    float timeGPU = 0.0f;
    CHECK_ERROR(cudaEventRecord(start));
    GPU <<< BLOCK_COUTN, 128 >>> (d_a, d_b);
    // A kernel launch returns nothing; a bad configuration only shows up here.
    CHECK_ERROR(cudaGetLastError());
    CHECK_ERROR(cudaEventRecord(stop));
    // Waiting on `stop` also surfaces faults raised while the kernel was running.
    CHECK_ERROR(cudaEventSynchronize(stop));
    CHECK_ERROR(cudaEventElapsedTime(&timeGPU, start, stop));
    cout << setw(30) << left << "GPU time: " << timeGPU << " ms" << endl;

    CHECK_ERROR(cudaMemcpy(gpuResult, d_b, N * M * sizeof(char), cudaMemcpyDeviceToHost));

    CHECK_ERROR(cudaEventDestroy(start));
    CHECK_ERROR(cudaEventDestroy(stop));
    CHECK_ERROR(cudaFree(d_b));
    CHECK_ERROR(cudaFree(d_a));

    printPart(b);
    cout << endl;
    printPart(gpuResult);
    cout << (equal(b, gpuResult) ? "Results are equal!" : " Results not equal") << endl;

    free(gpuResult);
    free(b);
    free(a);
    return 0;
}
Add Comment
Please, Sign In to add comment