// Untitled

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <intrin.h>
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime_api.h>

using namespace std;

#pragma intrinsic(__rdtsc)


// Matrix dimensions, in char elements. The reference routine CPU() reads the
// input as N rows of M chars and writes the output as M rows of N chars.
#define N 1000
// Half dimensions. Fully parenthesised so that expressions such as
// `x / N_int` or `x % N_int` divide by (N / 2) instead of expanding to
// `x / N / 2` or `x % N / 2`.
#define N_int ((N) / 2)
#define M 2000
#define M_int ((M) / 2)

// Number of threads requested for the kernel launch (one per matrix byte).
#define THREAD_COUNT ((N) * (M))
// Number of 128-thread blocks needed to cover THREAD_COUNT threads.
// Integer ceiling division: the previous ceil(THREAD_COUNT/128) truncated
// before ceil() was applied, so it never rounded up, and produced a double.
// (The identifier keeps its historical spelling because other code uses it.)
#define BLOCK_COUTN ((THREAD_COUNT + 127) / 128)

// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the numeric error code, the source location, the
// runtime's error string and the text of the failing call to cerr, then
// terminates the process with exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK_ERROR(x);`
// behaves as a single statement (safe inside an unbraced if/else), and the
// temporary uses a name unlikely to shadow a variable at the call site.
#define CHECK_ERROR( call )                                                   \
do {                                                                          \
    const cudaError_t checkErrorResult_ = (call);                             \
    if ( cudaSuccess != checkErrorResult_ ) {                                 \
        cerr << "CUDA error " << checkErrorResult_ << " in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString( checkErrorResult_ ) << " (" << #call << ")" << endl;  \
        exit(1);                                                              \
    }                                                                         \
} while (0)


/*
 * Allocates a device buffer of N * M bytes, fills it with pseudo-random data
 * generated by cuRAND (XORWOW generator, fixed seed 0 so that runs are
 * reproducible) and copies the same bytes into the host buffer `a`.
 *
 * Parameters:
 *   a - host buffer with room for at least N * M bytes; receives a copy of
 *       the generated data.
 *
 * Returns:
 *   The device buffer holding the generated data. The caller owns it and
 *   must release it with cudaFree().
 *
 * Any CUDA or cuRAND failure is reported on cerr and terminates the process
 * with exit status 1.
 */
int* randomInit(char* a) {
    const size_t size = (size_t)(N) * (M);
    // cuRAND produces whole 32-bit words, so only size / 4 words are generated.
    const size_t wordCount = size / sizeof(unsigned int);

    // cuRAND reports failures through curandStatus_t, which CHECK_ERROR
    // (cudaError_t only) cannot inspect, so it is checked separately here.
    auto checkCurand = [](curandStatus_t status, const char* what) {
        if (status != CURAND_STATUS_SUCCESS) {
            cerr << "cuRAND error " << status << " in " << what << endl;
            exit(1);
        }
    };

    curandGenerator_t generator;
    checkCurand(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW),
                "curandCreateGenerator");
    checkCurand(curandSetPseudoRandomGeneratorSeed(generator, 0ULL),
                "curandSetPseudoRandomGeneratorSeed");

    int* d_a;
    CHECK_ERROR(cudaMalloc(&d_a, size));

    // When the byte count is not a multiple of 4 the generator leaves the
    // last few bytes untouched; zero the buffer so they are well defined.
    if (size % sizeof(unsigned int) != 0) {
        CHECK_ERROR(cudaMemset(d_a, 0, size));
    }

    checkCurand(curandGenerate(generator, (unsigned int*)d_a, wordCount),
                "curandGenerate");

    // Blocking copy: returns only after the generated data has reached `a`.
    CHECK_ERROR(cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost));

    checkCurand(curandDestroyGenerator(generator), "curandDestroyGenerator");
    return d_a;
}


/*
 * Prints the top-left 10 x 10 corner of `b` to cout, treating `b` as a
 * row-major buffer whose rows are N elements long. Each element is shown as
 * an integer in a 15-character field followed by " |"; every row ends with a
 * newline.
 */
void printPart(char* b) {
    const int shown = 10;
    for (int row = 0; row < shown; ++row) {
        const char* line = b + row * N;
        for (int col = 0; col < shown; ++col) {
            cout << setw(15) << static_cast<int>(line[col]) << " |";
        }
        cout << endl;
    }
}

/*
 * Host reference transform. Reads `a` as N rows of M chars and writes `b` as
 * M rows of N chars such that
 *     b[row][col] = a[N - 1 - col][M - 1 - row]
 * which is the same element the flat index (N - col) * M - row - 1 selects.
 */
void CPU(char* a, char* b) {
    for (int row = 0; row < M; ++row) {
        char* dstRow = b + row * N;
        const int srcCol = M - 1 - row;
        for (int col = 0; col < N; ++col) {
            const int srcRow = N - 1 - col;
            dstRow[col] = a[srcRow * M + srcCol];
        }
    }
}

/*
 * Device version of the CPU() reference transform.
 *
 * `a` and `b` are device buffers of N * M bytes each, passed as int* but
 * interpreted byte-wise: for every output byte position p, with
 * i = p / N and j = p % N,
 *     b_bytes[p] = a_bytes[(N - j) * M - i - 1]
 * exactly as CPU() computes it.
 *
 * Launch layout: any 1-D grid of 1-D blocks. Each thread assembles whole
 * 4-byte output words and walks the output with a stride of
 * gridDim.x * blockDim.x, so the result does not depend on the number of
 * threads launched. No shared memory is used and there is no barrier.
 *
 * Preconditions: `a` and `b` must not overlap, and `b` must be 4-byte aligned
 * because it is written through uchar4 stores.
 *
 * Changes from the previous version: that kernel moved four consecutive
 * source bytes as one unit on a half-dimension grid, which does not match the
 * byte-wise mapping of CPU(); it returned before __syncthreads() in partially
 * active blocks; and it cast char arrays to int* without an alignment
 * guarantee.
 */
__global__ void GPU(int* a, int* b) {
    const size_t cols = (N);                 // row length of the output
    const size_t rows = (M);                 // number of output rows
    const size_t totalBytes = rows * cols;
    const size_t totalWords = totalBytes / 4;

    const unsigned char* src = reinterpret_cast<const unsigned char*>(a);
    unsigned char* dstBytes = reinterpret_cast<unsigned char*>(b);
    uchar4* dstWords = reinterpret_cast<uchar4*>(b);

    // Widen before multiplying so the flat index cannot overflow 32 bits.
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // Main part: one 4-byte output word per iteration.
    for (size_t w = first; w < totalWords; w += stride) {
        unsigned char bytes[4];
        for (int k = 0; k < 4; ++k) {
            const size_t p = 4 * w + k;      // output byte position
            const size_t i = p / cols;       // output row
            const size_t j = p % cols;       // output column
            // j < cols and i < rows, so the index below never underflows.
            bytes[k] = src[(cols - j) * rows - i - 1];
        }
        // uchar4 stores x at the lowest address, so byte k lands at 4 * w + k.
        dstWords[w] = make_uchar4(bytes[0], bytes[1], bytes[2], bytes[3]);
    }

    // Tail: bytes left over when N * M is not a multiple of 4.
    for (size_t p = 4 * totalWords + first; p < totalBytes; p += stride) {
        const size_t i = p / cols;
        const size_t j = p % cols;
        dstBytes[p] = src[(cols - j) * rows - i - 1];
    }
}


/*
 * Compares the first N * M chars of `b1` and `b2`.
 * Returns true when every pair of elements matches, false as soon as a
 * differing pair is found.
 */
bool equal(char* b1, char* b2) {
    const int count = N * M;
    int idx = 0;
    while (idx < count && b1[idx] == b2[idx]) {
        ++idx;
    }
    return idx == count;
}

/*
 * Program entry point.
 *
 * Generates N * M random bytes on the device, runs the transform once on the
 * host (CPU) and once on the device (GPU), prints the elapsed time of each,
 * prints the top-left corner of both results and reports whether they are
 * byte-for-byte identical.
 *
 * Returns 0 on normal completion and 1 if a host allocation fails. CUDA
 * failures terminate the process through CHECK_ERROR.
 */
int main()
{
    const size_t bytes = (size_t)(N) * (M);

    // Pageable host buffers. (Pinned allocation via cudaMallocHost was tried
    // previously and is not used.)
    char* a = (char*)malloc(bytes);
    char* b = (char*)malloc(bytes);
    char* gpuResult = (char*)malloc(bytes);
    if (a == NULL || b == NULL || gpuResult == NULL) {
        cerr << "Host allocation of " << bytes << " bytes failed" << endl;
        free(a);
        free(b);
        free(gpuResult);
        return 1;
    }

    cout << setw(30) << left << "Threads count: " << THREAD_COUNT << endl;
    cout << setw(30) << left << "Blocks count: " << BLOCK_COUTN << endl;

    // randomInit fills `a` and returns the device copy, which main now owns.
    int* d_a = randomInit(a);
    int* d_b;

    CHECK_ERROR(cudaMalloc(&d_b, bytes));

    cout << sizeof(a[0]) << endl;
    cout << sizeof(d_a[0]) << endl;

    cudaEvent_t start, stop;
    CHECK_ERROR(cudaEventCreate(&start));
    CHECK_ERROR(cudaEventCreate(&stop));

    // Host timing with a monotonic clock; this does not depend on knowing
    // the CPU frequency, unlike a raw cycle counter.
    const auto cpuStart = std::chrono::steady_clock::now();
    CPU(a, b);
    const auto cpuStop = std::chrono::steady_clock::now();
    const float cpuMs = std::chrono::duration<float, std::milli>(cpuStop - cpuStart).count();
    cout << setw(30) << left << "CPU time: " << cpuMs << " ms" << endl;

    float timeGPU = 0.0f;

    // The block-count macro may evaluate to a floating-point value, so make
    // the conversion to a launch dimension explicit.
    const unsigned int blocks = static_cast<unsigned int>(BLOCK_COUTN);

    CHECK_ERROR(cudaEventRecord(start));
    GPU <<< blocks, 128 >>> (d_a, d_b);
    // A kernel launch returns no status; a bad launch configuration is only
    // visible through cudaGetLastError().
    CHECK_ERROR(cudaGetLastError());
    CHECK_ERROR(cudaEventRecord(stop));
    // Waiting on `stop` also surfaces errors raised while the kernel ran.
    CHECK_ERROR(cudaEventSynchronize(stop));
    CHECK_ERROR(cudaEventElapsedTime(&timeGPU, start, stop));
    cout << setw(30) << left << "GPU time: " << timeGPU << " ms" << endl;
    CHECK_ERROR(cudaMemcpy(gpuResult, d_b, bytes, cudaMemcpyDeviceToHost));
    CHECK_ERROR(cudaEventDestroy(start));
    CHECK_ERROR(cudaEventDestroy(stop));
    CHECK_ERROR(cudaFree(d_b));
    CHECK_ERROR(cudaFree(d_a));

    printPart(b);
    cout << endl;
    printPart(gpuResult);

    cout << (equal(b, gpuResult) ? "Results are equal!" : " Results not equal") << endl;

    free(gpuResult);
    free(b);
    free(a);
    return 0;
}