// Untitled

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <stdio.h>
#include <iostream>
#include <ctime>
#include <stdlib.h>

#define N 16
#define BLOCK_SIZE 16

using namespace std;


/*
 * Transposes the N x N row-major matrix `in` into `out`, i.e.
 * out[r * N + c] = in[c * N + r], staging one BLOCK_SIZE x BLOCK_SIZE tile
 * per thread block in shared memory.
 *
 * Launch requirements:
 *   - blockDim.x == blockDim.y == BLOCK_SIZE (square block matching the tile);
 *   - the grid must span at least ceil(N / BLOCK_SIZE) blocks in both x and y;
 *     threads that fall outside the matrix are masked by the bounds checks;
 *   - `in` and `out` must be distinct device buffers of at least N * N floats
 *     (blocks are not synchronized with each other, so in-place use would race).
 *
 * Uses BLOCK_SIZE * (BLOCK_SIZE + 1) * sizeof(float) bytes of static shared memory.
 */
__global__ void transposeMatrix(float *in, float *out)
{
    // One extra column of padding so that the column-wise reads in the second
    // phase do not all map to the same shared-memory bank.
    __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Phase 1: position of this thread's element in the input matrix.
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Stay inside the matrix when N is not a multiple of the block size.
    if ((col < N) && (row < N))
    {
        // Stage the element in the tile using the same (row, col) layout as the input.
        temp[threadIdx.y][threadIdx.x] = in[N * row + col];
    }

    // Every thread of the block must reach the barrier (it is deliberately kept
    // outside the bounds check) before any thread reads another thread's element.
    __syncthreads();

    // Phase 2: the tile lands at the mirrored block position in the output,
    // so the roles of blockIdx.x and blockIdx.y are exchanged.
    col = blockDim.y * blockIdx.y + threadIdx.x;
    row = blockDim.x * blockIdx.x + threadIdx.y;

    if ((col < N) && (row < N))
    {
        // Read the tile with swapped indices: this is what actually transposes
        // the elements inside the tile (reading with the write indices would
        // merely copy the data).
        out[N * row + col] = temp[threadIdx.x][threadIdx.y];
    }
}


// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
// `what` is a short description of the operation used in the error message.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N x N host matrix with pseudo-random values, runs transposeMatrix
 * on the device, copies the result back and prints the elapsed clock() ticks
 * for the whole device round trip (allocation, copies and kernel).
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main(int argc, char **argv)
{
    const size_t bytes = sizeof(float) * N * N;

    float *hostA = (float *) malloc(bytes);  // The A matrix (input)
    float *hostC = (float *) malloc(bytes);  // The output C matrix
    float *deviceA = NULL;
    float *deviceC = NULL;

    if (hostA == NULL || hostC == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long) bytes);
        free(hostA);  // free(NULL) is a no-op, so this is safe for either failure
        free(hostC);
        return EXIT_FAILURE;
    }

    // Float literal keeps the arithmetic in single precision.
    for (int i = 0; i < N * N; i++)
    {
        hostA[i] = (rand() % 1000) / 2.0f;
    }

    // count start time
    clock_t start = clock();

    // memory allocation on device
    checkCuda(cudaMalloc((void **)&deviceA, bytes), "cudaMalloc(deviceA)");
    checkCuda(cudaMalloc((void **)&deviceC, bytes), "cudaMalloc(deviceC)");

    // copy the input matrix from host to device
    checkCuda(cudaMemcpy(deviceA, hostA, bytes, cudaMemcpyHostToDevice),
              "host-to-device copy");

    // init the grid and block dimensions: ceil(N / BLOCK_SIZE) blocks per axis
    const unsigned int blocksPerAxis = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 dimGrid(blocksPerAxis, blocksPerAxis, 1);

    // start
    transposeMatrix <<< dimGrid, dimBlock>>>(deviceA, deviceC);

    // A launch reports configuration errors only through cudaGetLastError();
    // faults during execution surface at the next synchronizing call.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // copy the results in GPU memory back to the CPU
    checkCuda(cudaMemcpy(hostC, deviceC, bytes, cudaMemcpyDeviceToHost),
              "device-to-host copy");

    clock_t end = clock();
    cout << "work with " << N << " size array:" << (end - start) << endl;

    checkCuda(cudaFree(deviceA), "cudaFree(deviceA)");
    checkCuda(cudaFree(deviceC), "cudaFree(deviceC)");

    free(hostA);
    free(hostC);

    return 0;
}