Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <device_functions.h>
- #include <stdio.h>
- #include <iostream>
- #include <ctime>
- #include <stdlib.h>
- #define N 16
- #define BLOCK_SIZE 16
- using namespace std;
// Transposes the N x N matrix `inputMatrix` into `outputMatrix` using a
// BLOCK_SIZE x BLOCK_SIZE shared-memory tile.
// Expects a 2D launch with blockDim = (BLOCK_SIZE, BLOCK_SIZE); the grid may
// be larger than the matrix — out-of-range threads are bounds-checked out.
__global__ void transposeMatrix(float *inputMatrix, float *outputMatrix)
{
    // +1 padding on the inner dimension avoids shared-memory bank conflicts
    // on the strided (column-wise) accesses below.
    __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Global coordinates of the element this thread loads.
    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;

    // Bounds check: do not read past the edge of the matrix.
    if ((xIndex < N) && (yIndex < N))
    {
        int linearIndex = N * yIndex + xIndex;
        // Store into the tile; each thread writes temp[tx][ty].
        temp[threadIdx.x][threadIdx.y] = inputMatrix[linearIndex];
    }

    // Barrier outside any divergent branch: all threads must finish writing
    // the tile before any thread reads another thread's element.
    __syncthreads();

    // Coordinates in the transposed matrix: block indices are swapped while
    // thread indices keep writes coalesced (threadIdx.x stays the fast axis).
    xIndex = blockDim.y * blockIdx.y + threadIdx.x;
    yIndex = blockDim.x * blockIdx.x + threadIdx.y;
    if ((xIndex < N) && (yIndex < N))
    {
        int linearIndex = N * yIndex + xIndex;
        // BUG FIX: the original read temp[threadIdx.x][threadIdx.y] — the very
        // slot this same thread wrote — so no element exchange happened inside
        // a tile and only the tile diagonal was transposed correctly. Reading
        // the swapped slot temp[ty][tx] completes the in-tile transpose.
        outputMatrix[linearIndex] = temp[threadIdx.y][threadIdx.x];
    }
}
// Aborts with a message if a CUDA API call failed.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << endl;
        exit(EXIT_FAILURE);
    }
}

// Fills an N x N host matrix with random values, transposes it on the GPU,
// copies the result back, and prints the elapsed host (wall-clock) time in
// clock ticks. Returns 0 on success; aborts on any allocation/CUDA failure.
int main(int argc, char **argv)
{
    float *hostA;   // input matrix (host)
    float *hostC;   // transposed output matrix (host)
    float *deviceA;
    float *deviceC;
    const size_t bytes = sizeof(float) * N * N;

    // BUG FIX: the original also did `hostB = malloc(...)` into an undeclared
    // variable — a stray leftover line that both failed to compile and leaked.
    hostA = (float *) malloc(bytes);
    hostC = (float *) malloc(bytes);
    if (hostA == NULL || hostC == NULL)
    {
        cerr << "host allocation failed" << endl;
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N * N; i++)
    {
        hostA[i] = (rand() % 1000) / 2.0f;  // float literal: avoid double math
    }

    // Host-side timing; includes allocation, transfers, and the kernel.
    clock_t start = clock();

    // Device memory allocation.
    checkCuda(cudaMalloc((void **)&deviceA, bytes), "cudaMalloc deviceA");
    checkCuda(cudaMalloc((void **)&deviceC, bytes), "cudaMalloc deviceC");

    // Copy the input matrix from host to device.
    checkCuda(cudaMemcpy(deviceA, hostA, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy H2D");

    // Grid/block configuration: ceil(N / BLOCK_SIZE) blocks per axis.
    // BUG FIX: the original used ((N+BLOCK_SIZE-1)/BLOCK_SIZE)+1 for x and
    // ((N+BLOCK_SIZE-1)/32)+1 for y — a wrong divisor plus spurious +1 blocks.
    const int blocksPerAxis = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 dimGrid(blocksPerAxis, blocksPerAxis, 1);

    transposeMatrix <<< dimGrid, dimBlock>>>(deviceA, deviceC);
    checkCuda(cudaGetLastError(), "kernel launch");          // bad-config errors
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // async exec errors

    // Copy the result back to the host.
    checkCuda(cudaMemcpy(hostC, deviceC, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy D2H");

    clock_t end = clock();
    cout << "work with " << N << " size array:" << (end - start) << endl;

    cudaFree(deviceA);
    cudaFree(deviceC);
    free(hostA);
    free(hostC);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement