#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <stdio.h>
#include <iostream>
#include <ctime>
#include <stdlib.h>

#define N 16
#define BLOCK_SIZE 16

using namespace std;

__global__ void transposeMatrix(float *inputMatrix, float *outputMatrix)
{
    __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];

    // global indices of the element this thread reads
    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;

    // make sure we have not run past the edge of the matrix
    if ((xIndex < N) && (yIndex < N))
    {
        int linearIndex = N * yIndex + xIndex;
        // stage the element in shared memory; the swapped thread indices here
        // are what make the later read temp[threadIdx.x][threadIdx.y] a transpose
        temp[threadIdx.y][threadIdx.x] = inputMatrix[linearIndex];
    }

    // wait until the whole tile has been loaded
    __syncthreads();

    // global indices of the element this thread writes (block indices swapped)
    xIndex = blockDim.y * blockIdx.y + threadIdx.x;
    yIndex = blockDim.x * blockIdx.x + threadIdx.y;

    if ((xIndex < N) && (yIndex < N))
    {
        int linearIndex = N * yIndex + xIndex;
        outputMatrix[linearIndex] = temp[threadIdx.x][threadIdx.y];
    }
}
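
// On the transposed read, temp[threadIdx.x][threadIdx.y] walks down a column of the
// tile, so several threads of a warp land in the same shared-memory bank and those
// accesses get serialized. A common variant (a sketch, assuming BLOCK_SIZE stays 16)
// pads the tile with one extra column so consecutive rows start in different banks;
// only the declaration changes:
//
//     __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE + 1];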


int main(int argc, char **argv)
{
    float *hostA;   // the input matrix A
    float *hostC;   // the output (transposed) matrix C
    float *deviceA;
    float *deviceC;

    hostA = (float *) malloc(sizeof(float) * N * N);
    hostC = (float *) malloc(sizeof(float) * N * N);

    // fill the input matrix with random values
    for (int i = 0; i < N * N; i++)
    {
        hostA[i] = (rand() % 1000) / 2.0;
    }

    // start timing
    clock_t start = clock();

    // memory allocation on the device
    cudaMalloc((void **)&deviceA, sizeof(float) * N * N);
    cudaMalloc((void **)&deviceC, sizeof(float) * N * N);

    // copy the input matrix from host to device
    cudaMemcpy(deviceA, hostA, sizeof(float) * N * N, cudaMemcpyHostToDevice);

    // init the grid and block dimensions (round up so the whole matrix is covered)
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 dimGrid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);

    // launch the transpose kernel
    transposeMatrix<<<dimGrid, dimBlock>>>(deviceA, deviceC);

    cudaDeviceSynchronize();

    // copy the result from GPU memory back to the CPU
    cudaMemcpy(hostC, deviceC, sizeof(float) * N * N, cudaMemcpyDeviceToHost);

    clock_t end = clock();
    cout << "work with " << N << "x" << N << " matrix: "
         << double(end - start) / CLOCKS_PER_SEC << " s" << endl;

    cudaFree(deviceA);
    cudaFree(deviceC);

    free(hostA);
    free(hostC);

    return 0;
}
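
The program above prints only a timing, so nothing confirms that deviceC really holds the transpose of hostA. A minimal host-side check is sketched below; the helper name checkTranspose is not from the paste, and it assumes the same row-major N x N layout used above. It could be called right after the cudaMemcpy back to hostC.

// returns true if c is the transpose of the n x n row-major matrix a
static bool checkTranspose(const float *a, const float *c, int n)
{
    for (int row = 0; row < n; row++)
        for (int col = 0; col < n; col++)
            if (c[row * n + col] != a[col * n + row])
                return false;
    return true;
}

// example use inside main(), after the device-to-host copy:
//     if (!checkTranspose(hostA, hostC, N))
//         cout << "transpose result is wrong" << endl;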