Advertisement
Guest User

Untitled

a guest
Dec 19th, 2014
184
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.38 KB | None | 0 0
  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <device_functions.h>
  4. #include <stdio.h>
  5. #include <iostream>
  6. #include <ctime>
  7. #include <stdlib.h>
  8.  
  9. #define N 16 // side length of the square matrix; host and device buffers hold N * N floats
  10. #define BLOCK_SIZE 16 // side of the shared-memory tile in the kernel and of the thread block (dimBlock) built in main
  11.  
  12. using namespace std;
  13.  
  14.  
  /*
   * Transposes the N x N row-major matrix `in` into `out` (out[r][c] = in[c][r]),
   * staging one tile per thread block in shared memory.
   *
   * Launch requirements:
   *   - 2D thread blocks that are square (blockDim.x == blockDim.y) with a side
   *     of at most BLOCK_SIZE, so every thread maps to one slot of the tile;
   *   - a 2D grid that covers the matrix: gridDim.x * blockDim.x >= N and
   *     gridDim.y * blockDim.y >= N. Threads outside the matrix do nothing.
   *   - `in` and `out` must not alias; different blocks read and write
   *     different regions without any inter-block ordering.
   *
   * Static shared memory: BLOCK_SIZE * (BLOCK_SIZE + 1) floats per block.
   */
  __global__ void transposeMatrix(float *in, float *out)
  {
      // One extra column of padding so that the column-wise reads below fall
      // into different shared-memory banks instead of serializing.
      __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE + 1];

      // Element of the input matrix handled by this thread.
      int col = blockDim.x * blockIdx.x + threadIdx.x;
      int row = blockDim.y * blockIdx.y + threadIdx.y;

      // Stay inside the matrix when the grid overhangs its edges.
      if ((col < N) && (row < N))
      {
          // Stage the tile in its original orientation: temp[local row][local col].
          temp[threadIdx.y][threadIdx.x] = in[N * row + col];
      }

      // Every thread of the block must reach this barrier (it is deliberately
      // outside the bounds check) so the whole tile is visible before reading.
      __syncthreads();

      // Destination coordinates: the block indices are swapped so the tile lands
      // in its mirrored position, while threadIdx.x still walks along a row of
      // `out` to keep the global writes contiguous.
      col = blockDim.y * blockIdx.y + threadIdx.x;
      row = blockDim.x * blockIdx.x + threadIdx.y;

      if ((col < N) && (row < N))
      {
          // Read the tile with swapped indices; this is the actual transposition.
          // (Reading back temp[threadIdx.y][threadIdx.x] would merely copy the tile.)
          out[N * row + col] = temp[threadIdx.x][threadIdx.y];
      }
  }
  37.  
  38.  
  39.  
  // Prints a diagnostic naming the failed step and terminates the program
  // if a CUDA runtime call did not return cudaSuccess.
  static void checkCuda(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /*
   * Fills an N x N matrix with pseudo-random values, transposes it on the GPU
   * with transposeMatrix, copies the result back and prints the elapsed
   * clock() ticks covering device allocation, both copies and the kernel.
   *
   * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
   * CUDA call fails.
   */
  int main(int argc, char **argv)
  {
      (void)argc;
      (void)argv;

      const size_t bytes = sizeof(float) * N * N;

      float *hostA = (float *) malloc(bytes); // input matrix A
      float *hostC = (float *) malloc(bytes); // output matrix C (transpose of A)
      float *deviceA = NULL;
      float *deviceC = NULL;

      if (hostA == NULL || hostC == NULL)
      {
          fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long) bytes);
          free(hostA);
          free(hostC);
          return EXIT_FAILURE;
      }

      // Values are multiples of 0.5 in [0, 499.5], all exactly representable as float.
      for (int i = 0; i < N * N; i++)
      {
          hostA[i] = (rand() % 1000) / 2.0f;
      }

      // count start time
      clock_t start = clock();

      // memory allocation on device
      checkCuda(cudaMalloc((void **)&deviceA, bytes), "cudaMalloc(deviceA)");
      checkCuda(cudaMalloc((void **)&deviceC, bytes), "cudaMalloc(deviceC)");

      // copy the input matrix from host to device
      checkCuda(cudaMemcpy(deviceA, hostA, bytes, cudaMemcpyHostToDevice), "copy host -> device");

      // Square blocks of BLOCK_SIZE x BLOCK_SIZE threads; the grid is the
      // ceiling of N / BLOCK_SIZE in both dimensions so the whole matrix is
      // covered without launching superfluous blocks.
      const unsigned int blocksPerSide = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
      dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
      dim3 dimGrid(blocksPerSide, blocksPerSide, 1);

      // start
      transposeMatrix <<< dimGrid, dimBlock>>>(deviceA, deviceC);
      // A launch does not return a status: query configuration errors explicitly,
      // then synchronize to surface errors raised while the kernel was running.
      checkCuda(cudaGetLastError(), "kernel launch");
      checkCuda(cudaDeviceSynchronize(), "kernel execution");

      // copy the results in GPU memory back to the CPU
      checkCuda(cudaMemcpy(hostC, deviceC, bytes, cudaMemcpyDeviceToHost), "copy device -> host");

      clock_t end = clock();
      std::cout << "work with " << N << " size array:" << (end - start) << std::endl;

      checkCuda(cudaFree(deviceA), "cudaFree(deviceA)");
      checkCuda(cudaFree(deviceC), "cudaFree(deviceC)");

      free(hostA);
      free(hostC);

      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement