Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.28 KB | None | 0 0
  1. #include "stdio.h"
  2. #include <iostream>
  3. #include<cuda.h>
  4. #include <cuda_runtime.h>
  5. #define N 32
  /*
   * Element-wise addition of two N x N row-major float matrices: c = a + b.
   *
   * Launch layout: a 2D grid of 2D blocks. Each thread handles the single
   * element whose column (nx) and row (ny) are derived from its block and
   * thread indices. Threads whose coordinates fall outside the N x N matrix
   * return without touching memory, so the grid is allowed to overshoot the
   * matrix (e.g. when N is not a multiple of the block dimensions).
   *
   * a, b : input matrices, N*N floats each (main passes cudaMalloc'd buffers)
   * c    : output matrix, N*N floats
   */
  __global__ void sum(float * a, float * b, float *c) {
    int nx = blockIdx.x * blockDim.x + threadIdx.x;  // column index
    int ny = blockIdx.y * blockDim.y + threadIdx.y;  // row index

    // Guard the grid tail: without this, surplus threads would read and
    // write past the end of the N*N buffers.
    if (nx >= N || ny >= N) {
      return;
    }

    int idx = ny * N + nx;  // flat row-major offset
    c[idx] = a[idx] + b[idx];
  }
  // Host-side N x N matrices with static storage duration.
  float a[N][N];  // first operand
  float b[N][N];  // second operand
  float c[N][N];  // result, filled from the device after the kernel runs
  // Reports a failed CUDA runtime call on stderr.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
      return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
  }

  /*
   * Fills the host matrices a and b with 1 and 2, adds them on the GPU with
   * the `sum` kernel, copies the result back into c and prints one
   * "a + b = c" line per element.
   *
   * Returns 0 on success and 1 if any CUDA call or the kernel fails.
   */
  int main() {
    const int kTile = 16;  // threads per block along each axis
    // The grid below is computed with integer division, so it only covers
    // the whole matrix when N is an exact multiple of the tile size.
    static_assert(N % kTile == 0, "N must be a multiple of the block dimension");

    const size_t bytes = sizeof(float) * N * N;

    for (int i = 0; i < N; i++) {
      for (int j = 0; j < N; j++) {
        a[i][j] = 1.0f;
        b[i][j] = 2.0f;
      }
    }

    // Start from nullptr so the unconditional cudaFree calls below are safe
    // even if an allocation failed part-way through.
    float *dev_a = nullptr;
    float *dev_b = nullptr;
    float *dev_c = nullptr;

    // Each step runs only if all previous ones succeeded (short-circuit &&).
    bool ok = cudaSucceeded(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
           && cudaSucceeded(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
           && cudaSucceeded(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
           && cudaSucceeded(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)")
           && cudaSucceeded(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");
    // dev_c is not uploaded: the kernel overwrites every element of it.

    if (ok) {
      dim3 grid(N / kTile, N / kTile, 1);
      dim3 block(kTile, kTile, 1);
      sum <<< grid, block >>> (dev_a, dev_b, dev_c);

      // A launch does not return a status: cudaGetLastError() reports a bad
      // launch configuration, the synchronize reports faults raised while
      // the kernel was running.
      ok = cudaSucceeded(cudaGetLastError(), "sum kernel launch")
        && cudaSucceeded(cudaDeviceSynchronize(), "sum kernel execution")
        && cudaSucceeded(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (!ok) {
      return 1;
    }

    for (int i = 0; i < N; i++) {
      for (int j = 0; j < N; j++) {
        // '\n' instead of std::endl: same output without a flush per line.
        std::cout << static_cast<int>(a[i][j]) << " + " << static_cast<int>(b[i][j])
                  << " = " << static_cast<int>(c[i][j]) << '\n';
      }
    }
    return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement