Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "stdio.h"
- #include <iostream>
- #include<cuda.h>
- #include <cuda_runtime.h>
- #define N 32
/**
 * Element-wise matrix addition kernel: c = a + b for N x N row-major matrices.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread handles the single
 * element at column `nx` (x dimension) and row `ny` (y dimension). Threads
 * whose coordinates fall outside the N x N matrix do nothing, so the grid
 * may safely overshoot the matrix (e.g. when rounded up with ceil-division).
 *
 * No shared memory is used and no inter-thread synchronization is required.
 *
 * @param a  first input; must be dereferenceable on the device and hold at
 *           least N*N floats
 * @param b  second input; same requirements as `a`
 * @param c  output; same requirements as `a`; element (ny, nx) is overwritten
 */
__global__ void sum(float * a, float * b, float *c) {
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;  // column index
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;  // row index

    // Bounds guard: without it, any launch whose grid extends past the matrix
    // would read and write out of bounds.
    if (nx >= N || ny >= N) {
        return;
    }

    // Row-major flattening; threadIdx.x drives the fastest-varying index, so
    // consecutive threads in x touch consecutive addresses.
    const int idx = ny * N + nx;
    c[idx] = a[idx] + b[idx];
}
// Host-side N x N matrices with static storage duration (zero-initialized
// before main runs). main() fills `a` and `b` as the operands and copies the
// device result back into `c`.
float a[N][N];
float b[N][N];
float c[N][N];
/**
 * Host driver: fills the global matrices `a` and `b` with 1 and 2, adds them
 * on the GPU with the `sum` kernel, copies the result into `c`, and prints one
 * "a + b = c" line per element.
 *
 * Every CUDA runtime call is checked. On the first failure a diagnostic is
 * written to std::cerr, any device buffers allocated so far are released, and
 * the program exits with status 1.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel fails.
 */
int main() {
    constexpr int kBlockEdge = 16;  // threads per block along both x and y

    // The grid is computed with exact division, so the whole matrix is only
    // covered when N is a multiple of the block edge. Fail at compile time
    // instead of silently leaving rows/columns unprocessed.
    static_assert(N % kBlockEdge == 0,
                  "N must be a multiple of the block edge (16)");

    const size_t bytes = sizeof(float) * N * N;

    // Start as null so the cleanup path can call cudaFree unconditionally
    // (cudaFree on a null pointer performs no operation).
    float *dev_a = nullptr;
    float *dev_b = nullptr;
    float *dev_c = nullptr;

    // Releases whatever has been allocated so far. Used only on error paths;
    // return codes are deliberately ignored because a failure is already
    // being reported.
    auto releaseDevice = [&]() {
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    };

    // Returns true when `status` is cudaSuccess; otherwise reports the error,
    // frees device memory and returns false so the caller can bail out.
    auto ok = [&](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status)
                  << std::endl;
        releaseDevice();
        return false;
    };

    if (!ok(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")) return 1;
    if (!ok(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")) return 1;
    if (!ok(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")) return 1;

    // Host inputs: every element of a is 1, every element of b is 2.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = 1.0f;
            b[i][j] = 2.0f;
        }
    }

    // Only the inputs are uploaded; dev_c is fully overwritten by the kernel,
    // so copying the host `c` to the device beforehand is unnecessary.
    if (!ok(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy(a -> dev_a)")) return 1;
    if (!ok(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
            "cudaMemcpy(b -> dev_b)")) return 1;

    const dim3 grid(N / kBlockEdge, N / kBlockEdge, 1);
    const dim3 block(kBlockEdge, kBlockEdge, 1);
    sum <<< grid, block >>> (dev_a, dev_b, dev_c);

    // A launch does not return a status itself: configuration errors show up
    // via cudaGetLastError, execution faults at the next synchronizing call.
    if (!ok(cudaGetLastError(), "sum kernel launch")) return 1;
    if (!ok(cudaDeviceSynchronize(), "sum kernel execution")) return 1;

    // cudaMemcpy here is blocking, so no further synchronization is needed
    // before reading `c` on the host.
    if (!ok(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy(dev_c -> c)")) return 1;

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // '\n' instead of std::endl: same output, without flushing the
            // stream on each of the N*N lines.
            std::cout << static_cast<int>(a[i][j]) << " + "
                      << static_cast<int>(b[i][j]) << " = "
                      << static_cast<int>(c[i][j]) << '\n';
        }
    }

    // Normal teardown: free all three buffers first, then report any failure.
    // `ok` is not used here because its cleanup step would free the same
    // pointers a second time.
    const cudaError_t freeStatus[3] = {
        cudaFree(dev_a),
        cudaFree(dev_b),
        cudaFree(dev_c),
    };
    for (int k = 0; k < 3; k++) {
        if (freeStatus[k] != cudaSuccess) {
            std::cerr << "cudaFree failed: "
                      << cudaGetErrorString(freeStatus[k]) << std::endl;
            return 1;
        }
    }
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement