// Untitled

#include "stdio.h"
#include <iostream>
#include<cuda.h>
#include <cuda_runtime.h>
// Edge length of the square matrices; also the row stride used for indexing.
#define N 32

// Element-wise sum of two N x N row-major float matrices: c = a + b.
//
// Launch layout: a 2D grid of 2D blocks. Each thread handles one element,
// with the x dimension selecting the column (nx) and the y dimension the
// row (ny). Threads whose coordinates fall outside the N x N matrix return
// without touching memory, so the grid is allowed to overshoot the matrix.
// No shared memory is used.
//
// a, b and c must each point to at least N * N floats that the device can
// dereference. a and b are only read; c is only written.
__global__ void sum(const float * a, const float * b, float *c) {
	int nx = blockIdx.x * blockDim.x + threadIdx.x;
	int ny = blockIdx.y * blockDim.y + threadIdx.y;
	// Guard the grid tail: without it, surplus threads would read and write
	// beyond the N * N elements of the buffers.
	if (nx >= N || ny >= N) {
		return;
	}
	int idx = ny * N + nx;
	c[idx] = a[idx] + b[idx];
}
// Host-side operands (a, b) and result (c), each an N x N row-major matrix.
float a[N][N], b[N][N], c[N][N];

// Reports a failed CUDA runtime call on stderr.
// `what` names the operation for the message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
	if (status == cudaSuccess) {
		return true;
	}
	std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
	return false;
}

// Fills a with 1 and b with 2, adds them on the GPU with the `sum` kernel and
// prints one "x + y = z" line per element.
// Returns 0 on success and 1 if any CUDA call fails (the failure is reported
// on stderr and nothing is printed to stdout).
int main() {
	// Threads per block along each axis (16 x 16 = 256 threads per block).
	const int kTile = 16;
	// The grid is computed with integer division, so it only covers the whole
	// matrix when N is an exact multiple of the block edge.
	static_assert(N % kTile == 0, "N must be a multiple of the block edge (16)");

	const size_t bytes = sizeof(float) * N * N;
	// Start as null so the cudaFree calls below are safe even if an
	// allocation never happened (cudaFree on a null pointer is a no-op).
	float *dev_a = nullptr, *dev_b = nullptr, *dev_c = nullptr;

	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			a[i][j] = 1.0f;
			b[i][j] = 2.0f;
		}
	}

	// Each step runs only if all previous ones succeeded (short-circuit &&).
	// dev_c is not uploaded: the kernel writes every one of its N * N
	// elements before the buffer is read back.
	bool ok = cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
		&& cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
		&& cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
		&& cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)")
		&& cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

	if (ok) {
		const dim3 grid(N / kTile, N / kTile, 1);
		const dim3 block(kTile, kTile, 1);
		sum <<< grid, block >>> (dev_a, dev_b, dev_c);
		// A launch does not return a status: configuration errors show up in
		// cudaGetLastError(), execution faults at the next synchronization.
		ok = cudaOk(cudaGetLastError(), "sum kernel launch")
			&& cudaOk(cudaDeviceSynchronize(), "sum kernel execution")
			&& cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");
	}

	if (ok) {
		for (int i = 0; i < N; i++) {
			for (int j = 0; j < N; j++) {
				// '\n' instead of std::endl: same output without flushing
				// stdout once per element.
				std::cout << (int)a[i][j] << " + " << (int)b[i][j] << " = " << (int)c[i][j] << '\n';
			}
		}
	}

	// Released on every path, including after a failure.
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	return ok ? 0 : 1;
}