Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cuda_runtime.h>
- #include <helper_cuda.h>
- #include <iostream>
- #include <stdio.h>
- #define N 5
// Element-wise matrix addition for N x N row-major matrices: C = A + B.
// Expects a 2D launch; the grid may be larger than the matrix because
// out-of-range threads exit early.
__global__ void matrixAdd(const int* A, const int* B, int* C) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the host launches more threads than there are elements
    // (e.g. 16x16 threads for a 5x5 matrix); unguarded threads would write
    // out of bounds.
    if (i < N && j < N) {
        // Bug fix: the original computed A = B + C, clobbering an input and
        // never writing C — the buffer main() copies back and verifies.
        C[i * N + j] = A[i * N + j] + B[i * N + j];
    }
}
int main() {
    // Host matrices: A and B are the addends, C receives the device result.
    int h_A[N][N], h_B[N][N], h_C[N][N];
    // Total byte count of one whole N x N matrix (the original kept
    // sizeof(int) * N and multiplied by N at every call site).
    const size_t bytes = (size_t)N * N * sizeof(int);

    // Initialize the inputs BEFORE any copy so no garbage is transferred.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i][j] = -1;
            h_B[i][j] = 1;
        }
    }

    // Device buffers for A, B, C — check each allocation instead of
    // silently passing a null pointer to the kernel on failure.
    int* d_A = NULL;
    int* d_B = NULL;
    int* d_C = NULL;
    if (cudaMalloc((void**)&d_A, bytes) != cudaSuccess ||
        cudaMalloc((void**)&d_B, bytes) != cudaSuccess ||
        cudaMalloc((void**)&d_C, bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }

    // Host -> device copies of the inputs only. (The original also copied
    // the uninitialized h_C to the device — removed, since the kernel
    // overwrites d_C entirely — and mislabeled these as "device to host".)
    cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);

    // 2D launch sized by ceiling division so any N is fully covered; the
    // kernel's bounds check discards the excess threads. (The original
    // hard-coded 2x2 blocks of 16x16 threads — 32x32 threads for a 5x5
    // matrix — with no guard in the kernel.)
    dim3 threads(16, 16);
    dim3 blocks((N + threads.x - 1) / threads.x,
                (N + threads.y - 1) / threads.y);
    matrixAdd<<<blocks, threads>>>(d_A, d_B, d_C);

    // A kernel launch is asynchronous and reports nothing by itself;
    // bad launch configurations surface only via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Device -> host copy of the result; a blocking cudaMemcpy also waits
    // for the kernel to finish, so no explicit synchronize is needed.
    cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Checking the answer: every element should print "-1 + 1 = 0".
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            std::cout << h_A[i][j] << " + " << h_B[i][j]
                      << " = " << h_C[i][j] << '\n';
        }
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement