Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Side length of the square matrices handled by this program.
// Every matrix is stored as a flat array of N * N ints.
const int N = 4;

/*
 * Matrix product kernel: res = a * b for N x N int matrices stored row by row.
 *
 * Each thread computes one output element. Its global x coordinate selects
 * the column of b / res and its global y coordinate selects the row of a / res,
 * so the kernel expects a 2D launch whose grid covers at least N x N threads.
 * Threads that fall outside the N x N matrix do nothing.
 *
 * a   - left operand, N * N ints (read only)
 * b   - right operand, N * N ints (read only)
 * res - output, N * N ints; element (row, col) is written at res[row * N + col]
 *
 * No shared memory is used.
 */
__global__ void scalar(int *a, int *b, int *res)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against grids that over-cover the matrix; without it the extra
    // threads would read and write past the end of the buffers.
    if (col >= N || row >= N)
        return;

    // Dot product of row `row` of a with column `col` of b.
    int sum = 0;
    for (int k = 0; k < N; k++)
        sum += a[row * N + k] * b[k * N + col];

    // Index the output with the matrix width N rather than the launch width,
    // so the layout of res does not depend on the grid configuration.
    res[row * N + col] = sum;
}
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Prints an N x N matrix stored row by row, one matrix row per output line.
static void printMatrix(const int *m)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("%d ", m[i * N + j]);
        printf("\n");
    }
}

/*
 * Multiplies two N x N int matrices on the GPU with the `scalar` kernel,
 * prints the result, then recomputes the product on the CPU and prints that
 * as well so the two outputs can be compared by eye.
 *
 * Returns 0 on success; exits with a failure status if a CUDA call fails or
 * the launch configuration cannot tile the matrix exactly.
 */
int main()
{
    const dim3 Nth(2, 2);

    // The grid is sized by exact division, so the block dimensions must
    // divide N; otherwise part of the result would never be computed.
    if (N % Nth.x != 0 || N % Nth.y != 0)
    {
        fprintf(stderr, "N (%d) must be a multiple of the block size (%u x %u)\n",
                N, Nth.x, Nth.y);
        return EXIT_FAILURE;
    }
    // Each grid dimension is derived from the matching block dimension.
    const dim3 Nbl(N / Nth.x, N / Nth.y);

    const size_t bytes = sizeof(int) * N * N;

    int a[N * N], b[N * N], res[N * N], res2[N * N];
    for (int i = 0; i < N * N; i++)
    {
        a[i] = i;
        b[i] = i + 1;
    }

    int *dev_a = NULL, *dev_b = NULL, *dev_res = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_res, bytes));

    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    scalar<<<Nbl, Nth>>>(dev_a, dev_b, dev_res);
    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Errors raised while the kernel runs are reported by this copy, which
    // also brings the result back to the host.
    CUDA_CHECK(cudaMemcpy(res, dev_res, bytes, cudaMemcpyDeviceToHost));

    // Device buffers are no longer needed once the result is on the host.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_res));

    printMatrix(res);

    // CPU reference: res2 = a * b.
    for (int row = 0; row < N; row++)
    {
        for (int col = 0; col < N; col++)
        {
            int s = 0;
            for (int k = 0; k < N; k++)
                s += a[row * N + k] * b[k * N + col];
            res2[row * N + col] = s;
        }
    }

    printMatrix(res2);

    // Keep the console window open until the user presses a key.
    getchar();
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement