Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <cuda_runtime.h>
- #include <device_launch_parameters.h>
- #define w 100
- #define h 100
- #define mw 10
- #define mh 10
- using namespace std;
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error during " << what << ": "
         << cudaGetErrorString(status) << endl;
    return false;
}

// Copies the leading elements of the row-major matrix `a` (w columns, h rows)
// into `count`, one element per thread.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// The grid may be larger than the matrix; surplus threads exit early.
//
//   a     - device buffer holding at least w*h ints (row-major).
//   b     - device buffer for the mw x mh pattern; currently unused.
//   count - device buffer holding at least (w*h - mw*mh) ints; only that many
//           elements are written, matching the size allocated in main().
__global__ void kernel(int *a, int* b, int* count)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads outside the matrix must not touch memory: without this guard a
    // column index >= w would alias into the next row, and a row index >= h
    // would run past the end of both buffers.
    if (tidx >= w || tidy >= h)
        return;

    const int idx = tidy * w + tidx;

    // `count` is shorter than `a`; skip the tail that has no output slot.
    if (idx >= w * h - mw * mh)
        return;

    // NOTE(review): disabled draft of a match-counting step, kept for reference.
    // if(a[tidy+tidx]==b[tidx]) atomicAdd(&count[tidy], 1);
    count[idx] = a[idx];
}

// Fills an h x w matrix and an mh x mw pattern on the host, uploads both,
// runs `kernel`, then downloads and prints the (w*h - mw*mh) result values.
// Returns 0 on success and 1 if any CUDA call failed.
int main(void)
{
    const int a_elems     = w * h;
    const int b_elems     = mw * mh;
    const int count_elems = w * h - mw * mh;

    const size_t a_size     = sizeof(int) * size_t(a_elems);
    const size_t b_size     = sizeof(int) * size_t(b_elems);
    const size_t count_size = sizeof(int) * size_t(count_elems);

    // The host matrix is stored as one contiguous row-major array so that a
    // single cudaMemcpy of a_size bytes transfers real data. An array of row
    // pointers (int**) is not contiguous and cannot be copied this way.
    int *a = new int[a_elems];
    for (int row = 0; row < h; row++)
        for (int col = 0; col < w; col++)
            a[row * w + col] = row * w + col;

    // Rows first, columns second, so the declaration agrees with the indexing.
    int b[mh][mw];
    for (int row = 0; row < mh; row++)
        for (int col = 0; col < mw; col++)
            b[row][col] = row * mw + col;

    int *result = new int[count_elems];

    // Initialised to 0 so the cleanup below is safe even when an allocation
    // fails part-way (cudaFree on a null pointer performs no operation).
    int *a_device = 0, *b_device = 0, *count_device = 0;

    bool ok =
        cudaOk(cudaMalloc((void **)&a_device, a_size), "cudaMalloc(a_device)") &&
        cudaOk(cudaMalloc((void **)&b_device, b_size), "cudaMalloc(b_device)") &&
        cudaOk(cudaMalloc((void **)&count_device, count_size), "cudaMalloc(count_device)") &&
        cudaOk(cudaMemcpy(a_device, a, a_size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device)") &&
        cudaOk(cudaMemcpy(b_device, b, b_size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device)");

    if (ok)
    {
        // One thread per matrix element; ceil-division rounds the grid up and
        // the kernel discards the surplus threads.
        const dim3 block(16, 16);
        const dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);
        kernel<<<grid, block>>>(a_device, b_device, count_device);

        // cudaGetLastError catches launch-configuration errors; the blocking
        // cudaMemcpy that follows surfaces any fault raised during execution.
        ok = cudaOk(cudaGetLastError(), "kernel launch") &&
             cudaOk(cudaMemcpy(result, count_device, count_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(count -> host)");
    }

    if (ok)
    {
        for (int i = 0; i < count_elems; i++)
            //if(result[i]>0)
            cout << "i= " << i << "res =" << result[i] << endl;
    }

    cudaFree(count_device);
    cudaFree(b_device);
    cudaFree(a_device);
    delete[] result;
    delete[] a;

    // Keeps the console window open until a key is pressed.
    getchar();
    return ok ? 0 : 1;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement