// Untitled

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Dimensions of the host matrix `a` built in main(): h rows of w ints each.
// `w` is also the row stride the kernel uses when flattening (row, column) indices.
#define w 100
#define h 100
// Dimensions of the smaller host matrix `b` built in main().
#define mw 10
#define mh 10
// Pulls cout/endl (and the rest of std) into the global namespace for this file.
using namespace std;
/*
 * Copies a[i] into count[i] for every i in [0, h*w - mh*mw), one element per thread.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension indexes the column
 * (0..w-1) and the y dimension indexes the row; the flat element index is
 * row * w + column. The grid may overshoot the data in either dimension:
 * surplus threads return without touching memory.
 *
 * a     : device buffer holding at least h*w - mh*mw ints (read).
 * b     : device buffer; currently unused, kept so the launch signature
 *         stays stable.
 * count : device buffer holding at least h*w - mh*mw ints (written).
 *
 * No shared memory is used and no inter-thread synchronization is required.
 */
__global__ void kernel(int *a, int* b, int* count)
{
    (void)b;  // not read yet; silences unused-parameter warnings

    // Length of `count` as allocated by the host; derived from the same macros.
    const int countLen = h * w - mh * mw;
    // Number of rows needed to cover countLen elements (last row may be partial).
    const int rowCount = (countLen + w - 1) / w;

    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;  // column
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;  // row

    // Reject padding threads. Checking tidx against w also prevents a column
    // beyond the row width from aliasing into the next row; checking tidy
    // first keeps tidy * w small enough that it cannot overflow.
    if (tidx >= w || tidy >= rowCount)
        return;

    const int idx = tidy * w + tidx;
    if (idx >= countLen)  // tail of a partial last row
        return;

    // Earlier experiment, left disabled:
    //  if(a[tidy+tidx]==b[tidx])  atomicAdd(&count[tidy], 1);
    count[idx] = a[idx];
}

// Prints a diagnostic naming the failed operation and terminates the process
// when a CUDA runtime call did not return cudaSuccess. Device allocations are
// reclaimed by the driver at process exit.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an h x w matrix `a` and an mh x mw matrix `b` with their own flat
 * indices, uploads both to the GPU, runs `kernel`, downloads the
 * (h*w - mh*mw)-element result and prints every entry.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(void)
{
    const int count_len = h * w - mh * mw;
    if (count_len <= 0) {
        // A zero-sized grid is an invalid launch configuration; nothing to do.
        cerr << "Nothing to compute: h*w must exceed mh*mw" << endl;
        return EXIT_FAILURE;
    }

    const size_t a_size     = sizeof(int) * size_t(w * h);
    const size_t b_size     = sizeof(int) * size_t(mw * mh);
    const size_t count_size = sizeof(int) * size_t(count_len);

    // Host matrices are stored contiguously in row-major order so a single
    // cudaMemcpy transfers the actual data (a table of row pointers would
    // upload host addresses instead of values).
    vector<int> a(size_t(w) * h);
    for (int i = 0; i < h; i++)
        for (int j = 0; j < w; j++)
            a[size_t(i) * w + j] = i * w + j;

    vector<int> b(size_t(mw) * mh);
    for (int i = 0; i < mh; i++)
        for (int j = 0; j < mw; j++)
            b[size_t(i) * mw + j] = i * mw + j;

    int *a_device = NULL, *b_device = NULL, *count_device = NULL;
    checkCuda(cudaMalloc((void **)&a_device, a_size), "cudaMalloc a_device");
    checkCuda(cudaMalloc((void **)&b_device, b_size), "cudaMalloc b_device");
    checkCuda(cudaMalloc((void **)&count_device, count_size), "cudaMalloc count_device");

    checkCuda(cudaMemcpy(a_device, &a[0], a_size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(b_device, &b[0], b_size, cudaMemcpyHostToDevice), "copy b to device");

    // x covers the w columns, y covers the rows that hold count_len elements;
    // both are rounded up and the kernel discards the surplus threads.
    const int rows = (count_len + w - 1) / w;
    const dim3 block(16, 16);
    const dim3 grid((w + block.x - 1) / block.x, (rows + block.y - 1) / block.y);

    kernel<<<grid, block>>>(a_device, b_device, count_device);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // faults raised while running

    vector<int> result(count_len);
    checkCuda(cudaMemcpy(&result[0], count_device, count_size, cudaMemcpyDeviceToHost),
              "copy count to host");

    for (int i = 0; i < count_len; i++)
        //if(result[i]>0)
        cout << "i= " << i << "res =" << result[i] << '\n';
    cout.flush();

    checkCuda(cudaFree(count_device), "cudaFree count_device");
    checkCuda(cudaFree(b_device), "cudaFree b_device");
    checkCuda(cudaFree(a_device), "cudaFree a_device");

    getchar();  // keep the console window open until a key is pressed
    return 0;
}