// Untitled: adjacent-difference CUDA example (shared-memory kernel and host driver).

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
#define N 1618 // available to all threads in device
//kernel
// optimized version of adjacent difference
// Adjacent difference over the first N elements of `input`:
//   result[0] = input[0]
//   result[i] = input[i] - input[i - 1]   for 0 < i < N
//
// Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE and
// gridDim.x * blockDim.x >= N. The kernel uses BLOCK_SIZE * sizeof(int) bytes
// of statically allocated shared memory, so no dynamic shared-memory size has
// to be passed at launch.
//
// result: device buffer of at least N ints, receives the differences
// input:  device buffer of at least N ints, only read
__global__ void adj_diff_naive(int *result, int *input)
{
	// the shared tile has one slot per thread, so the block must fit in it
	assert(blockDim.x <= BLOCK_SIZE);

	// shorthand for threadIdx.x
	int tx = threadIdx.x;

	// one tile element per thread of the block
	__shared__ int s_data[BLOCK_SIZE];

	// global element index handled by this thread
	unsigned int i = blockDim.x * blockIdx.x + tx;

	// threads past the end of the data must neither load nor store
	bool in_range = i < (unsigned int)N;

	if (in_range)
	{
		s_data[tx] = input[i];
	}

	// every thread of the block reaches the barrier (it sits outside the
	// bounds check), so all tile loads are visible before any neighbour
	// reads them
	__syncthreads();

	if (!in_range)
	{
		return;
	}

	int prev;
	if (tx > 0)
	{
		// predecessor was loaded by thread tx - 1 of the same block
		prev = s_data[tx - 1];
	}
	else if (i > 0)
	{
		// block boundary: the predecessor belongs to the previous block's
		// tile, so fetch it from global memory instead
		prev = input[i - 1];
	}
	else
	{
		// very first element has no predecessor; result[0] = input[0]
		prev = 0;
	}

	result[i] = s_data[tx] - prev;
}

// Prints a diagnostic and terminates the process when a CUDA runtime call
// (described by `what`) did not return cudaSuccess.
static void check_cuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Host driver: fills an array with 0, 1, 2, ..., runs adj_diff_naive on the
// device and checks the result against a host-side reference.
// Returns 0 when every element matches, EXIT_FAILURE otherwise.
int main(int argc, char** argv)
{
	(void)argc;
	(void)argv;

	// the kernel processes exactly N elements, so the host problem size
	// has to agree with it
	const int dimA = N;

	// the kernel's shared tile holds BLOCK_SIZE elements, so launch blocks
	// of exactly that many threads
	const int numThreadsPerBlock = BLOCK_SIZE;

	// ceil-div: enough blocks to cover dimA even when it is not a multiple
	// of the block size
	const int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;

	// buffers are padded to one int per launched thread so that every
	// thread of the grid has a valid slot
	const int paddedCount = numBlocks * numThreadsPerBlock;
	const size_t memSize = (size_t)paddedCount * sizeof(int);

	// host input and host copy of the device result
	int *h_a = (int *)malloc(memSize);
	int *h_b = (int *)malloc(memSize);
	if (h_a == NULL || h_b == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)memSize);
		free(h_a);
		free(h_b);
		return EXIT_FAILURE;
	}

	// device input (d_a) and device output (d_b)
	int *d_a = NULL;
	int *d_b = NULL;
	check_cuda(cudaMalloc((void **)&d_a, memSize), "cudaMalloc(d_a)");
	check_cuda(cudaMalloc((void **)&d_b, memSize), "cudaMalloc(d_b)");

	// initialize the whole padded input so no uninitialized bytes are copied
	for (int i = 0; i < paddedCount; ++i)
	{
		h_a[i] = i;
	}

	// copy host array to device array
	check_cuda(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice), "host-to-device copy");

	// launch kernel; it uses static shared memory only, so no dynamic
	// shared-memory size is passed
	dim3 dimGrid(numBlocks);
	dim3 dimBlock(numThreadsPerBlock);
	adj_diff_naive<<<dimGrid, dimBlock>>>(d_b, d_a);

	// launch-configuration errors show up here ...
	check_cuda(cudaGetLastError(), "kernel launch");
	// ... and faults during execution at the next synchronizing call
	check_cuda(cudaDeviceSynchronize(), "kernel execution");

	// device to host copy
	check_cuda(cudaMemcpy(h_b, d_b, memSize, cudaMemcpyDeviceToHost), "device-to-host copy");

	// verify against a host reference: the first element is copied, every
	// other element is the difference to its predecessor
	int mismatches = 0;
	for (int i = 0; i < dimA; ++i)
	{
		const int expected = (i == 0) ? h_a[0] : h_a[i] - h_a[i - 1];
		if (h_b[i] != expected)
		{
			// cap the output so a completely wrong result stays readable
			if (mismatches < 10)
			{
				fprintf(stderr, "mismatch at %d: got %d, expected %d\n", i, h_b[i], expected);
			}
			++mismatches;
		}
	}

	// free device memory
	check_cuda(cudaFree(d_a), "cudaFree(d_a)");
	check_cuda(cudaFree(d_b), "cudaFree(d_b)");

	// free host memory
	free(h_a);
	free(h_b);

	if (mismatches != 0)
	{
		fprintf(stderr, "%d of %d elements are wrong\n", mismatches, dimA);
		return EXIT_FAILURE;
	}

	printf("Correct!\n");

	return 0;
}