Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <assert.h>
- #define BLOCK_SIZE 16
- #define N 1618 // available to all threads in device
// Kernel: adjacent difference staged through a shared-memory tile.
//
// For every global index i > 0 this writes
//     result[i] = input[i] - input[i - 1]
// and for i == 0, which has no predecessor, it writes result[0] = input[0].
//
// Launch requirements (the kernel takes no length argument, so the launch
// configuration is the only bound on the indices it touches):
//   - 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE;
//   - gridDim.x * blockDim.x must not exceed the number of ints allocated
//     behind both `input` and `result`.
// Shared memory: BLOCK_SIZE * sizeof(int) bytes, statically allocated; no
// dynamic shared memory has to be passed at launch.
//
// result: device pointer, receives one difference per launched thread
// input:  device pointer, one element is read per launched thread (plus the
//         last element of the previous block at each block boundary)
__global__ void adj_diff_naive(int *result, int *input)
{
    // the tile below is indexed by threadIdx.x, so a wider block would
    // write past its end
    assert(blockDim.x <= BLOCK_SIZE);

    // shorthand for threadIdx.x
    const unsigned int tx = threadIdx.x;

    // one tile element per thread of the block
    __shared__ int s_data[BLOCK_SIZE];

    // each thread stages its own element into the tile
    const unsigned int i = blockDim.x * blockIdx.x + tx;
    s_data[tx] = input[i];

    // every load into the tile must be complete before a neighbour reads it
    __syncthreads();

    if (tx > 0)
    {
        // predecessor was staged by the previous thread of this block
        result[i] = s_data[tx] - s_data[tx - 1];
    }
    else if (i > 0)
    {
        // first thread of a block: the predecessor belongs to the previous
        // block and is not in this tile, so fetch it from global memory
        result[i] = s_data[tx] - input[i - 1];
    }
    else
    {
        // very first element has no predecessor; pass the input through
        result[i] = s_data[tx];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver: fills an array with 0, 1, 2, ..., runs adj_diff_naive on the
// device and checks every difference that has a predecessor against a host
// reference.
//
// Returns 0 and prints "Correct!" when all checks pass; returns 1 after
// printing a diagnostic on stderr when an allocation, a CUDA call, or the
// verification fails.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // number of meaningful elements
    //int dimA = 256 * 1024; // 256K elements (1MB total)
    const int dimA = BLOCK_SIZE;

    // the kernel's shared tile holds BLOCK_SIZE elements, so launch blocks of
    // exactly that width
    const int numThreadsPerBlock = BLOCK_SIZE;

    // ceil-div so a dimA that is not a multiple of the block size is still
    // fully covered
    const int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;

    // the kernel has no length argument and touches one element per launched
    // thread, so the buffers are padded up to the full grid size
    const int paddedCount = numBlocks * numThreadsPerBlock;
    const size_t memSize = (size_t)paddedCount * sizeof(int);

    // host buffers (input and result kept apart so the reference can be
    // computed from the untouched input)
    int *h_in = (int *)malloc(memSize);
    int *h_out = (int *)malloc(memSize);

    // device buffers: d_a is the kernel input, d_b the kernel output
    int *d_a = NULL;
    int *d_b = NULL;

    bool ok = false;
    do
    {
        if (h_in == NULL || h_out == NULL)
        {
            fprintf(stderr, "host allocation of %lu bytes failed\n",
                    (unsigned long)memSize);
            break;
        }
        if (!cudaOk(cudaMalloc((void **)&d_a, memSize), "cudaMalloc(d_a)")) break;
        if (!cudaOk(cudaMalloc((void **)&d_b, memSize), "cudaMalloc(d_b)")) break;

        // initialize input on the host; the padding tail is zeroed so the
        // device never reads uninitialized memory
        for (int i = 0; i < paddedCount; ++i)
        {
            h_in[i] = (i < dimA) ? i : 0;
        }

        // copy host array to device array
        if (!cudaOk(cudaMemcpy(d_a, h_in, memSize, cudaMemcpyHostToDevice),
                    "host-to-device copy")) break;

        // launch kernel; the tile is static, so no dynamic shared memory
        adj_diff_naive<<<numBlocks, numThreadsPerBlock>>>(d_b, d_a);
        if (!cudaOk(cudaGetLastError(), "kernel launch")) break;

        // block until the device has completed; also surfaces in-kernel faults
        if (!cudaOk(cudaDeviceSynchronize(), "kernel execution")) break;

        // device to host copy
        if (!cudaOk(cudaMemcpy(h_out, d_b, memSize, cudaMemcpyDeviceToHost),
                    "device-to-host copy")) break;

        // verify against a host reference; element 0 has no predecessor and
        // is therefore not checked here
        ok = true;
        for (int i = 1; i < dimA; ++i)
        {
            const int expected = h_in[i] - h_in[i - 1];
            if (h_out[i] != expected)
            {
                fprintf(stderr, "mismatch at %d: got %d, expected %d\n",
                        i, h_out[i], expected);
                ok = false;
                break;
            }
        }
    } while (0);

    // free device memory (cudaFree(NULL) is a no-op)
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_in);
    free(h_out);

    if (!ok)
    {
        return 1;
    }

    // reaching this point means every checked result was correct
    printf("Correct!\n");
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement