Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // UCSC CMPE220 Advanced Parallel Processing
- // Prof. Heiner Leitz
- // Author: Marcelo Siero.
- // Modified from code by:: Andreas Goetz (agoetz@sdsc.edu)
- // CUDA program to perform 1D stencil operation in parallel on the GPU
- //
- // /* FIXME */ COMMENTS ThAT REQUIRE ATTENTION
- #include <iostream>
- #include <stdio.h>
- #include <cuda_device_runtime_api.h>
- // define vector length, stencil radius,
- //#define N (1024*1024*512l)
- #define N 100000000
- #define RADIUS 7
- #define GRIDSIZE (N - 1 - 2 * RADIUS) / BLOCKSIZE + 1
- #define BLOCKSIZE 1000
- #define EPS 1e-9
- #define SAFE_CALL( CallInstruction ) { \
- cudaError_t cuerr = CallInstruction; \
- if(cuerr != cudaSuccess) { \
- printf("CUDA error: %s at call \"" #CallInstruction "\"\n", cudaGetErrorString(cuerr)); \
- throw "error in CUDA API function, aborting..."; \
- } \
- }
- #define SAFE_KERNEL_CALL( KernelCallInstruction ){ \
- KernelCallInstruction; \
- cudaError_t cuerr = cudaGetLastError(); \
- if(cuerr != cudaSuccess) { \
- printf("CUDA error in kernel launch: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
- throw "error in CUDA kernel launch, aborting..."; \
- } \
- cuerr = cudaDeviceSynchronize(); \
- if(cuerr != cudaSuccess) { \
- printf("CUDA error in kernel execution: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
- throw "error in CUDA kernel execution, aborting..."; \
- } \
- }
- int gridSize = GRIDSIZE;
- int blockSize = BLOCKSIZE;
- cudaEvent_t start_gpu, stop_gpu;
- void cudaErrorCheck() {
- // FIXME: Add code that finds the last error for CUDA functions performed.
- // Upon getting an error have it print out a meaningful error message as
- // provided by the CUDA API, then exit with an error exit code.
- }
- void start_timer() {
- cudaEventCreate(&start_gpu);
- cudaEventCreate(&stop_gpu);
- cudaEventRecord(start_gpu);
- // FIXME: ADD TIMING CODE, HERE, USE GLOBAL VARIABLES AS NEEDED.
- }
- float stop_timer() {
- float time_gpu = 0;
- cudaDeviceSynchronize();
- cudaEventRecord(stop_gpu);
- cudaEventSynchronize(stop_gpu);
- cudaEventElapsedTime(&time_gpu, start_gpu, stop_gpu);
- // FIXME: ADD TIMING CODE, HERE, USE GLOBAL VARIABLES AS NEEDED.
- return(time_gpu);
- }
- cudaDeviceProp prop;
- void getDeviceProperties() {
- // FIXME: Implement this function so as to acquire and print the following
- // device properties:
- // Major and minor CUDA capability, total device global memory,
- // size of shared memory per block, number of registers per block,
- // warp size, max number of threads per block, number of multi-prccessors
- // (SMs) per device, Maximum number of threads per block dimension (x,y,z),
- // Maximumum number of blocks per grid dimension (x,y,z).
- //
- // These properties can be useful to dynamically optimize programs. For
- // instance the number of SMs can be useful as a heuristic to determine
- // how many is a good number of blocks to use. The total device global
- // memory might be important to know just how much data to operate on at
- // once.
- }
- void newline() { std::cout << std::endl; };
- void printThreadSizes() {
- int noOfThreads = gridSize * blockSize;
- printf("Blocks = %d\n", gridSize); // no. of blocks to launch.
- printf("Threads per block = %d\n", blockSize); // no. of threads to launch.
- printf("Total threads = %d\n", noOfThreads);
- printf("Number of grids = %d\n", (N + noOfThreads -1)/ noOfThreads);
- }
- // -------------------------------------------------------
- // CUDA device function that performs 1D stencil operation
- // -------------------------------------------------------
- __global__ void stencil_1D(double *in, double *out, long dim){
- long gindex = blockDim.x * blockIdx.x + threadIdx.x;// - RADIUS + RADIUS;
- const int sh_mem_size = 1024 + 2 * RADIUS + 1;
- __shared__ double shmem[sh_mem_size];
- shmem[threadIdx.x] = in[gindex];
- shmem[threadIdx.x + RADIUS] = in[gindex + RADIUS];
- shmem[threadIdx.x + 2 * RADIUS] = in[gindex + 2 * RADIUS];
- __syncthreads();
- double result = 0;
- for (int offset = -RADIUS; offset <= RADIUS; offset++) {
- int index = threadIdx.x + RADIUS + offset;
- result += shmem[index];
- }
- if (gindex < dim + RADIUS) {
- out[gindex + RADIUS] = result;
- }
- }
- __global__ void stencil_1D_slow(double *in, double *out, long dim){
- long gindex = threadIdx.x + blockDim.x * blockIdx.x + RADIUS;
- // Go through all data
- // Step all threads in a block to avoid synchronization problem
- /* FIXME PART 2 - MODIFIY PROGRAM TO USE SHARED MEMORY. */
- // Apply the stencil
- double result = 0;
- for (int offset = -RADIUS; offset <= RADIUS; offset++) {
- if (gindex + offset < dim + RADIUS)
- result += in[gindex + offset];
- }
- // Store the result
- if (gindex < dim + RADIUS)
- out[gindex] = result;
- }
- #define True 1
- #define False 0
- void checkResults(double *h_in, double *h_out, int DoCheck=True) {
- // DO NOT CHANGE THIS CODE.
- // CPU calculates the stencil from data in *h_in
- // if DoCheck is True (default) it compares it with *h_out
- // to check the operation of this code.
- // If DoCheck is set to False, it can be used to time the CPU.
- int i, j, ij;
- double result;
- int err = 0;
- for (i=RADIUS; i<N; i++) { // major index.
- result = 0;
- for (j=-RADIUS; j<=RADIUS; j++){
- ij = i+j;
- if (ij < N + RADIUS)
- result += h_in[ij];
- }
- if (DoCheck) { // print out some errors for debugging purposes.
- if (abs(h_out[i] - result) >= EPS) { // count errors.
- err++;
- if (err < 8) { // help debug
- printf("h_out[%d]=%lf should be %lf\n",i,h_out[i], result);
- };
- }
- } else { // for timing purposes.
- h_out[i - RADIUS] = result;
- }
- }
- if (DoCheck) { // report results.
- if (err != 0){
- printf("Error, %d elements do not match!\n", err);
- } else {
- printf("Success! All elements match CPU result.\n");
- }
- }
- }
- // ------------
- // main program
- // ------------
- int main(void)
- {
- double *h_in, *h_out;
- double *d_in, *d_out;
- long size = (N + 2 * RADIUS) * sizeof(double);
- int i;
- // allocate host memory
- h_in = new double[N + 2 * RADIUS];
- h_out = new double[N + 2 * RADIUS];
- getDeviceProperties();
- // initialize vector
- for (i=0; i<N; i++) {
- if (i < RADIUS)
- h_in[i] = 0;
- else
- h_in[i] = (i+1);
- }
- std::cout << std::endl;
- // allocate device memory
- cudaMalloc((void **)&d_in, size);
- cudaMalloc((void **)&d_out, size);
- cudaErrorCheck();
- // copy input data to device
- cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
- cudaErrorCheck();
- // Apply stencil by launching a sufficient number of blocks
- printf("\n---------------------------\n");
- printf("Launching 1D stencil kernel\n");
- printf("---------------------------\n");
- printf("Vector length = %ld (%ld MB)\n",N,N*sizeof(double)/1024/1024);
- printf("Stencil radius = %d\n",RADIUS);
- //----------------------------------------------------------
- // CODE TO RUN AND TIME THE STENCIL KERNEL.
- //----------------------------------------------------------
- newline();
- printThreadSizes();
- start_timer();
- SAFE_KERNEL_CALL((stencil_1D<<<gridSize,blockSize>>>(d_in, d_out, N)));
- std::cout << "Elapsed time: " << stop_timer() << std::endl;
- // copy results back to host
- cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
- cudaErrorCheck();
- checkResults(h_in, h_out);
- //----------------------------------------------------------
- // deallocate device memory
- cudaFree(d_in);
- cudaFree(d_out);
- cudaErrorCheck();
- //=====================================================
- // Evaluate total time of execution with just the CPU.
- //=====================================================
- newline();
- std::cout << "Running stencil with the CPU.\n";
- start_timer();
- // Use checkResults to time CPU version of the stencil with False flag.
- checkResults(h_in, h_out, False);
- std::cout << "Elapsed time: " << stop_timer() << std::endl;
- //=====================================================
- // deallocate host memory
- free(h_in);
- free(h_out);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement