Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // System includes
- #define WIN32
- #include <stdio.h>
- #include <assert.h>
- // CUDA runtime
- #include <cuda_runtime.h>
- // Helper functions and utilities to work with CUDA
- #include <helper_functions.h>
- const unsigned int VEC_SIZE = 1073741824;
- //sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja z rozbieżnością wiązek
- __global__ void block_sum_rw(int* dane, int* wyniki) {
- //identyfikator pierwszej wartości czytanej zależny od identyfikatora wątku i bloku (suma 2 elementów)
- unsigned int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
- //odstęp między wartościami sumowanymi elementów
- for (unsigned int odstep = 1; odstep < 2 * blockDim.x; odstep *= 2) {
- //wykluczenie wątków w kolejnych etapach co 2,4,8,...
- if (threadIdx.x%odstep == 0)
- // test rozmiaru danych, jeśli potrzebny
- if (i + odstep < VEC_SIZE)
- dane[i] += dane[i + odstep];
- // synchronizacja gotowości danych
- __syncthreads();
- }
- // wątek 0 zapisuje wynik
- if (threadIdx.x == 0) {
- wyniki[blockIdx.x] = dane[i];
- }
- }
- //sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja bez rozbieżności wiązek
- __global__ void block_sum_brw(int* dane, int* wyniki) {
- //identyfikator pierwszej wartości czytanej zależny od identyfikatora wątku i bloku (suma 2 elementów)
- unsigned int i = blockIdx.x * 2 * blockDim.x + threadIdx.x;
- //odstęp między wartościami sumowanymi elementów
- for (unsigned int odstep = blockDim.x; odstep > 0; odstep >>= 1) {
- if (threadIdx.x < odstep) {
- dane[i] += dane[i + odstep];
- }
- __syncthreads();
- }
- // wątek 0 zapisuje wynik
- if (threadIdx.x == 0) {
- wyniki[blockIdx.x] = dane[i];
- }
- }
- void constantInit(int* data, int val)
- {
- for (unsigned int i = 0; i < VEC_SIZE; ++i)
- {
- data[i] = val;
- }
- }
- //1 - wersja RW, 3 - wersja BRW
- int reduceVector(int version, int block_size)
- {
- // Calculate required grid size (in blocks)
- int grid_size = VEC_SIZE / 2 / block_size;
- // Allocate host memory vector
- unsigned long long vec_mem = sizeof(int) * VEC_SIZE;
- unsigned long long res_mem = sizeof(int) * grid_size;
- int* h_vec = (int*)malloc(vec_mem);
- int* h_res = (int*)malloc(res_mem);
- // Initialize host memory
- constantInit(h_vec, 1);
- // Allocate device memory
- int* d_vec;
- int* d_res;
- // Initialize device memory
- cudaError_t error;
- error = cudaMalloc((void **)&d_vec, vec_mem);
- if (error != cudaSuccess)
- {
- printf("cudaMalloc d_vec returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- error = cudaMalloc((void **)&d_res, res_mem);
- if (error != cudaSuccess)
- {
- printf("cudaMalloc d_res returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- // copy host memory to device
- error = cudaMemcpy(d_vec, h_vec, vec_mem, cudaMemcpyHostToDevice);
- if (error != cudaSuccess)
- {
- printf("cudaMemcpy (d_vec,h_vec) returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- error = cudaMemcpy(d_res, h_res, res_mem, cudaMemcpyHostToDevice);
- if (error != cudaSuccess)
- {
- printf("cudaMemcpy (d_res,h_res) returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- // Create and start timer
- printf("Computing result using CUDA Kernel...\n");
- // Performs operation using CUDA kernel
- int turn = 0;
- if (version == 1)
- {
- do
- {
- if (turn == 0)
- block_sum_rw <<< grid_size, block_size >>> (d_vec, d_res);
- else
- block_sum_rw <<< grid_size, block_size >>> (d_res, d_vec);
- grid_size /= 2 * block_size;
- turn = abs(turn - 1);
- } while (grid_size >= 1);
- }
- else
- {
- do
- {
- if (turn == 0)
- block_sum_brw <<< grid_size, block_size >>> (d_vec, d_res);
- else
- block_sum_brw <<< grid_size, block_size >>> (d_res, d_vec);
- grid_size /= 2 * block_size;
- turn = abs(turn - 1);
- } while (grid_size >= 1);
- }
- printf("done\n");
- cudaDeviceSynchronize();
- // Copy result from device to host
- error = cudaMemcpy(h_vec, d_vec, vec_mem, cudaMemcpyDeviceToHost);
- if (error != cudaSuccess)
- {
- printf("cudaMemcpy (h_vec,d_vec) returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- error = cudaMemcpy(h_res, d_res, res_mem, cudaMemcpyDeviceToHost);
- if (error != cudaSuccess)
- {
- printf("cudaMemcpy (h_res,d_res) returned error code %d, line(%d)\n", error, __LINE__);
- exit(EXIT_FAILURE);
- }
- printf("Checking computed result for correctness: ");
- bool correct = true;
- if (turn == 1 && h_res[0] != VEC_SIZE)
- {
- printf("Error! Vector reduced to value: %d\n", h_res[0]);
- correct = false;
- }
- else if (turn == 0 && h_vec[0] != VEC_SIZE)
- {
- printf("Error! Vector reduced to value: %d\n", h_vec[0]);
- correct = false;
- }
- printf("%s\n", correct ? "OK" : "FAIL");
- // Clean up memory
- free(h_vec);
- free(h_res);
- cudaFree(d_vec);
- cudaFree(d_res);
- cudaDeviceReset();
- if (correct)
- {
- return EXIT_SUCCESS;
- }
- else
- {
- return EXIT_FAILURE;
- }
- }
- /**
- * Program main
- */
- int main(int argc, char **argv)
- {
- printf("[Reduction Using CUDA] - Starting...\n");
- // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
- int devID = 0;
- if (checkCmdLineFlag(argc, (const char **)argv, "device"))
- {
- devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
- cudaSetDevice(devID);
- }
- cudaError_t error;
- cudaDeviceProp deviceProp;
- error = cudaGetDevice(&devID);
- if (error != cudaSuccess)
- {
- printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
- }
- error = cudaGetDeviceProperties(&deviceProp, devID);
- if (deviceProp.computeMode == cudaComputeModeProhibited)
- {
- fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
- exit(EXIT_SUCCESS);
- }
- if (error != cudaSuccess)
- {
- printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
- }
- else
- {
- printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
- }
- // Define block size
- int block_size = 512;
- int version = 1; //1 - RW, 3 - BRW
- printf("Block size: %d\n", block_size);
- int reduce_result = reduceVector(version, block_size);
- exit(reduce_result);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement