Untitled

// System includes
#define WIN32
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

const unsigned int VEC_SIZE = 1073741824;

//sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja z rozbieżnością wiązek
__global__ void block_sum_rw(int* dane, int* wyniki) {

	//identyfikator pierwszej wartości czytanej zależny od  identyfikatora wątku i bloku (suma 2 elementów)
	unsigned int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

	//odstęp między wartościami sumowanymi elementów
	for (unsigned int odstep = 1; odstep < 2 * blockDim.x; odstep *= 2) {
		//wykluczenie wątków w kolejnych etapach co 2,4,8,...
		if (threadIdx.x%odstep == 0)
			// test rozmiaru danych, jeśli potrzebny
			if (i + odstep < VEC_SIZE)
				dane[i] += dane[i + odstep];
		// synchronizacja gotowości danych
		__syncthreads();
	}

	//  wątek 0 zapisuje wynik
	if (threadIdx.x == 0) {
		wyniki[blockIdx.x] = dane[i];
	}
}

//sumowanie (w pamięci globalnej) przez blok wątków 2*blockDim.x elementów - wersja bez rozbieżności wiązek
__global__ void block_sum_brw(int* dane, int* wyniki) {

	//identyfikator pierwszej wartości czytanej zależny od  identyfikatora wątku i bloku (suma 2 elementów)
	unsigned int i = blockIdx.x * 2 * blockDim.x + threadIdx.x;

	//odstęp między wartościami sumowanymi elementów
	for (unsigned int odstep = blockDim.x; odstep > 0; odstep >>= 1) {
		if (threadIdx.x < odstep) {
			dane[i] += dane[i + odstep];
		}
		__syncthreads();
	}

	//  wątek 0 zapisuje wynik
	if (threadIdx.x == 0) {
		wyniki[blockIdx.x] = dane[i];
	}
}

void constantInit(int* data, int val)
{
	for (unsigned int i = 0; i < VEC_SIZE; ++i)
	{
		data[i] = val;
	}
}

//1 - wersja RW, 3 - wersja BRW
int reduceVector(int version, int block_size)
{
	// Calculate required grid size (in blocks)
	int grid_size = VEC_SIZE / 2 / block_size;

	// Allocate host memory vector
	unsigned long long vec_mem = sizeof(int) * VEC_SIZE;
	unsigned long long res_mem = sizeof(int) * grid_size;
	int* h_vec = (int*)malloc(vec_mem);
	int* h_res = (int*)malloc(res_mem);

	// Initialize host memory
	constantInit(h_vec, 1);

	// Allocate device memory
	int* d_vec;
	int* d_res;

	// Initialize device memory
	cudaError_t error;
	error = cudaMalloc((void **)&d_vec, vec_mem);
	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_vec returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}
	error = cudaMalloc((void **)&d_res, res_mem);
	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_res returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	// copy host memory to device
	error = cudaMemcpy(d_vec, h_vec, vec_mem, cudaMemcpyHostToDevice);
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (d_vec,h_vec) returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}
	error = cudaMemcpy(d_res, h_res, res_mem, cudaMemcpyHostToDevice);
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (d_res,h_res) returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	// Create and start timer
	printf("Computing result using CUDA Kernel...\n");

	// Performs operation using CUDA kernel
	int turn = 0;
		if (version == 1)
		{
			do
			{
				if (turn == 0)
					block_sum_rw <<< grid_size, block_size >>> (d_vec, d_res);
				else
					block_sum_rw <<< grid_size, block_size >>> (d_res, d_vec);
				grid_size /= 2 * block_size;
				turn = abs(turn - 1);
			} while (grid_size >= 1);
		}
		else
		{
			do
			{
				if (turn == 0)
					block_sum_brw <<< grid_size, block_size >>> (d_vec, d_res);
				else
					block_sum_brw <<< grid_size, block_size >>> (d_res, d_vec);
				grid_size /= 2 * block_size;
				turn = abs(turn - 1);
			} while (grid_size >= 1);
		}

	printf("done\n");

	cudaDeviceSynchronize();

	// Copy result from device to host
	error = cudaMemcpy(h_vec, d_vec, vec_mem, cudaMemcpyDeviceToHost);
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (h_vec,d_vec) returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}
	error = cudaMemcpy(h_res, d_res, res_mem, cudaMemcpyDeviceToHost);
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (h_res,d_res) returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	printf("Checking computed result for correctness: ");
	bool correct = true;

	if (turn == 1 && h_res[0] != VEC_SIZE)
	{
		printf("Error! Vector reduced to value: %d\n", h_res[0]);
		correct = false;
	}
	else if (turn == 0 && h_vec[0] != VEC_SIZE)
	{
		printf("Error! Vector reduced to value: %d\n", h_vec[0]);
		correct = false;
	}

	printf("%s\n", correct ? "OK" : "FAIL");

	// Clean up memory
	free(h_vec);
	free(h_res);
	cudaFree(d_vec);
	cudaFree(d_res);

	cudaDeviceReset();

	if (correct)
	{
		return EXIT_SUCCESS;
	}
	else
	{
		return EXIT_FAILURE;
	}
}

/**
 * Program main
 */
int main(int argc, char **argv)
{
	printf("[Reduction Using CUDA] - Starting...\n");

	// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
	int devID = 0;

	if (checkCmdLineFlag(argc, (const char **)argv, "device"))
	{
		devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
		cudaSetDevice(devID);
	}

	cudaError_t error;
	cudaDeviceProp deviceProp;
	error = cudaGetDevice(&devID);

	if (error != cudaSuccess)
	{
		printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
	}

	error = cudaGetDeviceProperties(&deviceProp, devID);

	if (deviceProp.computeMode == cudaComputeModeProhibited)
	{
		fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
		exit(EXIT_SUCCESS);
	}

	if (error != cudaSuccess)
	{
		printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
	}
	else
	{
		printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
	}

	// Define block size
	int block_size = 512;
	int version = 1; //1 - RW, 3 - BRW

	printf("Block size: %d\n", block_size);

	int reduce_result = reduceVector(version, block_size);

	exit(reduce_result);
}