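// Parallel sum reduction benchmark in CUDA: seven kernels timed with CUDA events
// and checked against a sequential host reduction.
// Input: whitespace-separated integers read from "Text.txt" in the working directory.
// Assumed build command (file name is illustrative): nvcc -O2 reduction.cu -o reduction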
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cuda.h>
#include <stdio.h>
#include <iostream>
#include <conio.h>
#include <fstream>
#include <sstream>
#include <string>
#include <iomanip>

// Pad the input array to a whole number of blocks so every thread maps to a
// valid element (12000 is not a multiple of 24 * 512 and caused out-of-bounds
// reads). Unused tail elements stay zero and do not affect the sum.
#define THREAD_NUMBER 512
#define BLOCK_NUMBER 24
#define ARRAY_SIZE (BLOCK_NUMBER * THREAD_NUMBER)

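//
// Seven parallel-sum reduction kernels, each one step in the classic NVIDIA
// "Optimizing Parallel Reduction in CUDA" sequence: interleaved addressing
// with divergent branches, interleaved addressing with strided indices,
// sequential addressing, first add during load, unrolling of the last warp,
// complete unrolling via a template parameter, and multiple adds per thread.
// Every block writes its partial sum to outputData[blockIdx.x]; the host then
// adds the per-block partials to obtain the final result.
// Note: kernel4-kernel6 load two elements per thread, so they are launched
// with half as many blocks as the others (see main()).
//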
// kernel1: interleaved addressing with a modulo test; highly divergent
// because neighbouring threads take different branches.
__global__ void kernel1(int *inputData, int *outputData) {
    // One shared slot per thread is enough; a full ARRAY_SIZE array would
    // exhaust shared memory and cripple occupancy.
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[index];
    __syncthreads();

    // reduction: the stride doubles each pass, only threads whose index is a
    // multiple of 2*i do work
    for (int i = 1; i < blockDim.x; i *= 2) {
        if (threadIndex % (2 * i) == 0) {
            data[threadIndex] += data[threadIndex + i];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel2: interleaved addressing with a strided index instead of the modulo
// test, which removes the divergent branch (but introduces shared-memory bank
// conflicts). The original version computed 'index' and then ignored it; the
// reduction below actually uses it.
__global__ void kernel2(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[globalIndex];
    __syncthreads();

    // reduction
    for (int i = 1; i < blockDim.x; i *= 2) {
        int index = 2 * i * threadIndex;
        if (index < blockDim.x) {
            data[index] += data[index + i];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel3: sequential addressing; the stride halves each pass and active
// threads access contiguous shared-memory locations (no bank conflicts).
__global__ void kernel3(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    data[threadIndex] = inputData[index];
    __syncthreads();

    // reduction
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel4: first add during the global load. Each thread sums two input
// elements, so each block covers 2 * blockDim.x elements and the kernel must
// be launched with half as many blocks (see main()).
__global__ void kernel4(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockDim.x];
    __syncthreads();

    // reduction (sequential addressing, as in kernel3)
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel5: same as kernel4 but with the last warp unrolled. The final 32
// threads run within one warp, so the __syncthreads() calls are dropped; the
// shared memory must then be accessed through a volatile pointer so the
// compiler does not cache intermediate values in registers.
// (On GPUs with independent thread scheduling, a __syncwarp() between the
// steps would also be required.)
__global__ void kernel5(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockDim.x];
    __syncthreads();

    // reduction down to 64 elements
    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (threadIndex < s) {
            data[threadIndex] += data[threadIndex + s];
        }
        __syncthreads();
    }

    // unrolled last warp
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel6: completely unrolled reduction; blockSize is a template parameter,
// so the 'if (blockSize >= ...)' tests are resolved at compile time.
// __syncthreads() must stay outside the divergent 'if (threadIndex < ...)'
// branches - calling it inside a divergent branch is undefined behaviour.
// Launched with half as many blocks, like kernel4 and kernel5.
template<unsigned int blockSize> __global__ void kernel6(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    int index = blockIdx.x * (blockSize * 2) + threadIdx.x;
    data[threadIndex] = inputData[index] + inputData[index + blockSize];
    __syncthreads();

    if (blockSize >= 512) {
        if (threadIndex < 256) { data[threadIndex] += data[threadIndex + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (threadIndex < 128) { data[threadIndex] += data[threadIndex + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (threadIndex < 64) { data[threadIndex] += data[threadIndex + 64]; }
        __syncthreads();
    }

    // unrolled last warp (see kernel5)
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

// kernel7: like kernel6 but each thread first accumulates multiple pairs of
// elements with a grid-stride loop, so the kernel is correct for any input
// size n that is a multiple of 2 * blockSize; surplus blocks contribute 0.
template<unsigned int blockSize> __global__ void kernel7(int *inputData, int *outputData, unsigned int n) {
    __shared__ int data[THREAD_NUMBER];
    int threadIndex = threadIdx.x;
    unsigned int index = blockIdx.x * (blockSize * 2) + threadIndex;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    data[threadIndex] = 0;

    while (index < n) {
        data[threadIndex] += inputData[index] + inputData[index + blockSize];
        index += gridSize;
    }
    __syncthreads();

    if (blockSize >= 512) {
        if (threadIndex < 256) { data[threadIndex] += data[threadIndex + 256]; }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (threadIndex < 128) { data[threadIndex] += data[threadIndex + 128]; }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (threadIndex < 64) { data[threadIndex] += data[threadIndex + 64]; }
        __syncthreads();
    }

    // unrolled last warp (see kernel5)
    if (threadIndex < 32) {
        volatile int *warpData = data;
        warpData[threadIndex] += warpData[threadIndex + 32];
        warpData[threadIndex] += warpData[threadIndex + 16];
        warpData[threadIndex] += warpData[threadIndex + 8];
        warpData[threadIndex] += warpData[threadIndex + 4];
        warpData[threadIndex] += warpData[threadIndex + 2];
        warpData[threadIndex] += warpData[threadIndex + 1];
    }

    if (threadIndex == 0) {
        outputData[blockIdx.x] = data[0];
    }
}

int main() {
    int a[ARRAY_SIZE] = { 0 };
    int c_1[BLOCK_NUMBER] = { 0 };
    int c_2[BLOCK_NUMBER] = { 0 };
    int c_3[BLOCK_NUMBER] = { 0 };
    int c_4[BLOCK_NUMBER] = { 0 };
    int c_5[BLOCK_NUMBER] = { 0 };
    int c_6[BLOCK_NUMBER] = { 0 };
    int c_7[BLOCK_NUMBER] = { 0 };

    std::ifstream infile("Text.txt");

    // Read at most ARRAY_SIZE integers; any unused tail of 'a' stays zero.
    int singleIn;
    int count = 0;
    while (count < ARRAY_SIZE && infile >> singleIn) {
        a[count] = singleIn;
        count++;
    }

    int *device_a = 0;
    int *device_c_1 = 0;
    int *device_c_2 = 0;
    int *device_c_3 = 0;
    int *device_c_4 = 0;
    int *device_c_5 = 0;
    int *device_c_6 = 0;
    int *device_c_7 = 0;

    cudaEvent_t start1, stop1, start2, stop2, start3, stop3, start4, stop4, start5, stop5, start6, stop6, start7, stop7;
    cudaEventCreate(&start1);
    cudaEventCreate(&stop1);
    cudaEventCreate(&start2);
    cudaEventCreate(&stop2);
    cudaEventCreate(&start3);
    cudaEventCreate(&stop3);
    cudaEventCreate(&start4);
    cudaEventCreate(&stop4);
    cudaEventCreate(&start5);
    cudaEventCreate(&stop5);
    cudaEventCreate(&start6);
    cudaEventCreate(&stop6);
    cudaEventCreate(&start7);
    cudaEventCreate(&stop7);
    cudaError_t cudaStatus;

    cudaStatus = cudaMalloc((void**)&device_a, ARRAY_SIZE * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_a) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMalloc((void**)&device_c_1, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_1) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_2, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_2) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_3, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_3) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_4, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_4) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_5, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_5) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_6, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_6) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMalloc((void**)&device_c_7, BLOCK_NUMBER * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMalloc (device_c_7) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(device_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (host -> device) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventRecord(start1);
    kernel1<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_1);
    cudaEventRecord(stop1);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel1 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_1, device_c_1, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop1);
    float millis1 = 0;
    cudaEventElapsedTime(&millis1, start1, stop1);

    // Finish the reduction on the host by adding the per-block partial sums.
    int secondAdd1 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd1 += c_1[i];
    }

    std::cout << "Kernel 1" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis1 << "\t Result: \t" << secondAdd1 << std::endl;

    cudaEventRecord(start2);
    kernel2<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_2);
    cudaEventRecord(stop2);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel2 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_2, device_c_2, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop2);
    float millis2 = 0;
    cudaEventElapsedTime(&millis2, start2, stop2);
    int secondAdd2 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd2 += c_2[i];
    }
    std::cout << "Kernel 2" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis2 << "\t Result: \t" << secondAdd2 << std::endl;

    cudaEventRecord(start3);
    kernel3<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_3);
    cudaEventRecord(stop3);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel3 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_3, device_c_3, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop3);
    float millis3 = 0;
    cudaEventElapsedTime(&millis3, start3, stop3);
    int secondAdd3 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd3 += c_3[i];
    }
    std::cout << "Kernel 3" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis3 << "\t Result: \t" << secondAdd3 << std::endl;

    // kernel4 loads two elements per thread, so it gets half as many blocks
    // and produces BLOCK_NUMBER / 2 partial sums.
    cudaEventRecord(start4);
    kernel4<<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_4);
    cudaEventRecord(stop4);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel4 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_4, device_c_4, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop4);
    float millis4 = 0;
    cudaEventElapsedTime(&millis4, start4, stop4);
    int secondAdd4 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd4 += c_4[i];
    }
    std::cout << "Kernel 4" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis4 << "\t Result: \t" << secondAdd4 << std::endl;

    cudaEventRecord(start5);
    kernel5<<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_5);
    cudaEventRecord(stop5);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel5 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_5, device_c_5, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop5);
    float millis5 = 0;
    cudaEventElapsedTime(&millis5, start5, stop5);
    int secondAdd5 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd5 += c_5[i];
    }
    std::cout << "Kernel 5" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis5 << "\t Result: \t" << secondAdd5 << std::endl;

    cudaEventRecord(start6);
    kernel6<THREAD_NUMBER><<<BLOCK_NUMBER / 2, THREAD_NUMBER>>>(device_a, device_c_6);
    cudaEventRecord(stop6);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel6 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_6, device_c_6, (BLOCK_NUMBER / 2) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop6);
    float millis6 = 0;
    cudaEventElapsedTime(&millis6, start6, stop6);
    int secondAdd6 = 0;
    for (int i = 0; i < BLOCK_NUMBER / 2; i++) {
        secondAdd6 += c_6[i];
    }
    std::cout << "Kernel 6" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis6 << "\t Result: \t" << secondAdd6 << std::endl;

    // kernel7 uses a grid-stride loop, so the full block count is fine;
    // blocks with no input simply write a partial sum of 0.
    cudaEventRecord(start7);
    kernel7<THREAD_NUMBER><<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c_7, ARRAY_SIZE);
    cudaEventRecord(stop7);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "kernel7 failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaStatus = cudaMemcpy(c_7, device_c_7, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaMemcpy (device -> host) failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventSynchronize(stop7);
    float millis7 = 0;
    cudaEventElapsedTime(&millis7, start7, stop7);
    int secondAdd7 = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        secondAdd7 += c_7[i];
    }
    std::cout << "Kernel 7" << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis7 << "\t Result: \t" << secondAdd7 << std::endl;

    // Reference: sequential reduction on the host.
    int hostReductionResult = 0;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        hostReductionResult += a[i];
    }
    std::cout << "Host reduction result: " << hostReductionResult << std::endl;

    // Release all device allocations and events before exiting.
    cudaDeviceReset();

    getch();

    return 0;
}