// CUDA: midpoint-rule numerical integration of cosh(x) over [1, 8], timed with CUDA events.


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <math.h>
using namespace std;

// Capacity, in floats, of the host and device sample buffers.
#define SIZE_DARR 1000000

// Host-side mirrors of the device buffers: hres holds the per-interval
// samples copied back from dres, hh holds the single step-width value
// copied back from dh.
float hres[SIZE_DARR] = {};
float hh[1] = {};

// Device pointers, allocated and released inside experiment().
float* dres = nullptr;
float* dh = nullptr;

// Elapsed time reported by the CUDA events of the most recent experiment.
float gpuTime = 0.0f;

// Launch configuration: enough 1024-thread blocks to give every one of the
// SIZE_DARR elements its own thread (ceiling division).
int threadsPerBlock = 1024;
int blocksPerGrid = (SIZE_DARR + threadsPerBlock - 1) / threadsPerBlock;


/**
 * Samples the integrand cosh(x) for a midpoint-rule integration over [1, 8].
 *
 * The interval is split into n equal sub-intervals of width h = (8 - 1) / n.
 * For every index i in [0, n) the kernel ADDS cosh(1 + h * (i + 0.5)) to
 * dres[i] and stores h in dh[0]. The caller obtains the integral by summing
 * dres[0..n) and multiplying by dh[0].
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
 * and advances by the total number of launched threads, so any grid size
 * covers all n elements.
 *
 * @param n    number of sub-intervals; dres must hold at least n floats.
 *             n <= 0 processes no elements (h is then not a usable step).
 * @param dres device buffer of samples; the kernel accumulates into it, so it
 *             must be zeroed before the launch to obtain a fresh result.
 * @param dh   device pointer to a single float that receives the step width.
 */
__global__ void CalcIntegral(int n, float* dres, float* dh)
{
	const float a = 1.0f;
	const float b = 8.0f;

	// Keep the step in a register instead of re-reading dh[0] from global
	// memory on every loop iteration.
	const float h = (b - a) / n;

	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const int stride = blockDim.x * gridDim.x;

	// A single thread publishes the step for the host; having every thread
	// store the same value to the same address is redundant traffic.
	if (tid == 0)
	{
		dh[0] = h;
	}

	while (tid < n)
	{
		// 0.5f (not 0.5) keeps the abscissa arithmetic in single precision.
		const float x = a + h * (tid + 0.5f);
		// (exp(x) + exp(-x)) / 2 is cosh(x).
		dres[tid] += coshf(x);
		tid += stride;
	}
}

// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
	if (status != cudaSuccess)
	{
		cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
		std::exit(EXIT_FAILURE);
	}
}

/**
 * Runs one timed integration with n sub-intervals and prints the kernel time.
 *
 * Allocates the device buffers dres (SIZE_DARR floats) and dh (one float),
 * zeroes them, launches CalcIntegral once between two CUDA events, copies the
 * samples and the step width back into the globals hres and hh, and prints
 * the elapsed time reported by the events. The device buffers are released
 * before returning. Any CUDA runtime failure terminates the program.
 *
 * @param n number of sub-intervals; must be in [1, SIZE_DARR]. Values outside
 *          that range are rejected with a message on stderr, because both the
 *          kernel and the host summation index buffers of SIZE_DARR floats.
 */
void experiment(int n)
{
	if (n <= 0 || n > SIZE_DARR)
	{
		cerr << "experiment: n must be in [1, " << SIZE_DARR << "], got " << n << endl;
		return;
	}

	const size_t resBytes = sizeof(float) * SIZE_DARR;

	checkCuda(cudaMalloc((void**)&dres, resBytes), "cudaMalloc(dres)");
	checkCuda(cudaMalloc((void**)&dh, sizeof(float)), "cudaMalloc(dh)");

	// The kernel accumulates into dres, so it has to start from zero on every
	// run. Zeroing on the device avoids re-uploading hres, which still holds
	// the samples of the previous experiment.
	checkCuda(cudaMemset(dres, 0, resBytes), "cudaMemset(dres)");
	checkCuda(cudaMemset(dh, 0, sizeof(float)), "cudaMemset(dh)");

	cudaEvent_t start, stop;
	gpuTime = 0.0f;

	cout << endl << n << " Элементов" << endl << "Время (ms) \n";

	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

	CalcIntegral<<<blocksPerGrid, threadsPerBlock>>> (n, dres, dh);
	// A kernel launch returns no status itself; a bad configuration only
	// shows up through cudaGetLastError().
	checkCuda(cudaGetLastError(), "CalcIntegral launch");

	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	// Waiting on the stop event also waits for the kernel to finish.
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

	checkCuda(cudaEventElapsedTime(&gpuTime, start, stop), "cudaEventElapsedTime");

	checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
	checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

	checkCuda(cudaMemcpy(hres, dres, resBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost), "cudaMemcpy(hres)");
	checkCuda(cudaMemcpy(hh, dh, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost), "cudaMemcpy(hh)");

	// Finish the midpoint rule on the host: sum of samples times step width.
	// Accumulating in double limits rounding error over up to SIZE_DARR terms.
	double sampleSum = 0.0;
	for (int i = 0; i < n; i++)
	{
		sampleSum += hres[i];
	}
	const float result = (float)(sampleSum * hh[0]);
	// NOTE(review): the integral value is computed but not reported, as in
	// the original; only the timing is printed.
	(void)result;

	cout.width(10);
	cout.setf(ios::right);
	cout << gpuTime << endl;

	checkCuda(cudaFree(dres), "cudaFree(dres)");
	checkCuda(cudaFree(dh), "cudaFree(dh)");
}

// Entry point: prints the fixed launch configuration, then runs experiment()
// for 100, 1000, ... elements, multiplying by 10 while the count does not
// exceed SIZE_DARR, and finally waits for a character on stdin.
int main()
{
	// Select the "rus" character-type locale for the console output.
	setlocale(LC_CTYPE, "rus");

	cout << endl
		<< "CUDA kernel launch with " << blocksPerGrid
		<< " blocks of " << threadsPerBlock << " threads"
		<< endl;

	int elementCount = 100;
	while (elementCount <= SIZE_DARR)
	{
		experiment(elementCount);
		elementCount *= 10;
	}

	// Block until a character is read so the output stays on screen.
	getchar();
	return 0;
}