// Untitled

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "math.h"


//#define  N   		200000000
//#define  BLOCK_SIZE	1024

// Device-properties record. Nothing in the visible part of this file reads
// or writes it.
struct  cudaDeviceProp props;

// Number of float elements processed per run. main() overwrites this
// initial value before every benchmark iteration.
int N = 10000000;
// Threads per block used for the pow3 kernel launch in main().
int BLOCK_SIZE = 1024;
// Host-side buffer; allocated with malloc() in prepare().
float 	   *hArray;
// Device-side buffer; allocated with cudaMalloc() in prologue() and
// released with cudaFree() in epilogue().
float     *dArray;
// Number of blocks in the launch grid; computed in main() from N and
// BLOCK_SIZE.
int 	   blocks;

// Host reference pass: overwrites every element v of the global hArray
// (indices 0 .. N-1) with v*v*v + v*v + v, in place.
void computeUsingCPU(){
	int idx = 0;
	while( idx < N ){
		// Read the element once; the expression keeps the original
		// left-to-right evaluation order.
		const float v = hArray[idx];
		hArray[idx] = v * v * v + v * v + v;
		++idx;
	}
}

// Allocates the global host buffer hArray with N floats and fills it with
// the sequence 1, 2, ..., N.
//
// Any buffer left over from a previous call is released first, so the
// function can be called repeatedly without leaking. The process is
// terminated with a message on stderr if N is not positive or if the
// allocation fails.
void prepare(){
	// free(NULL) is a no-op, so this is safe on the very first call
	// (hArray is a zero-initialised global).
	free(hArray);
	hArray = NULL;

	if (N <= 0) {
		fprintf(stderr, "prepare: invalid element count N=%d\n", N);
		exit(EXIT_FAILURE);
	}

	// Cast before multiplying so the byte count is computed in size_t.
	const size_t bytes = sizeof(float) * (size_t)N;
	hArray = (float*) malloc(bytes);
	if (hArray == NULL) {
		fprintf(stderr, "prepare: failed to allocate %zu bytes of host memory\n", bytes);
		exit(EXIT_FAILURE);
	}

	// Every element is written here, so no separate zero-fill is needed.
	for(int i = 0; i < N; i++) {
		hArray[i] = (float)(i + 1);
	}
}

// Allocates the global device buffer dArray and uploads the N floats held
// in the global host buffer hArray into it.
//
// The transfer size is N * sizeof(float); sizeof(hArray) would only be the
// size of the pointer itself. The process is terminated with a message on
// stderr if there is no host data or if a CUDA call fails.
void prologue(void) {
	if (N <= 0 || hArray == NULL) {
		fprintf(stderr, "prologue: no host data to upload (N=%d)\n", N);
		exit(EXIT_FAILURE);
	}

	const size_t bytes = sizeof(float) * (size_t)N;

	cudaError_t err = cudaMalloc((void**)&dArray, bytes);
	if (err != cudaSuccess) {
		fprintf(stderr, "prologue: cudaMalloc of %zu bytes failed: %s\n",
		        bytes, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(dArray, hArray, bytes, cudaMemcpyHostToDevice);
	if (err != cudaSuccess) {
		fprintf(stderr, "prologue: host-to-device copy of %zu bytes failed: %s\n",
		        bytes, cudaGetErrorString(err));
		// Do not leave a half-initialised device buffer behind.
		cudaFree(dArray);
		dArray = NULL;
		exit(EXIT_FAILURE);
	}
}

// Downloads the N floats held in the global device buffer dArray into the
// global host buffer hArray, then releases dArray.
//
// The transfer size is N * sizeof(float); sizeof(hArray) would only be the
// size of the pointer itself. The device buffer is released even when the
// copy fails; any CUDA error is then reported on stderr and the process is
// terminated.
void epilogue(void) {
	// A non-positive N means there is nothing to copy back.
	const size_t bytes = sizeof(float) * (size_t)(N > 0 ? N : 0);

	cudaError_t copyErr = cudaSuccess;
	if (bytes > 0) {
		copyErr = cudaMemcpy(hArray, dArray, bytes, cudaMemcpyDeviceToHost);
	}

	const cudaError_t freeErr = cudaFree(dArray);
	dArray = NULL;	// avoid keeping a dangling device pointer around

	if (copyErr != cudaSuccess) {
		fprintf(stderr, "epilogue: device-to-host copy of %zu bytes failed: %s\n",
		        bytes, cudaGetErrorString(copyErr));
		exit(EXIT_FAILURE);
	}
	if (freeErr != cudaSuccess) {
		fprintf(stderr, "epilogue: cudaFree failed: %s\n", cudaGetErrorString(freeErr));
		exit(EXIT_FAILURE);
	}
}


// Kernel: replaces each element v = A[i], for 0 <= i < N, with
// v*v*v + v*v + v, in place.
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid; the
// grid-stride loop below lets each thread process every (gridDim.x *
// blockDim.x)-th element, so the grid does not have to cover N exactly.
//
// Parameters:
//   A - device pointer to at least N floats
//   N - number of elements to process; a non-positive N is a no-op
//
// The index is kept in 64 bits: with a 32-bit int, a grid rounded up past
// INT_MAX threads would produce a negative index that still satisfies
// "index < N" and writes out of bounds.
__global__ void pow3(float *A, int N) {
	const long long stride = (long long)gridDim.x * blockDim.x;
	const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	for (long long i = first; i < N; i += stride) {
		// Single global load per element; same evaluation order as the
		// host reference loop.
		const float v = A[i];
		A[i] = v * v * v + v * v + v;
	}
}

// Minimal interval timer built on clock() from <time.h>.
//
// Usage: call init() once, bracket the region of interest with start() and
// stop(), then read the elapsed time with getValue().
// NOTE(review): on most platforms clock() counts processor time rather than
// wall-clock time -- confirm this is what the benchmark intends.
struct Stopwatch
{
	clock_t _start;	// clock() reading captured by start()
	clock_t _stop;	// clock() reading captured by stop()

	// Resets both readings to zero, so getValue() returns 0 until a
	// measurement has been taken.
	void init()
	{
		_stop = 0;
		_start = 0;
	}

	// Records the beginning of the measured interval.
	void start(){
		_start = clock();
	}

	// Records the end of the measured interval.
	void stop(){
		_stop = clock();
	}

	// Returns (_stop - _start) converted to seconds.
	double getValue(){
		const double elapsedTicks = (double)_stop - (double)_start;
		return elapsedTicks / (double)CLOCKS_PER_SEC;
	}
};


// Benchmark driver.
//
// For step i = 1..50 the element count N is set to 1024 * i^5 and four
// phases are timed separately: host-to-device copy (prologue), the pow3
// kernel, device-to-host copy (epilogue) and the CPU loop
// (computeUsingCPU). One tab-separated line per step is appended to
// "GPUresult.txt":
//   N, copy-in, kernel, copy-out, GPU total, CPU
// and a human-readable summary is printed to stdout.
//
// The sweep stops early once 1024 * i^5 no longer fits in the int N.
// Returns 0 on success and 1 if the output file cannot be opened, no CUDA
// device is available, or the kernel fails.
int main(int argc, char** argv)
{
	(void)argc;
	(void)argv;

	// The stopwatches are plain stack objects; no allocation can fail and
	// nothing has to be released on the early-exit paths.
	struct Stopwatch copyToGPUTime;
	copyToGPUTime.init();

	struct Stopwatch computeDataUsingGPUTime;
	computeDataUsingGPUTime.init();

	struct Stopwatch copyFromGPUTime;
	copyFromGPUTime.init();

	struct Stopwatch computeUsingCPUTime;
	computeUsingCPUTime.init();

	FILE *dFile = fopen("GPUresult.txt", "w");
	if (dFile == NULL) {
		perror("GPUresult.txt");
		return 1;
	}

	int devCnt = 0;
	const cudaError_t devErr = cudaGetDeviceCount(&devCnt);
	if (devErr != cudaSuccess || devCnt == 0)
	{
		// Not an errno condition, so report it with fprintf instead of perror.
		fprintf(stderr, "No CUDA devices available -- exiting.\n");
		fclose(dFile);
		return 1;
	}

	int status = 0;

	for(int i=1; i<=50; i++)
	{
		// Compute the size in 64 bits: 1024 * i^5 exceeds INT_MAX from
		// i = 19 onwards, which would overflow the int N.
		const long long wanted = 1024LL * i * i * i * i * i;
		if (wanted > INT_MAX) {
			fprintf(stderr, "Stopping at step %d: %lld elements do not fit in int N.\n", i, wanted);
			break;
		}
		N = (int)wanted;

		// GPU part ///////////////////////////////////////

		prepare();

		copyToGPUTime.start();
		prologue();
		copyToGPUTime.stop();

		// Ceiling division written so that it cannot overflow for N near INT_MAX.
		blocks = N / BLOCK_SIZE;
		if(N % BLOCK_SIZE)
			blocks++;

		computeDataUsingGPUTime.start();
		pow3<<<blocks, BLOCK_SIZE>>>(dArray, N);
		// Launch-configuration errors are only visible via cudaGetLastError();
		// faults during execution surface at the synchronisation point.
		const cudaError_t launchErr = cudaGetLastError();
		const cudaError_t syncErr = cudaDeviceSynchronize();
		computeDataUsingGPUTime.stop();

		if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
			fprintf(stderr, "pow3 kernel failed for N=%d: %s\n", N,
			        cudaGetErrorString(launchErr != cudaSuccess ? launchErr : syncErr));
			cudaFree(dArray);
			free(hArray);
			hArray = NULL;
			status = 1;
			break;
		}

		copyFromGPUTime.start();
		epilogue();
		copyFromGPUTime.stop();


		// CPU part ///////////////////////////////////////

		// Note: hArray now holds whatever epilogue() copied back, so this
		// pass runs on post-GPU values; only its duration is used.
		computeUsingCPUTime.start();
		computeUsingCPU();
		computeUsingCPUTime.stop();


		// Print   ///////////////////////////////////////

		const double tCopyIn  = copyToGPUTime.getValue();
		const double tKernel  = computeDataUsingGPUTime.getValue();
		const double tCopyOut = copyFromGPUTime.getValue();
		const double tCpu     = computeUsingCPUTime.getValue();

		// Write directly with a fixed format string instead of passing
		// formatted data back to fprintf as a format.
		fprintf(dFile, "%d\t%.12f\t%.12f\t%.12f\t%.12f\t%.12f\n",
		        N, tCopyIn, tKernel, tCopyOut, tCopyIn + tKernel + tCopyOut, tCpu);
		printf(" GPU - copy data to memory: %.12fs\n GPU - computing: %.12fs\n GPU - copy data from memory: %.12fs\n CPU - computing: %.12fs\n",
		       tCopyIn, tKernel, tCopyOut, tCpu);

		// Release this step's host buffer so the sweep does not accumulate
		// one allocation per iteration.
		free(hArray);
		hArray = NULL;
	}


	fclose(dFile);

	// Harmless when the loop already released the buffer (free(NULL) is a no-op).
	free(hArray);
	hArray = NULL;
	return status;
}