Untitled

# include <stdlib.h>
# include <stdio.h>
# include <math.h>
# include <time.h>

#define BLOCK_SIZE 256

int main ( int argc, char *argv[] );

__global__ void reductionMaXKernel(double* devShared,double* devDiff, double* blockResults, int n) {

	unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

	// Load block in the shared memory
	if (i < n) devShared[i] = devDiff[i];
	else devShared[i] = 0;

	__syncthreads();

    // Do reduction in shared memory
	for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
		if (tid < s) {
			if(devShared[i]<devShared[i + s])
			devShared[i]=devShared[i + s];
		}
		__syncthreads();
	}

	// Write result for this block to global memory
	if (tid == 0) blockResults[blockIdx.x] = devShared[i];
}

__global__ void calulateIterration(double *devW,double *devU,double *devDiff,int M,int N){

	int i= blockIdx.x * blockDim.x +threadIdx.x;
		if(i<M*N)
        devU[i] = devW[i];
		__syncthreads();

	if((i>=N )&& (i<(M*N-N)) && ((i%N)!=0) && ((i+1)%N!=0)){
			devW[i] = ( devU[i-1] + devU[i+1] + devU[i-N] + devU[i+N] ) / 4.0;

			  devDiff[i] = fabs ( devW[i] - devU[i] );
	}
	 else if(i<M*N)
		devDiff[i]=0;
	__syncthreads();
	}


int main ( int argc, char *argv[] )
{
  int M;
  int N;
	int size;
  double diff;
  double epsilon;
  int i;
  int j;
  double mean;
  char output_file[80];
  int success;
  double *dataw;
  double *datau;
  double *devW,*devU,*devDiff,*devBlockRes,*devShared;
  double **u;
  double **w;
  int 	numBlocks;


  if (argc != 5) {
    return 1;
  } else {
    success = sscanf ( argv[1], "%d", &M );
    success += sscanf ( argv[2], "%d", &N );
    success += sscanf ( argv[3], "%lf", &epsilon );
    success += sscanf ( argv[4], "%s", output_file );

    if (success != 4) {
      return 2;
    }
  }


  datau =(double *)calloc(M * N, sizeof(double));
  dataw = (double *)calloc(M * N, sizeof(double));
  w = (double **)calloc(M, sizeof(double*));
  u = (double **)calloc(M, sizeof(double*));
  for (i = 0; i < M; i++){
    u[i] = &datau[i * N];
	 w[i] = &dataw[i * N];
  }

  size=M * N * sizeof(double);
  cudaMalloc((void **) &devW, size);
  cudaMalloc((void **) &devU, size);
  cudaMalloc((void **) &devShared, size);
  cudaMalloc((void **) &devDiff, size);

/*
  Set the boundary values, which don't change.
*/
  for ( i = 1; i < M - 1; i++ )
  {
    w[i][0] = 100.0;
  }
  for ( i = 1; i < M - 1; i++ )
  {
    w[i][N-1] = 100.0;
  }
  for ( j = 0; j < N; j++ )
  {
    w[M-1][j] = 100.0;
  }
  for ( j = 0; j < N; j++ )
  {
    w[0][j] = 0.0;
  }
/*
  Average the boundary values, to come up with a reasonable
  initial value for the interior.
*/
  mean = 0.0;
  for ( i = 1; i < M - 1; i++ )
  {
    mean = mean + w[i][0];
  }
  for ( i = 1; i < M - 1; i++ )
  {
    mean = mean + w[i][N-1];
  }
  for ( j = 0; j < N; j++ )
  {
    mean = mean + w[M-1][j];
  }
  for ( j = 0; j < N; j++ )
  {
    mean = mean + w[0][j];
  }
  mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
/*
  Initialize the interior solution to the mean value.
*/
  for ( i = 1; i < M - 1; i++ )
  {
    for ( j = 1; j < N - 1; j++ )
    {
      w[i][j] = mean;
    }
  }
/*
  iterate until the  new solution W differs from the old solution U
  by no more than EPSILON.
*/
int n = M*N;
cudaMemcpy(devW, dataw, size, cudaMemcpyHostToDevice);
numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
cudaMalloc((void **) &devBlockRes, numBlocks * sizeof(double));

  diff = epsilon;

  while ( epsilon <= diff )
  {
	 diff=0.0;
	n = M*N;
	calulateIterration<<<numBlocks, BLOCK_SIZE>>>(devW, devU,devDiff,M,N);

	reductionMaXKernel<<<numBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(devShared,devDiff, devBlockRes, n);
	 cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);
	 printf("prva red: %6.2f \n", diff);

	while (numBlocks > 1) {
	  n = numBlocks;
	  numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
	  reductionMaXKernel<<<numBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(devShared,devBlockRes, devBlockRes, n);
	  cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);
	  printf("u while d: %6.2f \n", diff);
	}
	cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);

	printf("%6.2f \n", diff);

  }

  cudaMemcpy(dataw, devW, M*N * sizeof(double), cudaMemcpyDeviceToHost);

  cudaFree(devW);
  cudaFree(devU);
  cudaFree(devDiff);

/*
  for ( i = 0; i < M; i++ )
    for ( j = 0; j < N; j++)
      printf("%6.2f ", w[i][j] );
*/
  return 0;

}