Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # include <stdlib.h>
- # include <stdio.h>
- # include <math.h>
- # include <time.h>
- #define BLOCK_SIZE 256
- int main ( int argc, char *argv[] );
- __global__ void reductionMaXKernel(double* devShared,double* devDiff, double* blockResults, int n) {
- unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
- // Load block in the shared memory
- if (i < n) devShared[i] = devDiff[i];
- else devShared[i] = 0;
- __syncthreads();
- // Do reduction in shared memory
- for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
- if (tid < s) {
- if(devShared[i]<devShared[i + s])
- devShared[i]=devShared[i + s];
- }
- __syncthreads();
- }
- // Write result for this block to global memory
- if (tid == 0) blockResults[blockIdx.x] = devShared[i];
- }
- __global__ void calulateIterration(double *devW,double *devU,double *devDiff,int M,int N){
- int i= blockIdx.x * blockDim.x +threadIdx.x;
- if(i<M*N)
- devU[i] = devW[i];
- __syncthreads();
- if((i>=N )&& (i<(M*N-N)) && ((i%N)!=0) && ((i+1)%N!=0)){
- devW[i] = ( devU[i-1] + devU[i+1] + devU[i-N] + devU[i+N] ) / 4.0;
- devDiff[i] = fabs ( devW[i] - devU[i] );
- }
- else if(i<M*N)
- devDiff[i]=0;
- __syncthreads();
- }
- int main ( int argc, char *argv[] )
- {
- int M;
- int N;
- int size;
- double diff;
- double epsilon;
- int i;
- int j;
- double mean;
- char output_file[80];
- int success;
- double *dataw;
- double *datau;
- double *devW,*devU,*devDiff,*devBlockRes,*devShared;
- double **u;
- double **w;
- int numBlocks;
- if (argc != 5) {
- return 1;
- } else {
- success = sscanf ( argv[1], "%d", &M );
- success += sscanf ( argv[2], "%d", &N );
- success += sscanf ( argv[3], "%lf", &epsilon );
- success += sscanf ( argv[4], "%s", output_file );
- if (success != 4) {
- return 2;
- }
- }
- datau =(double *)calloc(M * N, sizeof(double));
- dataw = (double *)calloc(M * N, sizeof(double));
- w = (double **)calloc(M, sizeof(double*));
- u = (double **)calloc(M, sizeof(double*));
- for (i = 0; i < M; i++){
- u[i] = &datau[i * N];
- w[i] = &dataw[i * N];
- }
- size=M * N * sizeof(double);
- cudaMalloc((void **) &devW, size);
- cudaMalloc((void **) &devU, size);
- cudaMalloc((void **) &devShared, size);
- cudaMalloc((void **) &devDiff, size);
- /*
- Set the boundary values, which don't change.
- */
- for ( i = 1; i < M - 1; i++ )
- {
- w[i][0] = 100.0;
- }
- for ( i = 1; i < M - 1; i++ )
- {
- w[i][N-1] = 100.0;
- }
- for ( j = 0; j < N; j++ )
- {
- w[M-1][j] = 100.0;
- }
- for ( j = 0; j < N; j++ )
- {
- w[0][j] = 0.0;
- }
- /*
- Average the boundary values, to come up with a reasonable
- initial value for the interior.
- */
- mean = 0.0;
- for ( i = 1; i < M - 1; i++ )
- {
- mean = mean + w[i][0];
- }
- for ( i = 1; i < M - 1; i++ )
- {
- mean = mean + w[i][N-1];
- }
- for ( j = 0; j < N; j++ )
- {
- mean = mean + w[M-1][j];
- }
- for ( j = 0; j < N; j++ )
- {
- mean = mean + w[0][j];
- }
- mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
- /*
- Initialize the interior solution to the mean value.
- */
- for ( i = 1; i < M - 1; i++ )
- {
- for ( j = 1; j < N - 1; j++ )
- {
- w[i][j] = mean;
- }
- }
- /*
- iterate until the new solution W differs from the old solution U
- by no more than EPSILON.
- */
- int n = M*N;
- cudaMemcpy(devW, dataw, size, cudaMemcpyHostToDevice);
- numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
- cudaMalloc((void **) &devBlockRes, numBlocks * sizeof(double));
- diff = epsilon;
- while ( epsilon <= diff )
- {
- diff=0.0;
- n = M*N;
- calulateIterration<<<numBlocks, BLOCK_SIZE>>>(devW, devU,devDiff,M,N);
- reductionMaXKernel<<<numBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(devShared,devDiff, devBlockRes, n);
- cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);
- printf("prva red: %6.2f \n", diff);
- while (numBlocks > 1) {
- n = numBlocks;
- numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
- reductionMaXKernel<<<numBlocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(double)>>>(devShared,devBlockRes, devBlockRes, n);
- cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);
- printf("u while d: %6.2f \n", diff);
- }
- cudaMemcpy(&diff, devBlockRes, sizeof(double), cudaMemcpyDeviceToHost);
- printf("%6.2f \n", diff);
- }
- cudaMemcpy(dataw, devW, M*N * sizeof(double), cudaMemcpyDeviceToHost);
- cudaFree(devW);
- cudaFree(devU);
- cudaFree(devDiff);
- /*
- for ( i = 0; i < M; i++ )
- for ( j = 0; j < N; j++)
- printf("%6.2f ", w[i][j] );
- */
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement