Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //nvcc heat_2d.cu -o heat_2d -Xlinker -framework,OpenGL,-framework,GLUT -I../common
#include <stdio.h>
#include <stdlib.h>

#include "cuda.h"
#include "common/book.h"
#include "common/cpu_anim.h"
- #define DIM 64
- #define PIXELS_PER_CELL 8
- // these exist on the GPU side
- texture<char,2> texIn;
- texture<char,2> texOut;
/*
 * Computes the next generation of the DIM x DIM cell grid.
 *
 * Each thread handles one cell: it fetches the cell and its eight
 * neighbours from the texture selected by dstOut, applies the Game of
 * Life rule (a live cell survives with 2 or 3 live neighbours, a dead
 * cell becomes live with exactly 3) and stores 1 or 0 into dst.
 *
 *   dst    - device buffer receiving the new generation, DIM cells per row
 *   dstOut - non-zero: read the current generation through texIn;
 *            zero:     read it through texOut
 *
 * Expects a 2D launch covering at least DIM x DIM threads; surplus
 * threads exit without touching memory.
 *
 * Fetches one step outside the grid (x-1, y-1, x+1, y+1 on the border)
 * are resolved by the texture unit according to the address mode of the
 * texture reference, which this file does not set explicitly.
 */
__global__ void blend_kernel( char *dst, char dstOut ) {
    // map from threadIdx/BlockIdx to cell position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall outside the grid have no cell to update.
    if (x >= DIM || y >= DIM) {
        return;
    }

    // The cell buffers are laid out with DIM cells per row (the textures
    // are bound with a pitch of DIM bytes and bool_to_color indexes with
    // x + y * DIM), so the row stride must be DIM rather than the launch
    // width.
    int offset = x + y * DIM;

    char t, l, c, r, b;
    char tl, tr, bl, br;
    if (dstOut) {
        t  = tex2D(texIn, x,   y-1);
        l  = tex2D(texIn, x-1, y  );
        c  = tex2D(texIn, x,   y  );
        r  = tex2D(texIn, x+1, y  );
        b  = tex2D(texIn, x,   y+1);
        tl = tex2D(texIn, x-1, y-1);
        tr = tex2D(texIn, x+1, y-1);
        bl = tex2D(texIn, x-1, y+1);
        br = tex2D(texIn, x+1, y+1);
    } else {
        t  = tex2D(texOut, x,   y-1);
        l  = tex2D(texOut, x-1, y  );
        c  = tex2D(texOut, x,   y  );
        r  = tex2D(texOut, x+1, y  );
        b  = tex2D(texOut, x,   y+1);
        tl = tex2D(texOut, x-1, y-1);
        tr = tex2D(texOut, x+1, y-1);
        bl = tex2D(texOut, x-1, y+1);
        br = tex2D(texOut, x+1, y+1);
    }

    /* number of live neighbours: 0 - 8 when every cell holds 0 or 1 */
    char count = t + l + r + b + tl + tr + bl + br;

    char alive;
    if (c) {
        alive = (count == 2 || count == 3) ? 1 : 0;   // survival
    } else {
        alive = (count == 3) ? 1 : 0;                 // birth
    }
    dst[offset] = alive;
}
// State shared between main() and the animation callbacks
// (anim_gpu / anim_exit).
struct DataBlock {
    unsigned char *output_bitmap;   // device pixel buffer, copied to the host bitmap every frame
    char          *dev_inSrc;       // device cell buffer bound to texIn
    char          *dev_outSrc;      // device cell buffer bound to texOut
    CPUAnimBitmap *bitmap;          // host-side animation bitmap
    cudaEvent_t    start;           // recorded at the beginning of a frame
    cudaEvent_t    stop;            // recorded at the end of a frame
    float          totalTime;       // accumulated frame time in milliseconds
    float          frames;          // number of frames timed so far
};
__device__ void set_color( unsigned char * data, int offset,
                           unsigned char r, unsigned char g,
                           unsigned char b, unsigned char a );

/*
 * Writes one four-byte pixel (r, g, b, a in that order) into data.
 * offset counts pixels, not bytes: the pixel starts at byte offset * 4.
 */
__device__ void set_color( unsigned char * data, int offset, unsigned char r, unsigned char g, unsigned char b, unsigned char a ) {
    unsigned char *pixel = data + offset*4;
    pixel[0] = r;
    pixel[1] = g;
    pixel[2] = b;
    pixel[3] = a;
}
/*
 * Expands every cell of the DIM x DIM grid into a
 * PIXELS_PER_CELL x PIXELS_PER_CELL square of pixels: white (255,255,255)
 * for a non-zero cell, black (0,0,0) otherwise, alpha always 255.
 *
 *   optr   - device pixel buffer, 4 bytes per pixel,
 *            DIM * PIXELS_PER_CELL pixels per row
 *   outSrc - device cell buffer, DIM cells per row
 *
 * Expects a 2D launch covering at least DIM x DIM threads (one thread per
 * cell); surplus threads exit without touching memory.
 */
__global__ void bool_to_color( unsigned char *optr, const char *outSrc ) {
    // map from threadIdx/BlockIdx to cell position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall outside the grid have no cell to paint.
    if (x >= DIM || y >= DIM) {
        return;
    }

    const int rowPixels = DIM * PIXELS_PER_CELL;

    // pixel index of the top-left corner of this cell's square
    const int offset = (x * PIXELS_PER_CELL) + ((y * PIXELS_PER_CELL) * rowPixels);

    // The colour is the same for the whole square, so pick it once.
    const unsigned char shade = outSrc[x + y * DIM] ? 255 : 0;

    for( int row = 0; row < PIXELS_PER_CELL; row++ ) {
        for( int col = 0; col < PIXELS_PER_CELL; col++ ) {
            set_color( optr, offset + col + (row * rowPixels), shade, shade, shade, 255 );
        }
    }
}
// Selects the buffer pair for the next frame: non-zero means "read texIn,
// write dev_outSrc", zero means "read texOut, write dev_inSrc".
volatile char dstOut = true;

/*
 * Per-frame callback: advances the simulation by one generation, converts
 * the new generation to pixels, copies them into the host bitmap and
 * prints the running average frame time.
 *
 *   d     - shared animation state
 *   ticks - unused
 */
void anim_gpu( DataBlock *d, int ticks ) {
    (void)ticks;

    /* Start Timer */
    HANDLE_ERROR( cudaEventRecord( d->start, 0 ) );

    /* launch configuration: 16x16 threads per block, DIM/16 blocks per
       axis (covers the grid exactly while DIM is a multiple of 16) */
    dim3 blocks(DIM/16,DIM/16);
    dim3 threads(16,16);
    CPUAnimBitmap *bitmap = d->bitmap;

    /* read the flag once so the buffer choice and the kernel argument
       cannot disagree */
    const char writeToOut = dstOut;

    /* figure out which buffer to write to */
    char * out = writeToOut ? d->dev_outSrc : d->dev_inSrc;

    /* process state */
    blend_kernel<<<blocks,threads>>>( out, writeToOut );
    /* a kernel launch returns no status of its own; fetch it explicitly */
    HANDLE_ERROR( cudaGetLastError() );

    /* flip buffer */
    dstOut = !writeToOut;

    /* Turn Cells into pixels */
    bool_to_color<<<blocks,threads>>>( d->output_bitmap, out );
    HANDLE_ERROR( cudaGetLastError() );

    /* Copy pixels back to host */
    HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost ) );

    /* Stop Timer */
    HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( d->stop ) );
    float elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, d->start, d->stop ) );
    d->totalTime += elapsedTime;
    ++d->frames;
    printf( "Average Time per frame: %3.1f ms\n", d->totalTime/d->frames );
}
/*
 * Exit callback: releases the GPU resources referenced by d.
 *
 * Unbinds both textures, frees the two cell buffers and the pixel buffer,
 * and destroys the timing events. Every call is checked with
 * HANDLE_ERROR.
 */
void anim_exit( DataBlock *d ) {
    HANDLE_ERROR( cudaUnbindTexture( texIn ) );
    HANDLE_ERROR( cudaUnbindTexture( texOut ) );
    HANDLE_ERROR( cudaFree( d->dev_inSrc ) );
    HANDLE_ERROR( cudaFree( d->dev_outSrc ) );
    /* the pixel buffer is allocated with cudaMalloc in main() as well */
    HANDLE_ERROR( cudaFree( d->output_bitmap ) );
    HANDLE_ERROR( cudaEventDestroy( d->start ) );
    HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}
/*
 * Allocates a zero-filled host scratch buffer of imageSize bytes and
 * releases it again.
 *
 * NOTE(review): the buffer is never copied anywhere, so this function has
 * no effect outside itself - in particular it does not clear any device
 * memory. It receives no DataBlock, so it cannot reach the device
 * buffers; confirm whether a cudaMemset / cudaMemcpy was intended.
 *
 *   imageSize - size of the scratch buffer in bytes; values <= 0 are
 *               ignored
 */
void _zero_matrix( int imageSize ) {
    if (imageSize <= 0) {
        return;
    }
    /* calloc zeroes exactly imageSize bytes, so nothing is written past
       the allocation, and free() accepts NULL if the allocation failed */
    char *temp = (char*)calloc( (size_t)imageSize, 1 );
    free( temp );
}
/*
 * Builds the initial generation on the host and uploads it to
 * data->dev_inSrc.
 *
 * The staging buffer is zero-initialised, so every cell that is not part
 * of the seed pattern starts out dead. Exactly imageSize bytes are copied
 * to the device.
 *
 *   data      - supplies the destination device buffer (dev_inSrc), which
 *               must hold at least imageSize bytes
 *   imageSize - size of the staging buffer / upload in bytes; must be at
 *               least DIM * DIM
 *
 * Terminates the program if imageSize is too small or the host allocation
 * fails.
 */
void _create_life( DataBlock * data, int imageSize ) {
    /* live cells of the seed pattern as { x, y } pairs */
    static const unsigned char seed[][2] = {
        // first square
        { 1, 6}, { 2, 6}, { 1, 7}, { 2, 7},
        // little rectangle
        {31, 2}, {31, 3},
        // second square
        {35, 4}, {36, 4}, {35, 5}, {36, 5},
        // first glider
        {14, 3},
        {13, 4}, {15, 4},
        {12, 5}, {16, 5}, {17, 5},
        {12, 6}, {16, 6}, {17, 6},
        {12, 7}, {16, 7}, {17, 7},
        {13, 8}, {15, 8},
        {14, 9},
        // second glider
        {26, 1},
        {23, 2}, {24, 2}, {25, 2}, {26, 2},
        {22, 3}, {23, 3}, {24, 3}, {25, 3},
        {22, 4}, {25, 4},
        {22, 5}, {23, 5}, {24, 5}, {25, 5},
        {23, 6}, {24, 6}, {25, 6}, {26, 6},
        {26, 7},
        // first offspring
        {23, 10}, {25, 10}, {24, 11}, {25, 11}, {24, 12},
        // second offspring
        {31, 17}, {32, 18}, {33, 18}, {31, 19}, {32, 19},
        // third offspring
        {36, 25}, {38, 25}, {37, 26}, {38, 26}, {37, 27},
        // fourth offspring
        {46, 32}, {47, 33}, {48, 33}, {46, 34}, {47, 34}
    };
    const int seedCount = (int)(sizeof(seed) / sizeof(seed[0]));

    /* the pattern is addressed as x + y * DIM, so the buffer must cover
       the whole DIM x DIM grid */
    if (imageSize < DIM * DIM) {
        printf( "_create_life: buffer of %d bytes is smaller than the %d-cell grid\n",
                imageSize, DIM * DIM );
        exit( EXIT_FAILURE );
    }

    /* calloc rather than malloc: the cells outside the pattern must be 0,
       not whatever the heap happened to contain */
    char *temp = (char*)calloc( (size_t)imageSize, 1 );
    if (temp == NULL) {
        printf( "_create_life: host allocation of %d bytes failed\n", imageSize );
        exit( EXIT_FAILURE );
    }

    for (int i = 0; i < seedCount; i++) {
        temp[seed[i][0] + seed[i][1] * DIM] = 1;
    }

    //init the life into the first buffer
    HANDLE_ERROR( cudaMemcpy( data->dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice ) );
    free( temp );
}
/*
 * Program entry point: creates the animation bitmap and timing events,
 * allocates the device buffers, binds the two cell buffers to the
 * textures, seeds the initial generation and hands control to the
 * animation loop.
 */
int main( void ) {
    DataBlock data;

    // the window is DIM cells wide/high, PIXELS_PER_CELL pixels per cell
    const int side = DIM * PIXELS_PER_CELL;
    CPUAnimBitmap bitmap( side, side, &data );

    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;

    HANDLE_ERROR( cudaEventCreate( &data.start ) );
    HANDLE_ERROR( cudaEventCreate( &data.stop ) );

    // all three device buffers are allocated with the size of the bitmap
    int imageSize = bitmap.image_size();
    HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap, imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc, imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc, imageSize ) );

    // bind both cell buffers as DIM x DIM textures of char, DIM bytes per row
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<char>();
    const size_t rowPitch = sizeof(char) * DIM;
    HANDLE_ERROR( cudaBindTexture2D( NULL, texIn, data.dev_inSrc, desc, DIM, DIM, rowPitch ) );
    HANDLE_ERROR( cudaBindTexture2D( NULL, texOut, data.dev_outSrc, desc, DIM, DIM, rowPitch ) );

    // Zero the cell matrix
    _zero_matrix( imageSize );

    // Add LIFE!
    _create_life( &data, imageSize );

    // run the animation; anim_gpu is called per frame, anim_exit on shutdown
    bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu, (void (*)(void*))anim_exit );
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement