Advertisement
Guest User

Untitled

a guest
Nov 15th, 2011
271
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 34.91 KB | None | 0 0
  1. /* c-ray-f - a simple raytracing filter.
  2.  * Copyright (C) 2006 John Tsiombikas <nuclear@siggraph.org>
  3.  * No copyright for the non-functional additions added afterward..
  4.  *
  5.  * You are free to use, modify and redistribute this program under the
  6.  * terms of the GNU General Public License v2 or (at your option) later.
  7.  * see "http://www.gnu.org/licenses/gpl.txt" for details.
  8.  * ---------------------------------------------------------------------
  9.  * Usage:
  10.  *   compile:  cc -o c-ray-f c-ray-f.c -lm
  11.  *   run:      cat scene | ./c-ray-f >foo.ppm
  12.  *   enjoy:    display foo.ppm (with imagemagick)
  13.  *      or:    imgview foo.ppm (on IRIX)
  14.  * ---------------------------------------------------------------------
  15.  * Scene file format:
  16.  *   # sphere (many)
  17.  *   s  x y z  rad   r g b   shininess   reflectivity
  18.  *   # light (many)
  19.  *   l  x y z
  20.  *   # camera (one)
  21.  *   c  x y z  fov   tx ty tz
  22.  * ---------------------------------------------------------------------
  23.  */
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include <math.h>
  28. #include <ctype.h>
  29. #include <errno.h>
  30.  
  31. /* find the appropriate way to define explicitly sized types */
  32. #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__)  /* C99 or GNU libc */
  33. #include <stdint.h>
  34. #elif defined(__unix__) || defined(unix) || defined(__MACH__)
  35. #include <sys/types.h>
  36. #elif defined(_MSC_VER) /* the nameless one */
  37. typedef unsigned __int8 uint8_t;
  38. typedef unsigned __int32 uint32_t;
  39. #endif
  40.  
  41.  
  42.  
  43. #include <stdio.h>
  44.  
  45. #define cudaErrorCheck(call) { cudaAssert(call,__FILE__,__LINE__); }
  46.  
  47. void cudaAssert(const cudaError err, const char *file, const int line)
  48. {
  49.     if( cudaSuccess != err) {                                                
  50.         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        
  51.                 file, line, cudaGetErrorString(err) );
  52.         exit(1);
  53.     }
  54. }
  55.  
  56.  
  57.  
  58.  
  59. int objCounter = 0;
  60.  
  61. struct parallelPixels { //STRUCT WHICH DIVIDES PIXELS FOR RENDER2 GLOBAL FUNCTION TO USE
  62.     signed int start[1];
  63.     signed int end[1];
  64. };
  65.  
  66.  
  67. struct intss {
  68.     u_int32_t one;
  69.     u_int32_t two;
  70.     u_int32_t three;
  71.     u_int32_t four;
  72.     u_int32_t five;
  73.     u_int32_t six;
  74.     u_int32_t seven;
  75.     u_int32_t eight;
  76.     u_int32_t nine;
  77.     u_int32_t ten;
  78. };
  79.    
  80.  
  81. struct vec3 {
  82.     double x;
  83.     double y;
  84.     double z;
  85. };
  86.  
  87. struct ray {
  88.     struct vec3 orig, dir;
  89. };
  90.  
  91. struct material {
  92.     struct vec3 col;    /* color */
  93.     double spow;        /* specular power */
  94.     double refl;        /* reflection intensity */
  95. };
  96.  
  97. struct sphere {
  98.     struct vec3 pos;
  99.     double rad;
  100.     struct material mat;
  101.     struct sphere *next;
  102. };
  103.  
  104. struct reflectdata {  //STRUCT WHICH CONTAINS THE DATA FOR TRACING FURTHER REFLECTION RAYS
  105.     struct ray ray;
  106.     int depth;
  107.     double reflection;
  108. };  
  109.  
  110. struct spoint {
  111.     struct vec3 pos, normal, vref;  /* position, normal and view reflection */
  112.     double dist;        /* parametric distance of intersection along the ray */
  113. };
  114.  
  115. struct camera {
  116.     struct vec3 pos, targ;
  117.     double fov;
  118. };
  119.  
  120. void render1(int xsz, int ysz, u_int32_t *fb, int samples);
  121. __global__ void render2(struct intss *device_fb, struct parallelPixels *pixelsPerCore, int samples, double *obj_list_flat, int numOpsPerCore, int lnumdev, struct camera camdev, struct vec3 *lightsdev, struct vec3 *uranddev, int *iranddev); //SPECIFY ARGUMENTS TO RENDER2~!!!!
  122. __device__ struct vec3 trace(struct ray ray, int depth, int *isReflect, struct reflectdata *Rdata, double *obj_list_flat, int lnumdev, struct vec3 *lightsdev); //two arguments added - one to check if a reflection ray must be made, the other to provide the arguments necessary for the reflection ray
  123. __device__ struct vec3 shade(double *obj, struct spoint *sp, int depth, int *isReflect, struct reflectdata *Rdata, double *obj_list_flat, int lnumdev, struct vec3 *lightsdev);
  124. __device__ struct vec3 reflect(struct vec3 v, struct vec3 n);
  125. __device__ struct vec3 cross_product(struct vec3 v1, struct vec3 v2);
  126. __device__ struct ray get_primary_ray(int x, int y, int sample, struct camera camdev, struct vec3 *uranddev, int *iranddev);
  127. __device__ struct vec3 get_sample_pos(int x, int y, int sample, struct vec3 *uranddev, int *iranddev);
  128. __device__ struct vec3 jitter(int x, int y, int s, struct vec3 *uranddev, int *iranddev);
  129. __device__ int ray_sphere(double *sph, struct ray ray, struct spoint *sp);
  130. void load_scene(FILE *fp);                //FOR NOW, THIS WILL NOT BE MADE PARALLEL
  131. void flatten_obj_list(struct sphere *obj_list, double *obj_list_flat, int objCounter);
  132. void flatten_sphere(struct sphere *sphere, double *sphere_flat);
  133. void get_ith_sphere(double *obj_list_flat, int index, double *sphere_flat);
  134. unsigned long get_msec(void);             //COUNTING TIME CANNOT BE DONE IN PARALLEL
  135. inline void check_cuda_errors(const char *filename, const int line_number);
  136.  
  137. #define MAX_LIGHTS      16              /* maximum number of lights */
  138. #define RAY_MAG         1000.0          /* trace rays of this magnitude */
  139. #define MAX_RAY_DEPTH   5               /* raytrace recursion limit */
  140. #define FOV             0.78539816      /* field of view in rads (pi/4) */
  141. #define HALF_FOV        (FOV * 0.5)
  142. #define ERR_MARGIN      1e-6            /* an arbitrary error margin to avoid surface acne */
  143.  
  144. /* bit-shift ammount for packing each color into a 32bit uint */
  145. #ifdef LITTLE_ENDIAN
  146. #define RSHIFT  16
  147. #define BSHIFT  0
  148. #else   /* big endian */
  149. #define RSHIFT  0
  150. #define BSHIFT  16
  151. #endif  /* endianess */
  152. #define GSHIFT  8   /* this is the same in both byte orders */
  153.  
  154. /* some helpful macros... */
  155. #define SQ(x)       ((x) * (x))
  156. #define MAX(a, b)   ((a) > (b) ? (a) : (b))
  157. #define MIN(a, b)   ((a) < (b) ? (a) : (b))
  158. #define DOT(a, b)   ((a).x * (b).x + (a).y * (b).y + (a).z * (b).z)
  159. #define NORMALIZE(a)  do {\
  160.     double len = sqrt(DOT(a, a));\
  161.     (a).x /= len; (a).y /= len; (a).z /= len;\
  162. } while(0);
  163.  
  164. /* global state for host*/
  165. int xres = 800;
  166. int yres = 600;
  167. double aspect = 1.333333;
  168. struct sphere *obj_list;
  169. double *obj_list_flat;
  170. struct vec3 lights[MAX_LIGHTS];
  171. int lnum = 0;
  172. struct camera cam;
  173.  
  174. #define NRAN    1024
  175. #define MASK    (NRAN - 1)
  176. struct vec3 urand[NRAN];
  177. int irand[NRAN];
  178.  
  179.  
  180. /*global state for device*/
  181. __device__ int xresdev = 800;
  182. __device__ int yresdev = 600;
  183. __device__ double aspectdev = 1.333333;
  184. //__device__ struct sphere *obj_listdev = 0;
  185. //__device__ struct vec3 lightsdev[MAX_LIGHTS];
  186. //__device__ int lnumdev = 0;
  187. //__device__ struct camera camdev;
  188.  
  189. //__device__ struct vec3 uranddev[NRAN];
  190. //__device__ int iranddev[NRAN];
  191.  
  192.  
  193. const char *usage = {
  194.     "Usage: c-ray-f [options]\n"
  195.     "  Reads a scene file from stdin, writes the image to stdout, and stats to stderr.\n\n"
  196.     "Options:\n"
  197.     "  -s WxH     where W is the width and H the height of the image\n"
  198.     "  -r <rays>  shoot <rays> rays per pixel (antialiasing)\n"
  199.     "  -i <file>  read from <file> instead of stdin\n"
  200.     "  -o <file>  write to <file> instead of stdout\n"
  201.     "  -h         this help screen\n\n"
  202. };
  203.  
  204.  
  205.  
  206. int main(int argc, char **argv) {
  207.     int i;
  208.     unsigned long rend_time, start_time;
  209.     u_int32_t *pixels;
  210.     int rays_per_pixel = 1;
  211.     FILE *infile = stdin, *outfile = stdout;
  212.  
  213.     for(i=1; i<argc; i++) {
  214.         if(argv[i][0] == '-' && argv[i][2] == 0) {
  215.             char *sep;
  216.             switch(argv[i][1]) {
  217.             case 's':
  218.                 if(!isdigit(argv[++i][0]) || !(sep = strchr(argv[i], 'x')) || !isdigit(*(sep + 1))) {
  219.                     fputs("-s must be followed by something like \"640x480\"\n", stderr);
  220.                     return EXIT_FAILURE;
  221.                 }
  222.                 xres = atoi(argv[i]);
  223.                 yres = atoi(sep + 1);
  224.                 aspect = (double)xres / (double)yres;
  225.                 break;
  226.  
  227.             case 'i':
  228.                 if(!(infile = fopen(argv[++i], "r"))) {
  229.                     fprintf(stderr, "failed to open input file %s: %s\n", argv[i], strerror(errno));
  230.                     return EXIT_FAILURE;
  231.                 }
  232.                 break;
  233.  
  234.             case 'o':
  235.                 if(!(outfile = fopen(argv[++i], "w"))) {
  236.                     fprintf(stderr, "failed to open output file %s: %s\n", argv[i], strerror(errno));
  237.                     return EXIT_FAILURE;
  238.                 }
  239.                 break;
  240.  
  241.             case 'r':
  242.                 if(!isdigit(argv[++i][0])) {
  243.                     fputs("-r must be followed by a number (rays per pixel)\n", stderr);
  244.                     return EXIT_FAILURE;
  245.                 }
  246.                 rays_per_pixel = atoi(argv[i]);
  247.                 break;
  248.  
  249.             case 'h':
  250.                 fputs(usage, stdout);
  251.                 return 0;
  252.                
  253.             default:
  254.                 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
  255.                 fputs(usage, stderr);
  256.                 return EXIT_FAILURE;
  257.             }
  258.         } else {
  259.             fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
  260.             fputs(usage, stderr);
  261.             return EXIT_FAILURE;
  262.         }
  263.     }
  264.  
  265.     if(!(pixels = (u_int32_t *)malloc(xres * yres * sizeof(*pixels)))) {
  266.         perror("pixel buffer allocation failed");
  267.         return EXIT_FAILURE;
  268.     }
  269.     load_scene(infile);
  270.  
  271.     /* initialize the random number tables for the jitter */
  272.     for(i=0; i<NRAN; i++) urand[i].x = (double)rand() / RAND_MAX - 0.5;
  273.     for(i=0; i<NRAN; i++) urand[i].y = (double)rand() / RAND_MAX - 0.5;
  274.     for(i=0; i<NRAN; i++) irand[i] = (int)(NRAN * ((double)rand() / RAND_MAX));
  275.  
  276.     xresdev = xres;
  277.     yresdev = yres;
  278.     aspectdev = aspect;
  279.  
  280.  
  281.  
  282.    
  283.     start_time = get_msec();
  284.     render1(xres, yres, pixels, rays_per_pixel);
  285.     rend_time = get_msec() - start_time;
  286.    
  287.     /* output statistics to stderr */
  288.     fprintf(stderr, "Rendering took: %lu seconds (%lu milliseconds)\n", rend_time / 1000, rend_time);
  289.  
  290.     /* output the image */
  291.     fprintf(outfile, "P6\n%d %d\n255\n", xres, yres);
  292.     for(i=0; i<xres * yres; i++) {
  293.         fputc((pixels[i] >> RSHIFT) & 0xff, outfile);
  294.         fputc((pixels[i] >> GSHIFT) & 0xff, outfile);
  295.         fputc((pixels[i] >> BSHIFT) & 0xff, outfile);
  296.     }
  297.     fflush(outfile);
  298.  
  299.     if(infile != stdin) fclose(infile);
  300.     if(outfile != stdout) fclose(outfile);
  301.     return 0;
  302. }
  303.  
  304. /* render a frame of xsz/ysz dimensions into the provided framebuffer */
  305. void render1(int xsz, int ysz, u_int32_t *fb, int samples)
  306. {
  307.  
  308.     int num_elementsParallelPixel = 4; //there will be an array of three structs which each determine the pixel one core will begin tracing at and the pixel it will end tracing at(x and y values)
  309.  
  310.     int block_size = 3;              //3 * 1 = total number of cores
  311.     int grid_size = 1;
  312.  
  313.     int totalOps = xsz * ysz;       //total number of pixels to have rays traced to (x coord. multiplied by y coord.)
  314.     int numOpsPerCore = totalOps/(block_size*grid_size);   // amount of rays to be traced by each core
  315.  
  316.     int num_bytes_ParallelPixel = num_elementsParallelPixel * sizeof(struct parallelPixels);
  317.     struct parallelPixels *device_pixelspercore = 0;        //device array of pixels per core(sent to all of the cores)
  318.     struct parallelPixels *host_pixelspercore = 0;          //host array of pixels per core(only contained by host)
  319.  
  320.     // malloc a host array
  321.     host_pixelspercore = (struct parallelPixels*)malloc(num_bytes_ParallelPixel);
  322.  
  323.     // cudaMalloc a device array
  324.     cudaErrorCheck( cudaMalloc((void**)&device_pixelspercore, num_bytes_ParallelPixel) );
  325.  
  326.         printf("cudaMalloc pixels success!\n");
  327.    
  328. //Ah, the frame buffer.
  329. //Originally, the frame buffer contained all of the pixels of the frame, sorted in order(of increasing x per y value?)
  330. //Anyway, all of the devices(cores) need some place to put the output of their computations.
  331. //They could choose to put the output into the frame buffer array!
  332. //But there's a problem.
  333. //Each device is writing multiple outputs to the array, based on how many rays it will be rendering.
  334. //Additionally, each device has only -one- index to this array, based on what number core it is!(grid number * block number * something else which uniquely identifies each core)
  335. //This means that each core can only save one element into the frame buffer array which won't get overwritten by another core!
  336. //So, what do we do?
  337. //Well, we could choose to send a array of struct of 4 u_int32_t's to the devices, which allows no outdata to get overwritten!
  338. //Then, we could transfer this struct array to a host array.
  339. //Finally, we could loop through all of the output in the struct array and sequentially put it into a regular u_int32_t array(the REAL frame buffer). ***Ah, did someone say "sequentially"? Is this an area which could possibly be made parallel as well?**
  340.  
  341.  
  342.  
  343.  
  344.  
  345.  
  346.  
  347.  
  348. //and now to determine the details of the 2D fb array...
  349. // code heavily taken from
  350. //http://forums.nvidia.com/index.php?s=5712c99a6838532e8e43081108fce9f8&showtopic=69403&st=20
  351. // AND http://pleasemakeanote.blogspot.com/2008/06/2d-arrays-in-c-using-malloc.html
  352.  
  353.     struct intss *device_fb = 0;
  354.     struct intss *host_fb = 0;
  355.  
  356.     //Special host 2D array is created(malloced)
  357.    
  358.     // Now to create the special device 2D Array
  359.  
  360.        int num_bytes_fb = (block_size*grid_size)*sizeof(struct intss);
  361.        
  362.  
  363.     host_fb = (struct intss*)malloc(num_bytes_fb);
  364.     cudaErrorCheck(cudaMalloc((void **)&device_fb, num_bytes_fb) );
  365.  
  366.     printf("cudamalloc framebuffer and host buffer success!\n");
  367.  
  368.  
  369.     //PUT DATA INTO PARALLEL PIXEL STRUCT PIXELSPERCORE!!
  370.  
  371.     int completeCore = 0;   //representing how many cores have the max data stored
  372.     int dataPerCore = 0;   //representing how much data is stored per core
  373.  
  374.     int y, x;              //representing the x and y values of the pixels which are stored in the pixelspercore array
  375.  
  376.     for (y=0; y<ysz; y++)
  377.     {//for each y value, store every possible (x,y) coord into pixelspercore structs
  378.         for (x=0;x<xsz; x++)
  379.         {//if there is no data yet in struct, then we must record the first (x,y) pixel which a given core will trace a ray to
  380.             if (dataPerCore == 0)
  381.             {
  382.                 host_pixelspercore[completeCore].start[0] = x;
  383.                 host_pixelspercore[completeCore].start[1] = y;
  384.             }
  385.        
  386.             if (dataPerCore==(numOpsPerCore-1))
  387.             {//If the maximum amount of pixels which can have rays traced per core are about to be stored/counted, record the last (x,y) pixel which a given core will trace a ray to in the struct
  388.                 host_pixelspercore[completeCore].end[0] = x;
  389.                 host_pixelspercore[completeCore].end[1] = y;
  390.                 completeCore++;    //one entire core data completed/stored
  391.             }
  392.  
  393.             dataPerCore++; //one part of core data stored
  394.            
  395.             if (dataPerCore == numOpsPerCore)
  396.                 dataPerCore = 0; //reset counter if entire core data has been stored
  397.         }
  398.     }
  399.  
  400.  
  401.     printf("parallelpixel data transfer success!\n");
  402.     //copy over host array which determines which pixels should have rays traced per core to device array
  403.     cudaMemcpy(device_pixelspercore, host_pixelspercore, num_bytes_ParallelPixel, cudaMemcpyHostToDevice);
  404.  
  405.     printf("pixelspercore memcopy success!\n");
  406.  
  407.     flatten_obj_list(obj_list,obj_list_flat,objCounter);
  408.  
  409.     double *obj_list_flat_dev = 0;
  410.    
  411.  
  412.  
  413.     //create obj_list_flat_dev array size of objCounter
  414.     cudaErrorCheck( cudaMalloc((void **)&obj_list_flat_dev, (sizeof(double) *objCounter*9)) );
  415.    
  416.     cudaMemcpy(&obj_list_flat_dev, &obj_list_flat, (sizeof(double) *objCounter*9), cudaMemcpyHostToDevice); //copying over flat sphere array to obj_listdevflat
  417.    
  418.  
  419.     printf("obj_list_flat_dev memcopy and malloc success!\n");
  420.     printf("device_fb = %d , device_pixelspercore = %d , samples = %d, obj_list_flat_dev = %d\n", device_fb, device_pixelspercore, samples, obj_list_flat_dev);
  421.  
  422. //lights and camera and whatnot
  423.  
  424.     int lnumdev = 0;
  425.     struct camera camdev;
  426.  
  427.     struct vec3 *lightsdev = 0;
  428.  
  429.     cudaErrorCheck(cudaMalloc((void **)&lightsdev, MAX_LIGHTS*sizeof(struct vec3)) );
  430.  
  431.     cudaMemcpy(&lightsdev, &lights, sizeof(struct vec3) * MAX_LIGHTS, cudaMemcpyHostToDevice);
  432.  
  433.         lnumdev = lnum; //remember to pass lnumdev into render2!
  434.         camdev = cam;   //remember to pass camdev into render2!
  435.  
  436.  
  437.  
  438. //urand and whatnot
  439.     struct vec3 *uranddev = 0;
  440.  
  441.     cudaErrorCheck(cudaMalloc((void **)&uranddev, NRAN*sizeof(struct vec3)) );
  442.  
  443.     cudaMemcpy(&uranddev, &urand, sizeof(struct vec3) * NRAN, cudaMemcpyHostToDevice); //remember to pass all of these into render2!!
  444.  
  445.  
  446. //irand and whatnot
  447.  
  448.     int *iranddev = 0;
  449.  
  450.     cudaErrorCheck(cudaMalloc((void **)&iranddev, NRAN*sizeof(int)) );
  451.  
  452.     cudaMemcpy(&iranddev, &irand, sizeof(int) * NRAN, cudaMemcpyHostToDevice); //remember to pass all of these into render2!!
  453.  
  454.  
  455.  
  456.  
  457.  
  458.  
  459.  
  460.  
  461.  
  462.  
  463.  
  464.  
  465.  
  466.  
  467.     render2<<<grid_size,block_size>>>(device_fb, device_pixelspercore, samples, obj_list_flat_dev, numOpsPerCore, lnumdev, camdev, lightsdev, uranddev, iranddev);
  468.     cudaPeekAtLastError(); // Checks for launch error
  469.     cudaErrorCheck( cudaThreadSynchronize() );
  470.  
  471.     //In all seriousness, all of the cores should now be operating on the ray tracing, if things are working correctly
  472.  
  473.  
  474.     //once done, copy contents of device array to host array  
  475.  
  476.  
  477.  
  478.  
  479.  
  480.     cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost);
  481.  
  482.  
  483.     printf("output %d ", host_fb[0].one);
  484.  
  485.     printf("output %d ", host_fb[1].one);
  486.  
  487.     printf("output %d ", host_fb[2].one);
  488.  
  489.  
  490.  
  491.  
  492.  
  493.  
  494.  
  495.  
  496.  
  497.     free(host_pixelspercore);  
  498.     cudaErrorCheck( cudaFree(device_fb) );
  499.     free(host_fb);  
  500.     cudaErrorCheck( cudaFree(obj_list_flat_dev) );
  501.     cudaErrorCheck( cudaFree(device_pixelspercore) );
  502.    
  503.  
  504.    
  505. }  
  506.  
  507.  
  508. __global__ void render2(struct intss *device_fb, struct parallelPixels *pixelsPerCore, int samples, double *obj_list_flat_dev, int numOpsPerCore, int lnumdev, struct camera camdev, struct vec3 *lightsdev, struct vec3 *uranddev, int *iranddev)            //SPECIFY ARGUMENTS!!!
  509. {
  510.     int index = blockIdx.x * blockDim.x + threadIdx.x; //DETERMINING INDEX BASED ON WHICH THREAD IS CURRENTLY RUNNING
  511. //    printf("index success!\n");  
  512.  
  513.       device_fb[index].one = index;
  514.  
  515.  
  516.  
  517.     u_int32_t answer;
  518.     int s;
  519.     int i = pixelsPerCore[index].start[0];   //x value of first pixel
  520. //    printf("x value success!\n");
  521.  
  522.  
  523.  
  524.  
  525.     if (i==-1)
  526.     {
  527.                 device_fb[index].one = (u_int32_t)0;
  528.                 device_fb[index].two = (u_int32_t)0;
  529.                 device_fb[index].three = (u_int32_t)0;
  530.                 device_fb[index].four = (u_int32_t)0;
  531.                 device_fb[index].five = (u_int32_t)0;
  532.                 device_fb[index].six = (u_int32_t)0;
  533.                 device_fb[index].seven = (u_int32_t)0;
  534.                 device_fb[index].eight = (u_int32_t)0;
  535.                 device_fb[index].nine = (u_int32_t)0;
  536.                 device_fb[index].ten = (u_int32_t)0;
  537.  
  538.  
  539.  
  540.  
  541.     return;
  542.      }                       //if a -1 value is placed in the start value, then there is no pertinent data here. return early. Note that this will not decrease speed of parallel operations, since the speed is determined by the slowest parallel operation..
  543.     int j = pixelsPerCore[index].start[1];   //y value of first pixel
  544. //    printf("y value success!\n");
  545.     int xsz = pixelsPerCore[index].end[0];   //x value of last pixel
  546. //    printf("x last value success!\n");
  547.     int ysz = pixelsPerCore[index].end[1];   //y value of last pixel
  548. //    printf("y last value success!\n");
  549.     int raysTraced = 0;                     // number of rays traced
  550.     int isReflect[1];                        //WHETHER OR NOT RAY TRACED WILL NEED A REFLECTION RAY AS WELL
  551.     isReflect[0] = 0;
  552.     struct reflectdata RData[1];           //ARRAY WHICH CONTAINS REFLECT DATA STRUCT TO BE PASSED ON TO TRACE FUNCTION
  553.    
  554.     double rcp_samples = 1.0 / (double)samples;
  555.  
  556.     /* for each subpixel, trace a ray through the scene, accumulate the
  557.      * colors of the subpixels of each pixel, then pack the color and
  558.      * put it into the framebuffer.
  559.      * XXX: assumes contiguous scanlines with NO padding, and 32bit pixels.
  560.      */
  561.     for(j; j<=ysz; j++) {
  562.         for(i; i<=xsz; i++) {
  563.             double r, g, b;
  564.             r = g = b = 0.0;
  565.            
  566.             for(s=0; s<samples; s++) {
  567.                 struct vec3 col = trace(get_primary_ray(i, j, s, camdev, uranddev, iranddev), 0, isReflect, RData, obj_list_flat_dev, lnumdev, lightsdev);
  568. //        printf("trace success!\n");  
  569.                 while (*isReflect)        //while there are still reflection rays to trace
  570.                 {
  571.                     struct vec3 rcol;    //holds the output of the reflection ray calculcation
  572.                     rcol = trace(RData->ray, RData->depth, isReflect, RData, obj_list_flat_dev, lnumdev, lightsdev);    //trace a reflection ray
  573.                     col.x += rcol.x * RData->reflection;       //I really am unsure about the usage of pointers here..
  574.                     col.y += rcol.y * RData->reflection;
  575.                     col.z += rcol.z * RData->reflection;
  576.                 }  
  577.                 r += col.x;
  578.                 g += col.y;
  579.                 b += col.z;
  580.             }
  581.  
  582.             r = r * rcp_samples;
  583.             g = g * rcp_samples;
  584.             b = b * rcp_samples;
  585.            
  586.  
  587.       answer =  (((u_int32_t)(MIN(r, 1.0) * 255.0) & 0xff) << RSHIFT |  
  588.                         ((u_int32_t)(MIN(g, 1.0) * 255.0) & 0xff) << GSHIFT |
  589.                         ((u_int32_t)(MIN(b, 1.0) * 255.0) & 0xff) << BSHIFT);
  590.  
  591.  
  592.  
  593.  
  594.                    
  595.             device_fb[index].one = answer;
  596.             raysTraced++;       //one pixel-post-ray-trace data has been stored!
  597.         if (raysTraced == numOpsPerCore) return;  //return if total rays per core have been traced
  598.         }
  599.     }
  600.  
  601. }
  602.  
  603.  
  604.  
  605. /* trace a ray throught the scene recursively (the recursion happens through                
  606.  * shade() to calculate reflection rays if necessary).
  607.  */
  608. __device__ struct vec3 trace(struct ray ray, int depth, int *isReflect, struct reflectdata *Rdata, double *obj_list_flat_dev, int lnumdev, struct vec3 *lightsdev) {
  609. //    printf("beginning of trace success!\n");
  610.     struct vec3 col;
  611.     struct spoint sp, nearest_sp;
  612. //    double *nearest_obj = (double *)malloc(9*sizeof(double));
  613.  
  614.     double nearest_obj[9];
  615.  
  616.     int iterCount = 0;
  617.  
  618.  //   double *flat_sphere = (double *)malloc(9*sizeof(double));
  619.     double flat_sphere[9];
  620.    
  621.     get_ith_sphere(obj_list_flat_dev, iterCount, flat_sphere); // populates flat_sphere
  622.  
  623.     /* if we reached the recursion limit, bail out */
  624.     if(depth >= MAX_RAY_DEPTH) {
  625.         col.x = col.y = col.z = 0.0;
  626.         return col;
  627.     }
  628.    
  629.     int i; 
  630.     /* find the nearest intersection ... */
  631.     while(flat_sphere) {
  632.         if(ray_sphere(flat_sphere, ray, &sp)) {
  633.             if(!nearest_obj || sp.dist < nearest_sp.dist) {
  634.         for (i=0; i<9; i++)
  635.         {
  636.             nearest_obj[i] = flat_sphere[i];
  637.         }
  638.                 nearest_sp = sp;
  639.             }
  640.         }
  641.         iterCount++;
  642.         get_ith_sphere(obj_list_flat_dev, iterCount, flat_sphere);
  643.     }
  644.  
  645.     /* and perform shading calculations as needed by calling shade() */
  646.     if(nearest_obj) {
  647. //   printf("every part of trace up to shade success!\n");
  648.         col = shade(nearest_obj, &nearest_sp, depth, isReflect, Rdata, obj_list_flat_dev, lnumdev, lightsdev);
  649.     } else {
  650.         col.x = col.y = col.z = 0.0;
  651.     }
  652.  
  653.  
  654.  
  655.     return col;
  656. }
  657.  
  658. /* Calculates direct illumination with the phong reflectance model.
  659.  * Also handles reflections by calling trace again, if necessary.
  660.  */
  661. __device__ struct vec3 shade(double *obj, struct spoint *sp, int depth, int *isReflect, struct reflectdata *Rdata, double *obj_list_flat_dev, int lnumdev, struct vec3 *lightsdev) {
  662.     int i;
  663.     struct vec3 col = {0, 0, 0};
  664.     int iterCount = 0;
  665.  
  666.     /* for all lights ... */
  667.     for(i=0; i<lnumdev; i++) {
  668.         double ispec, idiff;
  669.         struct vec3 ldir;
  670.         struct ray shadow_ray;
  671.  
  672. //        double *flat_sphere = (double *)malloc(9*sizeof(double));
  673.     double flat_sphere[9];
  674.         get_ith_sphere(obj_list_flat_dev, iterCount, flat_sphere); // populates flat_sphere
  675.  
  676.         int in_shadow = 0;
  677.  
  678.         ldir.x = lightsdev[i].x - sp->pos.x;
  679.         ldir.y = lightsdev[i].y - sp->pos.y;
  680.         ldir.z = lightsdev[i].z - sp->pos.z;
  681.  
  682.         shadow_ray.orig = sp->pos;
  683.         shadow_ray.dir = ldir;
  684.  
  685.         /* shoot shadow rays to determine if we have a line of sight with the light */
  686.         while(flat_sphere) {
  687.             if(ray_sphere(flat_sphere, shadow_ray, 0)) {
  688.                 in_shadow = 1;
  689.                 break;
  690.             }
  691.             iterCount++;
  692.             get_ith_sphere(obj_list_flat_dev, iterCount, flat_sphere);
  693.         }
  694.  
  695.         /* and if we're not in shadow, calculate direct illumination with the phong model. */
  696.         if(!in_shadow) {
  697.             NORMALIZE(ldir);
  698.  
  699.             idiff = MAX(DOT(sp->normal, ldir), 0.0);
  700.             ispec = obj[7] > 0.0 ? pow(MAX(DOT(sp->vref, ldir), 0.0), obj[7]) : 0.0;  // ASSUMING OBJ[7] = obj->mat.spow
  701.  
  702.             col.x += idiff * obj[4] + ispec;      // assuming obj[4] = obj->mat.col.x
  703.             col.y += idiff * obj[5] + ispec;      // assuming obj[5] = obj->mat.col.y
  704.             col.z += idiff * obj[6] + ispec;   //assuming obj[6] = obj->may.col.z
  705.         }
  706.     }
  707.  
  708.     /* Also, if the object is reflective, spawn a reflection ray, and call trace()
  709.      * to calculate the light arriving from the mirror direction.
  710.      */
  711.      //FOR EVERY REFLECTION RAY THAT IS SUPPOSED TO BE TRACED, THERE MUST BE A STRUCTURE SAVED, CONTAINING THE SPECIFIC PIXEL, THE RAY, AND DEPTH + 1. ALL OF THESE MUST BE STORED IN A "NEW" ARRAY, WHICH IS THEN ACCESSED IN RENDER2 FOLLOWING THE MAIN COMPUTATIONS.!!!!!!!!!!!*******************8
  712.     if(obj[8] > 0.0) {           //assuming obj[8] = obj->mat.refl
  713.         isReflect[0] = 1;    //set isReflect to affirmative
  714.  
  715.  
  716.         Rdata->ray.orig = sp->pos;     //SET VALUES OF REFLECTIONDATA STRUCT
  717.         Rdata->ray.dir = sp->vref;
  718.         Rdata->ray.dir.x *= RAY_MAG;
  719.         Rdata->ray.dir.y *= RAY_MAG;
  720.         Rdata->ray.dir.z *= RAY_MAG;
  721.         Rdata->depth = depth + 1;
  722.         Rdata->reflection = obj[8];
  723.        
  724.  
  725.     }
  726.     else
  727.         isReflect[0] = 0;      //IF THERE IS NO REFLECTION, SET ISREFLECT TO ZERO
  728.     return col;
  729. }
  730.  
  731. /* calculate reflection vector */
  732. __device__ struct vec3 reflect(struct vec3 v, struct vec3 n) {
  733.     struct vec3 res;
  734.     double dot = v.x * n.x + v.y * n.y + v.z * n.z;
  735.     res.x = -(2.0 * dot * n.x - v.x);
  736.     res.y = -(2.0 * dot * n.y - v.y);
  737.     res.z = -(2.0 * dot * n.z - v.z);
  738.     return res;
  739. }
  740.  
  741. __device__ struct vec3 cross_product(struct vec3 v1, struct vec3 v2) {
  742.     struct vec3 res;
  743.     res.x = v1.y * v2.z - v1.z * v2.y;
  744.     res.y = v1.z * v2.x - v1.x * v2.z;
  745.     res.z = v1.x * v2.y - v1.y * v2.x;
  746.     return res;
  747. }
  748.  
  749. /* determine the primary ray corresponding to the specified pixel (x, y) */
  750. __device__ struct ray get_primary_ray(int x, int y, int sample, struct camera camdev, struct vec3 *uranddev, int *iranddev) {
  751.     struct ray ray;
  752.     float m[3][3];
  753.     struct vec3 i, j = {0, 1, 0}, k, dir, orig, foo;
  754.  
  755.     k.x = camdev.targ.x - camdev.pos.x;
  756.     k.y = camdev.targ.y - camdev.pos.y;
  757.     k.z = camdev.targ.z - camdev.pos.z;
  758.     NORMALIZE(k);
  759.  
  760.     i = cross_product(j, k);
  761.     j = cross_product(k, i);
  762.     m[0][0] = i.x; m[0][1] = j.x; m[0][2] = k.x;
  763.     m[1][0] = i.y; m[1][1] = j.y; m[1][2] = k.y;
  764.     m[2][0] = i.z; m[2][1] = j.z; m[2][2] = k.z;
  765.    
  766.     ray.orig.x = ray.orig.y = ray.orig.z = 0.0;
  767.     ray.dir = get_sample_pos(x, y, sample, uranddev, iranddev);
  768.     ray.dir.z = 1.0 / HALF_FOV;
  769.     ray.dir.x *= RAY_MAG;
  770.     ray.dir.y *= RAY_MAG;
  771.     ray.dir.z *= RAY_MAG;
  772.    
  773.     dir.x = ray.dir.x + ray.orig.x;
  774.     dir.y = ray.dir.y + ray.orig.y;
  775.     dir.z = ray.dir.z + ray.orig.z;
  776.     foo.x = dir.x * m[0][0] + dir.y * m[0][1] + dir.z * m[0][2];
  777.     foo.y = dir.x * m[1][0] + dir.y * m[1][1] + dir.z * m[1][2];
  778.     foo.z = dir.x * m[2][0] + dir.y * m[2][1] + dir.z * m[2][2];
  779.  
  780.     orig.x = ray.orig.x * m[0][0] + ray.orig.y * m[0][1] + ray.orig.z * m[0][2] + camdev.pos.x;
  781.     orig.y = ray.orig.x * m[1][0] + ray.orig.y * m[1][1] + ray.orig.z * m[1][2] + camdev.pos.y;
  782.     orig.z = ray.orig.x * m[2][0] + ray.orig.y * m[2][1] + ray.orig.z * m[2][2] + camdev.pos.z;
  783.  
  784.     ray.orig = orig;
  785.     ray.dir.x = foo.x + orig.x;
  786.     ray.dir.y = foo.y + orig.y;
  787.     ray.dir.z = foo.z + orig.z;
  788.    
  789.     return ray;
  790. }
  791.  
  792.  
  793. __device__ struct vec3 get_sample_pos(int x, int y, int sample, struct vec3 *uranddev, int *iranddev) {
  794.     struct vec3 pt;
  795.  //   double xsz = 2.0, ysz = xresdev / aspectdev;   not being used by program?
  796.     /*static*/ double sf = 0.0;
  797.  
  798.     if(sf == 0.0) {
  799.         sf = 2.0 / (double)xresdev;
  800.     }
  801.  
  802.     pt.x = ((double)x / (double)xresdev) - 0.5;
  803.     pt.y = -(((double)y / (double)yresdev) - 0.65) / aspectdev;
  804.  
  805.     if(sample) {
  806.         struct vec3 jt = jitter(x, y, sample, uranddev, iranddev);
  807.         pt.x += jt.x * sf;
  808.         pt.y += jt.y * sf / aspectdev;
  809.     }
  810.     return pt;
  811. }
  812.  
  813. /* jitter function taken from Graphics Gems I. */
  814. __device__ struct vec3 jitter(int x, int y, int s, struct vec3 *uranddev, int *iranddev) {
  815.     struct vec3 pt;
  816.     pt.x = uranddev[(x + (y << 2) + iranddev[(x + s) & MASK]) & MASK].x;
  817.     pt.y = uranddev[(y + (x << 2) + iranddev[(y + s) & MASK]) & MASK].y;
  818.     return pt;
  819. }
  820.  
  821. /* Calculate ray-sphere intersection, and return {1, 0} to signify hit or no hit.
  822.  * Also the surface point parameters like position, normal, etc are returned through
  823.  * the sp pointer if it is not NULL.
  824.  */
  825. __device__ int ray_sphere(double *sph, struct ray ray, struct spoint *sp) {
  826.     double a, b, c, d, sqrt_d, t1, t2;
  827.    
  828.     a = SQ(ray.dir.x) + SQ(ray.dir.y) + SQ(ray.dir.z);
  829.     b = 2.0 * ray.dir.x * (ray.orig.x - sph[0]) +
  830.                 2.0 * ray.dir.y * (ray.orig.y - sph[1]) +
  831.                 2.0 * ray.dir.z * (ray.orig.z - sph[2]);
  832.     c = SQ(sph[0]) + SQ(sph[1]) + SQ(sph[2]) +
  833.                 SQ(ray.orig.x) + SQ(ray.orig.y) + SQ(ray.orig.z) +
  834.                 2.0 * (-sph[0] * ray.orig.x - sph[1] * ray.orig.y - sph[2] * ray.orig.z) - SQ(sph[3]);
  835.    
  836.     if((d = SQ(b) - 4.0 * a * c) < 0.0) return 0;
  837.  
  838.     sqrt_d = sqrt(d);
  839.     t1 = (-b + sqrt_d) / (2.0 * a);
  840.     t2 = (-b - sqrt_d) / (2.0 * a);
  841.  
  842.     if((t1 < ERR_MARGIN && t2 < ERR_MARGIN) || (t1 > 1.0 && t2 > 1.0)) return 0;
  843.  
  844.     if(sp) {
  845.         if(t1 < ERR_MARGIN) t1 = t2;
  846.         if(t2 < ERR_MARGIN) t2 = t1;
  847.         sp->dist = t1 < t2 ? t1 : t2;
  848.        
  849.         sp->pos.x = ray.orig.x + ray.dir.x * sp->dist;
  850.         sp->pos.y = ray.orig.y + ray.dir.y * sp->dist;
  851.         sp->pos.z = ray.orig.z + ray.dir.z * sp->dist;
  852.        
  853.         sp->normal.x = (sp->pos.x - sph[0]) / sph[3];
  854.         sp->normal.y = (sp->pos.y - sph[1]) / sph[3];
  855.         sp->normal.z = (sp->pos.z - sph[2]) / sph[3];
  856.  
  857.         sp->vref = reflect(ray.dir, sp->normal);
  858.         NORMALIZE(sp->vref);
  859.     }
  860.     return 1;
  861. }
  862.  
  863. /* Load the scene from an extremely simple scene description file */
  864. #define DELIM   " \t\n"
  865. void load_scene(FILE *fp) {
  866.     char line[256], *ptr, type;
  867.  
  868.     obj_list = (sphere *)malloc(sizeof(struct sphere));
  869.     obj_list->next = 0;
  870.     objCounter = 0;
  871.    
  872.     while((ptr = fgets(line, 256, fp))) {
  873.         int i;
  874.         struct vec3 pos, col;
  875.         double rad, spow, refl;
  876.        
  877.         while(*ptr == ' ' || *ptr == '\t') ptr++;
  878.         if(*ptr == '#' || *ptr == '\n') continue;
  879.  
  880.         if(!(ptr = strtok(line, DELIM))) continue;
  881.         type = *ptr;
  882.        
  883.         for(i=0; i<3; i++) {
  884.             if(!(ptr = strtok(0, DELIM))) break;
  885.             *((double*)&pos.x + i) = atof(ptr);
  886.         }
  887.  
  888.         if(type == 'l') {
  889.             lights[lnum++] = pos;
  890.             continue;
  891.         }
  892.  
  893.         if(!(ptr = strtok(0, DELIM))) continue;
  894.         rad = atof(ptr);
  895.  
  896.         for(i=0; i<3; i++) {
  897.             if(!(ptr = strtok(0, DELIM))) break;
  898.             *((double*)&col.x + i) = atof(ptr);
  899.         }
  900.  
  901.         if(type == 'c') {
  902.             cam.pos = pos;
  903.             cam.targ = col;
  904.             cam.fov = rad;
  905.             continue;
  906.         }
  907.  
  908.         if(!(ptr = strtok(0, DELIM))) continue;
  909.         spow = atof(ptr);
  910.  
  911.         if(!(ptr = strtok(0, DELIM))) continue;
  912.         refl = atof(ptr);
  913.  
  914.         if(type == 's') {
  915.             objCounter++;
  916.             struct sphere *sph = (sphere *)malloc(sizeof(*sph));
  917.             sph->next = obj_list->next;
  918.             obj_list->next = sph;
  919.  
  920.             sph->pos = pos;
  921.             sph->rad = rad;
  922.             sph->mat.col = col;
  923.             sph->mat.spow = spow;
  924.             sph->mat.refl = refl;
  925.  
  926.         } else {
  927.             fprintf(stderr, "unknown type: %c\n", type);
  928.         }
  929.     }
  930. }
  931.  
  932. void flatten_sphere(struct sphere *sphere, double *sphere_flat) {
  933.     struct vec3 pos = sphere->pos;
  934.     double rad = sphere->rad;
  935.     struct material mat = sphere->mat;
  936.  
  937.     sphere_flat[0] = pos.x;
  938.     sphere_flat[1] = pos.y;
  939.     sphere_flat[2] = pos.z;
  940.     sphere_flat[3] = rad;
  941.     sphere_flat[4] = mat.col.x;
  942.     sphere_flat[5] = mat.col.y;
  943.     sphere_flat[6] = mat.col.z;
  944.     sphere_flat[7] = mat.spow;
  945.     sphere_flat[8] = mat.refl;
  946. }
  947.  
  948. void flatten_obj_list(struct sphere *obj_list, double *obj_list_flat, int objCounter) {
  949.     obj_list_flat = (double *)malloc(9*objCounter*sizeof(double));
  950.  
  951.  
  952.     for (int i = 0; i < objCounter; i++) {
  953.         struct sphere *sphere = obj_list;
  954.         double sphere_flat[9];
  955.         flatten_sphere(sphere, sphere_flat);
  956.  
  957.         for (int j = 0; j < objCounter; j++) {
  958.             for (int k = 0; k < 9; k++) {
  959.                 obj_list_flat[9*j+k] = sphere_flat[k];
  960.             }
  961.         }
  962.  
  963.         obj_list = obj_list->next;
  964.         i++;
  965.     }
  966. }
  967.  
  968. __device__ void get_ith_sphere(double *obj_list_flat, int index, double *flat_sphere) {
  969.     int base_index = index * 9;
  970.  
  971.     for (int i = 0; i <= 9; i++) {
  972.         flat_sphere[i] = obj_list_flat[base_index + i];
  973.     }
  974.  
  975.  
  976. }
  977.  
  978.  
  979. inline void check_cuda_errors(const char *filename, const int line_number)
  980. {
  981.     #ifdef DEBUG
  982.     cudaThreadSynchronize();
  983.     cudaError_t error = cudaGetLastError();
  984.     if(error != cudaSuccess)
  985.     {
  986.         printf("CUDA error at %s:%i: %s\n", filename, line_number, cudaGetErrorString(error));
  987.         exit(-1);
  988.     }
  989.     #endif
  990. }  
  991.  
  992.  
  993. /* provide a millisecond-resolution timer for each system */
  994. #if defined(__unix__) || defined(unix) || defined(__MACH__)
  995. #include <time.h>
  996. #include <sys/time.h>
  997. unsigned long get_msec(void) {
  998.     static struct timeval timeval, first_timeval;
  999.    
  1000.     gettimeofday(&timeval, 0);
  1001.     if(first_timeval.tv_sec == 0) {
  1002.         first_timeval = timeval;
  1003.         return 0;
  1004.     }
  1005.     return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
  1006. }
  1007. #elif defined(__WIN32__) || defined(WIN32)
  1008. #include <windows.h>
  1009. unsigned long get_msec(void) {
  1010.     return GetTickCount();
  1011. }
  1012. #else
  1013. #error "I don't know how to measure time on your platform"
  1014. #endif
  1015.  
  1016.  
  1017.  
  1018.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement