Advertisement
Guest User

Untitled

a guest
Mar 26th, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 10.07 KB | None | 0 0
  1. // includes, system
  2. #include <stdlib.h>
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <math.h>
  6. // includes, project
  7.  
  8. #include "cu.h"
  9. #include <cuda_runtime.h>
  10. #include <cufft.h>
  11. #include <cufftXt.h>
  12. #include <helper_functions.h>
  13. #include <helper_cuda.h>
  14.  
  15.  
  16. __global__ void ShortIntArray2FloatMatrixAndZeros(int C, int O, short int * trace, cufftReal * matrix) {
  17.     matrix[blockIdx.x*(2 * C) + threadIdx.x] = (float)trace[blockIdx.x*O + threadIdx.x];
  18.     //int j = blockIdx.x*(2 * windowSize) + threadIdx.x;
  19.     //matrix[j + windowSize] = 0;
  20. }
  21.  
  22. __global__ void conjugateAndMultiply(int step, cufftComplex * __restrict__ matrixa, cufftComplex * __restrict__ matrixb, cufftComplex * __restrict__ matrixc) {
  23.     //(a+ib)*(c-id)=a*c-i*a*d+i*b*c+b*d=ac+bd+i(bc-ad)
  24.     int i = threadIdx.x + blockIdx.x*step;
  25.     matrixc[i].x = matrixa[i].x*matrixb[i].x + matrixa[i].y*matrixb[i].y;
  26.     matrixc[i].y = matrixa[i].y*matrixb[i].x - matrixa[i].x*matrixb[i].y;
  27. }
  28.  
  29. __global__ void getMax(int halfProbeWindowSize, int windowSize, float * maxArray, int * maxArrayInd, cufftReal * __restrict__ d_matrixtemp) {
  30.     float max = 0;
  31.     int maxInd = 0;
  32.     int windowStart = blockIdx.x * (2 * windowSize);
  33.  
  34.     for (int i = (512 - halfProbeWindowSize); i != halfProbeWindowSize; i++) { //i++
  35.         if (max < d_matrixtemp[windowStart + i]) {
  36.             max = d_matrixtemp[windowStart + i];
  37.             maxInd = i;
  38.         }
  39.         if (i == 511) {
  40.             i = -1;
  41.         }
  42.     }
  43.  
  44.     if (maxInd > 256) {
  45.         maxInd = maxInd - 512;
  46.     }
  47.     maxArray[blockIdx.x] = max;
  48.     maxArrayInd[blockIdx.x] = maxInd;
  49. }
  50.  
  51.  
  52. __global__ void updateRef(int * __restrict__ hostMaxArrayInd, cufftComplex * deviceFFTOutput_ref, cufftComplex * __restrict__ deviceFFTOutput) {
  53.  
  54.     if (hostMaxArray[blockIdx.x] > 4) {
  55.         deviceFFTOutput_ref[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x] = deviceFFTOutput[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x];
  56.     }
  57. }
  58.  
  59.  
  60. pointers initializeCUDA(short int * h_traces, config conf) {
  61.  
  62.     // Receives traces from digitizer (IVICOM), copies the first reference trace to the device memory
  63.     // and allocates all the memory on the device.
  64.     // Does the operations for the first full reference trace.
  65.  
  66.     int windowSize = conf.windowSize;
  67.     int jumpSize = conf.jumpSize;
  68.     int N = conf.N;
  69.     int numRecords = conf.numRecords;
  70.      
  71.     // initialize constants
  72.     const int DATASIZE = 2 * windowSize;
  73.     const int BATCHSIZE = ((N - windowSize) / jumpSize);
  74.    
  75.     // generate output struct variable
  76.     pointers ptrs;
  77.  
  78.     // allocate memory on device for traces, ref_trace, padded matrices, fftOutputs, convOutput, maxInd and maxOutput
  79.     short int *d_traces;                //Allocate memory for tracematrix on device
  80.     checkCudaErrors(cudaMalloc((void**)&d_traces, N * numRecords * sizeof(short int)));  
  81.     short int *d_trace_ref;             //Allocate memory for reference trace on device
  82.     checkCudaErrors(cudaMalloc((void**)&d_trace_ref, N * sizeof(short int)));
  83.     short int *d_trace_current;             //Allocate memory for analyzed trace on device
  84.     checkCudaErrors(cudaMalloc((void**)&d_trace_current, N * sizeof(short int)));
  85.  
  86.     cufftReal *d_matrix;                //Allocate memory for padded trace to be tested on device
  87.     checkCudaErrors(cudaMalloc((void**)&d_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));  
  88.     cufftReal *d_matrixref;             //Allocate memory for padded reference trace
  89.     checkCudaErrors(cudaMalloc((void**)&d_matrixref, DATASIZE * BATCHSIZE * sizeof(cufftReal)));    
  90.     cufftReal *d_matrixtemp;            //Allocate memory for IFFT output matrix
  91.     checkCudaErrors(cudaMalloc((void**)&d_matrixtemp, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
  92.  
  93.     cufftComplex *deviceFFTOutput;      // Allocate memory for the tested trace FFT output
  94.     checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
  95.     cufftComplex *deviceFFTOutput_ref;  // Allocate memory for the reference trace FFT Output
  96.     checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput_ref, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
  97.     cufftComplex *deviceConvOutput;     // Allocate memory for the convolution output (in the Frequency domain)
  98.     checkCudaErrors(cudaMalloc((void**)&deviceConvOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
  99.  
  100.     float *deviceMaxArray;              // Allocate memory for the array of max amplitudes
  101.     checkCudaErrors(cudaMalloc((void**)&deviceMaxArray, BATCHSIZE * sizeof(float)));
  102.     int *deviceMaxArrayInd;             // Allocate memory for the array of max index (measured shift)
  103.     checkCudaErrors(cudaMalloc((void**)&deviceMaxArrayInd, BATCHSIZE * sizeof(int)));
  104.  
  105.     float *hostMaxArray;
  106.     checkCudaErrors(cudaMallocHost((void**)&hostMaxArray, BATCHSIZE * sizeof(float)));
  107.     int *hostMaxArrayInd;
  108.     checkCudaErrors(cudaMallocHost((void**)&hostMaxArrayInd, BATCHSIZE * sizeof(int)));
  109.  
  110.     ptrs.d_traces = d_traces;
  111.     ptrs.d_trace_ref = d_trace_ref;
  112.     ptrs.d_trace_current = d_trace_current;
  113.     ptrs.d_matrix = d_matrix;
  114.     ptrs.d_matrixref = d_matrixref;
  115.     ptrs.d_matrixtemp = d_matrixtemp;
  116.     ptrs.deviceFFTOutput = deviceFFTOutput;
  117.     ptrs.deviceFFTOutput_ref = deviceFFTOutput_ref;
  118.     ptrs.deviceConvOutput = deviceConvOutput;
  119.     ptrs.deviceMaxArray = deviceMaxArray;
  120.     ptrs.deviceMaxArrayInd = deviceMaxArrayInd;
  121.     ptrs.hostMaxArray = hostMaxArray;
  122.     ptrs.hostMaxArrayInd = hostMaxArrayInd;
  123.  
  124.     // testing constants
  125.             cufftComplex *hostFFTOutput;        // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the FFT
  126.             checkCudaErrors(cudaMallocHost((void**)&hostFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
  127.             cufftReal *h_matrix;                // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the IFFT
  128.             checkCudaErrors(cudaMallocHost((void**)&h_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
  129.             ptrs.realTest = h_matrix;
  130.             //ptrs.complexTest = hostFFTOutput;
  131.  
  132.  
  133.     // create cuFFT plans and handles for FFT and IFFT
  134.     cufftHandle plan;
  135.     cufftHandle plan2;
  136.  
  137.     int rank = 1;
  138.     int n[] = { DATASIZE };
  139.     int istride = 1, ostride = 1;
  140.     int idist = DATASIZE, odist = (DATASIZE / 2 + 1);
  141.     int inembed[] = { 0 }, onembed[] = { 0 };
  142.     int batch = BATCHSIZE;
  143.  
  144.     cufftPlanMany(&plan, rank, n,                   //FFT
  145.         inembed, istride, idist,                    //The real-to-complex transform is implicitly a forward transform
  146.         onembed, ostride, odist, CUFFT_R2C, batch);
  147.  
  148.     cufftPlanMany(&plan2, rank, n,                  //IFFT
  149.         inembed, istride, odist,                    // The complex-to-real transform is implicitly inverse.
  150.         onembed, ostride, idist, CUFFT_C2R, batch);
  151.  
  152.     // Initialize first trace
  153.     checkCudaErrors(cudaMemcpy(d_trace_ref, h_traces, N * sizeof(short int), cudaMemcpyHostToDevice));
  154.     // Zero-pad first trace
  155.     cudaMemset(&d_matrixref, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));
  156.     ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_ref, ptrs.d_matrixref);
  157.     // FFT first trace
  158.     checkCudaErrors(cufftExecR2C(plan, d_matrixref, deviceFFTOutput_ref));
  159.  
  160.     ptrs.plan = plan;
  161.     ptrs.plan2 = plan2;
  162.  
  163.     return ptrs;
  164. }
  165.  
  166.  
  167. void executeCUDA(short int * h_traces, config conf, pointers ptrs) {
  168.  
  169.     //int threshold = 4;
  170.     const int DATASIZE = 2 * conf.windowSize;
  171.     const int BATCHSIZE = ((conf.N - conf.windowSize) / conf.jumpSize);
  172.  
  173.     // COPY HOST TRACE TO DEVICE MEMORY
  174.     printf("%d, %d\n", conf.N, conf.numRecords);
  175.     checkCudaErrors(cudaMemcpy(ptrs.d_traces, h_traces, conf.N * conf.numRecords * sizeof(short int), cudaMemcpyHostToDevice));
  176.  
  177.    
  178.     for (int i = 0; i < conf.numRecords; i++) {
  179.         printf("%d", i);
  180.         ptrs.d_trace_current = &(ptrs.d_traces[i*DATASIZE/2*BATCHSIZE]);
  181.        
  182.         // PAD EACH WINDOW WITH ZEROES  --> PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
  183.         cudaMemset(&ptrs.d_matrix, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));
  184.  
  185.         ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_current, ptrs.d_matrix);
  186.  
  187.         // FFT EACH WINDOW              -->  PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
  188.         checkCudaErrors(cufftExecR2C(ptrs.plan, ptrs.d_matrix, ptrs.deviceFFTOutput));
  189.        
  190.         // CONVOLVE
  191.         conjugateAndMultiply << < BATCHSIZE, DATASIZE / 2 + 1 >> >(conf.windowSize, ptrs.deviceFFTOutput, ptrs.deviceFFTOutput_ref, ptrs.deviceConvOutput);
  192.  
  193.         // IFFT
  194.         checkCudaErrors(cufftExecC2R(ptrs.plan2, ptrs.deviceConvOutput, ptrs.d_matrixtemp));
  195.        
  196.         // FIND MAX SHIFT WITHIN SHORT WINDOW
  197.         getMax << < BATCHSIZE, 1 >> > (conf.probeWindowSize / 2, conf.jumpSize, ptrs.deviceMaxArray, ptrs.deviceMaxArrayInd, ptrs.d_matrixtemp);
  198.        
  199.         // UPDATE REFERENCE
  200.         updateRef <<< BATCHSIZE, DATASIZE/2 + 1 >>> (ptrs.deviceMaxArrayInd, ptrs.deviceFFTOutput_ref, ptrs.deviceFFTOutput);
  201.  
  202.         // COPY RESULTS FROM DEVICE TO HOST
  203.         checkCudaErrors(cudaMemcpy(ptrs.hostMaxArray, ptrs.deviceMaxArray, BATCHSIZE * sizeof(float), cudaMemcpyDeviceToHost));
  204.         checkCudaErrors(cudaMemcpy(ptrs.hostMaxArrayInd, ptrs.deviceMaxArrayInd, BATCHSIZE * sizeof(int), cudaMemcpyDeviceToHost));
  205.  
  206.     }
  207.    
  208. }
  209.  
  210.  
  211. void finalizeCUDA(int N, pointers ptrs) {
  212.     // PRINT RESULTS TO FILE
  213.     FILE *f = fopen("C:/Users/FOCUS-DAS/Desktop/file.txt", "w");
  214.     if (f == NULL)
  215.     {
  216.         printf("Error opening file!\n");
  217.         exit(1);
  218.     }
  219.  
  220.     for (int i = 0; i < N; i++) {
  221.         //printf("%d:\t %f\t+ j %f\n", i, (h_matrix[i].x), (h_matrix[i].y));
  222.         //fprintf(f, "%f\t%f\n",hostFFTOutput[i].x, hostFFTOutput[i].y);
  223.         //fprintf(f, "%f\n",ptrs.realTest[i]);
  224.         fprintf(f, "%d\n", ptrs.hostMaxArrayInd[i]);
  225.         //fprintf(f, "%d\t%d\n", h_trace[i],h_trace2[i]);
  226.     }
  227.     fclose(f);
  228.  
  229.     cudaDeviceReset();
  230. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement