Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // includes, system
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <math.h>
- // includes, project
- #include "cu.h"
- #include <cuda_runtime.h>
- #include <cufft.h>
- #include <cufftXt.h>
- #include <helper_functions.h>
- #include <helper_cuda.h>
- __global__ void ShortIntArray2FloatMatrixAndZeros(int C, int O, short int * trace, cufftReal * matrix) {
- matrix[blockIdx.x*(2 * C) + threadIdx.x] = (float)trace[blockIdx.x*O + threadIdx.x];
- //int j = blockIdx.x*(2 * windowSize) + threadIdx.x;
- //matrix[j + windowSize] = 0;
- }
- __global__ void conjugateAndMultiply(int step, cufftComplex * __restrict__ matrixa, cufftComplex * __restrict__ matrixb, cufftComplex * __restrict__ matrixc) {
- //(a+ib)*(c-id)=a*c-i*a*d+i*b*c+b*d=ac+bd+i(bc-ad)
- int i = threadIdx.x + blockIdx.x*step;
- matrixc[i].x = matrixa[i].x*matrixb[i].x + matrixa[i].y*matrixb[i].y;
- matrixc[i].y = matrixa[i].y*matrixb[i].x - matrixa[i].x*matrixb[i].y;
- }
- __global__ void getMax(int halfProbeWindowSize, int windowSize, float * maxArray, int * maxArrayInd, cufftReal * __restrict__ d_matrixtemp) {
- float max = 0;
- int maxInd = 0;
- int windowStart = blockIdx.x * (2 * windowSize);
- for (int i = (512 - halfProbeWindowSize); i != halfProbeWindowSize; i++) { //i++
- if (max < d_matrixtemp[windowStart + i]) {
- max = d_matrixtemp[windowStart + i];
- maxInd = i;
- }
- if (i == 511) {
- i = -1;
- }
- }
- if (maxInd > 256) {
- maxInd = maxInd - 512;
- }
- maxArray[blockIdx.x] = max;
- maxArrayInd[blockIdx.x] = maxInd;
- }
- __global__ void updateRef(int * __restrict__ hostMaxArrayInd, cufftComplex * deviceFFTOutput_ref, cufftComplex * __restrict__ deviceFFTOutput) {
- if (hostMaxArray[blockIdx.x] > 4) {
- deviceFFTOutput_ref[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x] = deviceFFTOutput[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x];
- }
- }
- pointers initializeCUDA(short int * h_traces, config conf) {
- // Receives traces from digitizer (IVICOM), copies the first reference trace to the device memory
- // and allocates all the memory on the device.
- // Does the operations for the first full reference trace.
- int windowSize = conf.windowSize;
- int jumpSize = conf.jumpSize;
- int N = conf.N;
- int numRecords = conf.numRecords;
- // initialize constants
- const int DATASIZE = 2 * windowSize;
- const int BATCHSIZE = ((N - windowSize) / jumpSize);
- // generate output struct variable
- pointers ptrs;
- // allocate memory on device for traces, ref_trace, padded matrices, fftOutputs, convOutput, maxInd and maxOutput
- short int *d_traces; //Allocate memory for tracematrix on device
- checkCudaErrors(cudaMalloc((void**)&d_traces, N * numRecords * sizeof(short int)));
- short int *d_trace_ref; //Allocate memory for reference trace on device
- checkCudaErrors(cudaMalloc((void**)&d_trace_ref, N * sizeof(short int)));
- short int *d_trace_current; //Allocate memory for analyzed trace on device
- checkCudaErrors(cudaMalloc((void**)&d_trace_current, N * sizeof(short int)));
- cufftReal *d_matrix; //Allocate memory for padded trace to be tested on device
- checkCudaErrors(cudaMalloc((void**)&d_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
- cufftReal *d_matrixref; //Allocate memory for padded reference trace
- checkCudaErrors(cudaMalloc((void**)&d_matrixref, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
- cufftReal *d_matrixtemp; //Allocate memory for IFFT output matrix
- checkCudaErrors(cudaMalloc((void**)&d_matrixtemp, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
- cufftComplex *deviceFFTOutput; // Allocate memory for the tested trace FFT output
- checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
- cufftComplex *deviceFFTOutput_ref; // Allocate memory for the reference trace FFT Output
- checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput_ref, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
- cufftComplex *deviceConvOutput; // Allocate memory for the convolution output (in the Frequency domain)
- checkCudaErrors(cudaMalloc((void**)&deviceConvOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
- float *deviceMaxArray; // Allocate memory for the array of max amplitudes
- checkCudaErrors(cudaMalloc((void**)&deviceMaxArray, BATCHSIZE * sizeof(float)));
- int *deviceMaxArrayInd; // Allocate memory for the array of max index (measured shift)
- checkCudaErrors(cudaMalloc((void**)&deviceMaxArrayInd, BATCHSIZE * sizeof(int)));
- float *hostMaxArray;
- checkCudaErrors(cudaMallocHost((void**)&hostMaxArray, BATCHSIZE * sizeof(float)));
- int *hostMaxArrayInd;
- checkCudaErrors(cudaMallocHost((void**)&hostMaxArrayInd, BATCHSIZE * sizeof(int)));
- ptrs.d_traces = d_traces;
- ptrs.d_trace_ref = d_trace_ref;
- ptrs.d_trace_current = d_trace_current;
- ptrs.d_matrix = d_matrix;
- ptrs.d_matrixref = d_matrixref;
- ptrs.d_matrixtemp = d_matrixtemp;
- ptrs.deviceFFTOutput = deviceFFTOutput;
- ptrs.deviceFFTOutput_ref = deviceFFTOutput_ref;
- ptrs.deviceConvOutput = deviceConvOutput;
- ptrs.deviceMaxArray = deviceMaxArray;
- ptrs.deviceMaxArrayInd = deviceMaxArrayInd;
- ptrs.hostMaxArray = hostMaxArray;
- ptrs.hostMaxArrayInd = hostMaxArrayInd;
- // testing constants
- cufftComplex *hostFFTOutput; // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the FFT
- checkCudaErrors(cudaMallocHost((void**)&hostFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
- cufftReal *h_matrix; // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the IFFT
- checkCudaErrors(cudaMallocHost((void**)&h_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
- ptrs.realTest = h_matrix;
- //ptrs.complexTest = hostFFTOutput;
- // create cuFFT plans and handles for FFT and IFFT
- cufftHandle plan;
- cufftHandle plan2;
- int rank = 1;
- int n[] = { DATASIZE };
- int istride = 1, ostride = 1;
- int idist = DATASIZE, odist = (DATASIZE / 2 + 1);
- int inembed[] = { 0 }, onembed[] = { 0 };
- int batch = BATCHSIZE;
- cufftPlanMany(&plan, rank, n, //FFT
- inembed, istride, idist, //The real-to-complex transform is implicitly a forward transform
- onembed, ostride, odist, CUFFT_R2C, batch);
- cufftPlanMany(&plan2, rank, n, //IFFT
- inembed, istride, odist, // The complex-to-real transform is implicitly inverse.
- onembed, ostride, idist, CUFFT_C2R, batch);
- // Initialize first trace
- checkCudaErrors(cudaMemcpy(d_trace_ref, h_traces, N * sizeof(short int), cudaMemcpyHostToDevice));
- // Zero-pad first trace
- cudaMemset(&d_matrixref, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));
- ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_ref, ptrs.d_matrixref);
- // FFT first trace
- checkCudaErrors(cufftExecR2C(plan, d_matrixref, deviceFFTOutput_ref));
- ptrs.plan = plan;
- ptrs.plan2 = plan2;
- return ptrs;
- }
- void executeCUDA(short int * h_traces, config conf, pointers ptrs) {
- //int threshold = 4;
- const int DATASIZE = 2 * conf.windowSize;
- const int BATCHSIZE = ((conf.N - conf.windowSize) / conf.jumpSize);
- // COPY HOST TRACE TO DEVICE MEMORY
- printf("%d, %d\n", conf.N, conf.numRecords);
- checkCudaErrors(cudaMemcpy(ptrs.d_traces, h_traces, conf.N * conf.numRecords * sizeof(short int), cudaMemcpyHostToDevice));
- for (int i = 0; i < conf.numRecords; i++) {
- printf("%d", i);
- ptrs.d_trace_current = &(ptrs.d_traces[i*DATASIZE/2*BATCHSIZE]);
- // PAD EACH WINDOW WITH ZEROES --> PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
- cudaMemset(&ptrs.d_matrix, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));
- ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_current, ptrs.d_matrix);
- // FFT EACH WINDOW --> PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
- checkCudaErrors(cufftExecR2C(ptrs.plan, ptrs.d_matrix, ptrs.deviceFFTOutput));
- // CONVOLVE
- conjugateAndMultiply << < BATCHSIZE, DATASIZE / 2 + 1 >> >(conf.windowSize, ptrs.deviceFFTOutput, ptrs.deviceFFTOutput_ref, ptrs.deviceConvOutput);
- // IFFT
- checkCudaErrors(cufftExecC2R(ptrs.plan2, ptrs.deviceConvOutput, ptrs.d_matrixtemp));
- // FIND MAX SHIFT WITHIN SHORT WINDOW
- getMax << < BATCHSIZE, 1 >> > (conf.probeWindowSize / 2, conf.jumpSize, ptrs.deviceMaxArray, ptrs.deviceMaxArrayInd, ptrs.d_matrixtemp);
- // UPDATE REFERENCE
- updateRef <<< BATCHSIZE, DATASIZE/2 + 1 >>> (ptrs.deviceMaxArrayInd, ptrs.deviceFFTOutput_ref, ptrs.deviceFFTOutput);
- // COPY RESULTS FROM DEVICE TO HOST
- checkCudaErrors(cudaMemcpy(ptrs.hostMaxArray, ptrs.deviceMaxArray, BATCHSIZE * sizeof(float), cudaMemcpyDeviceToHost));
- checkCudaErrors(cudaMemcpy(ptrs.hostMaxArrayInd, ptrs.deviceMaxArrayInd, BATCHSIZE * sizeof(int), cudaMemcpyDeviceToHost));
- }
- }
- void finalizeCUDA(int N, pointers ptrs) {
- // PRINT RESULTS TO FILE
- FILE *f = fopen("C:/Users/FOCUS-DAS/Desktop/file.txt", "w");
- if (f == NULL)
- {
- printf("Error opening file!\n");
- exit(1);
- }
- for (int i = 0; i < N; i++) {
- //printf("%d:\t %f\t+ j %f\n", i, (h_matrix[i].x), (h_matrix[i].y));
- //fprintf(f, "%f\t%f\n",hostFFTOutput[i].x, hostFFTOutput[i].y);
- //fprintf(f, "%f\n",ptrs.realTest[i]);
- fprintf(f, "%d\n", ptrs.hostMaxArrayInd[i]);
- //fprintf(f, "%d\t%d\n", h_trace[i],h_trace2[i]);
- }
- fclose(f);
- cudaDeviceReset();
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement