Untitled

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project

#include "cu.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_functions.h>
#include <helper_cuda.h>


__global__ void ShortIntArray2FloatMatrixAndZeros(int C, int O, short int * trace, cufftReal * matrix) {
    matrix[blockIdx.x*(2 * C) + threadIdx.x] = (float)trace[blockIdx.x*O + threadIdx.x];
    //int j = blockIdx.x*(2 * windowSize) + threadIdx.x;
    //matrix[j + windowSize] = 0;
}

__global__ void conjugateAndMultiply(int step, cufftComplex * __restrict__ matrixa, cufftComplex * __restrict__ matrixb, cufftComplex * __restrict__ matrixc) {
    //(a+ib)*(c-id)=a*c-i*a*d+i*b*c+b*d=ac+bd+i(bc-ad)
    int i = threadIdx.x + blockIdx.x*step;
    matrixc[i].x = matrixa[i].x*matrixb[i].x + matrixa[i].y*matrixb[i].y;
    matrixc[i].y = matrixa[i].y*matrixb[i].x - matrixa[i].x*matrixb[i].y;
}

__global__ void getMax(int halfProbeWindowSize, int windowSize, float * maxArray, int * maxArrayInd, cufftReal * __restrict__ d_matrixtemp) {
    float max = 0;
    int maxInd = 0;
    int windowStart = blockIdx.x * (2 * windowSize);

    for (int i = (512 - halfProbeWindowSize); i != halfProbeWindowSize; i++) { //i++
        if (max < d_matrixtemp[windowStart + i]) {
            max = d_matrixtemp[windowStart + i];
            maxInd = i;
        }
        if (i == 511) {
            i = -1;
        }
    }

    if (maxInd > 256) {
        maxInd = maxInd - 512;
    }
    maxArray[blockIdx.x] = max;
    maxArrayInd[blockIdx.x] = maxInd;
}


__global__ void updateRef(int * __restrict__ hostMaxArrayInd, cufftComplex * deviceFFTOutput_ref, cufftComplex * __restrict__ deviceFFTOutput) {

    if (hostMaxArray[blockIdx.x] > 4) {
        deviceFFTOutput_ref[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x] = deviceFFTOutput[blockIdx.x * (DATASIZE/2 + 1) + threadIdx.x];
    }
}


pointers initializeCUDA(short int * h_traces, config conf) {

    // Receives traces from digitizer (IVICOM), copies the first reference trace to the device memory
    // and allocates all the memory on the device.
    // Does the operations for the first full reference trace.

    int windowSize = conf.windowSize;
    int jumpSize = conf.jumpSize;
    int N = conf.N;
    int numRecords = conf.numRecords;

    // initialize constants
    const int DATASIZE = 2 * windowSize;
    const int BATCHSIZE = ((N - windowSize) / jumpSize);

    // generate output struct variable
    pointers ptrs;

    // allocate memory on device for traces, ref_trace, padded matrices, fftOutputs, convOutput, maxInd and maxOutput
    short int *d_traces;                //Allocate memory for tracematrix on device
    checkCudaErrors(cudaMalloc((void**)&d_traces, N * numRecords * sizeof(short int)));
    short int *d_trace_ref;             //Allocate memory for reference trace on device
    checkCudaErrors(cudaMalloc((void**)&d_trace_ref, N * sizeof(short int)));
    short int *d_trace_current;             //Allocate memory for analyzed trace on device
    checkCudaErrors(cudaMalloc((void**)&d_trace_current, N * sizeof(short int)));

    cufftReal *d_matrix;                //Allocate memory for padded trace to be tested on device
    checkCudaErrors(cudaMalloc((void**)&d_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
    cufftReal *d_matrixref;             //Allocate memory for padded reference trace
    checkCudaErrors(cudaMalloc((void**)&d_matrixref, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
    cufftReal *d_matrixtemp;            //Allocate memory for IFFT output matrix
    checkCudaErrors(cudaMalloc((void**)&d_matrixtemp, DATASIZE * BATCHSIZE * sizeof(cufftReal)));

    cufftComplex *deviceFFTOutput;      // Allocate memory for the tested trace FFT output
    checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
    cufftComplex *deviceFFTOutput_ref;  // Allocate memory for the reference trace FFT Output
    checkCudaErrors(cudaMalloc((void**)&deviceFFTOutput_ref, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
    cufftComplex *deviceConvOutput;     // Allocate memory for the convolution output (in the Frequency domain)
    checkCudaErrors(cudaMalloc((void**)&deviceConvOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));

    float *deviceMaxArray;              // Allocate memory for the array of max amplitudes
    checkCudaErrors(cudaMalloc((void**)&deviceMaxArray, BATCHSIZE * sizeof(float)));
    int *deviceMaxArrayInd;             // Allocate memory for the array of max index (measured shift)
    checkCudaErrors(cudaMalloc((void**)&deviceMaxArrayInd, BATCHSIZE * sizeof(int)));

    float *hostMaxArray;
    checkCudaErrors(cudaMallocHost((void**)&hostMaxArray, BATCHSIZE * sizeof(float)));
    int *hostMaxArrayInd;
    checkCudaErrors(cudaMallocHost((void**)&hostMaxArrayInd, BATCHSIZE * sizeof(int)));

    ptrs.d_traces = d_traces;
    ptrs.d_trace_ref = d_trace_ref;
    ptrs.d_trace_current = d_trace_current;
    ptrs.d_matrix = d_matrix;
    ptrs.d_matrixref = d_matrixref;
    ptrs.d_matrixtemp = d_matrixtemp;
    ptrs.deviceFFTOutput = deviceFFTOutput;
    ptrs.deviceFFTOutput_ref = deviceFFTOutput_ref;
    ptrs.deviceConvOutput = deviceConvOutput;
    ptrs.deviceMaxArray = deviceMaxArray;
    ptrs.deviceMaxArrayInd = deviceMaxArrayInd;
    ptrs.hostMaxArray = hostMaxArray;
    ptrs.hostMaxArrayInd = hostMaxArrayInd;

    // testing constants
            cufftComplex *hostFFTOutput;        // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the FFT
            checkCudaErrors(cudaMallocHost((void**)&hostFFTOutput, (DATASIZE / 2 + 1) * BATCHSIZE * sizeof(cufftComplex)));
            cufftReal *h_matrix;                // TESTING CONSTANT ONLY: Allocate memory on HOST for the result of the IFFT
            checkCudaErrors(cudaMallocHost((void**)&h_matrix, DATASIZE * BATCHSIZE * sizeof(cufftReal)));
            ptrs.realTest = h_matrix;
            //ptrs.complexTest = hostFFTOutput;


    // create cuFFT plans and handles for FFT and IFFT
    cufftHandle plan;
    cufftHandle plan2;

    int rank = 1;
    int n[] = { DATASIZE };
    int istride = 1, ostride = 1;
    int idist = DATASIZE, odist = (DATASIZE / 2 + 1);
    int inembed[] = { 0 }, onembed[] = { 0 };
    int batch = BATCHSIZE;

    cufftPlanMany(&plan, rank, n,                   //FFT
        inembed, istride, idist,                    //The real-to-complex transform is implicitly a forward transform
        onembed, ostride, odist, CUFFT_R2C, batch);

    cufftPlanMany(&plan2, rank, n,                  //IFFT
        inembed, istride, odist,                    // The complex-to-real transform is implicitly inverse.
        onembed, ostride, idist, CUFFT_C2R, batch);

    // Initialize first trace
    checkCudaErrors(cudaMemcpy(d_trace_ref, h_traces, N * sizeof(short int), cudaMemcpyHostToDevice));
    // Zero-pad first trace
    cudaMemset(&d_matrixref, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));
    ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_ref, ptrs.d_matrixref);
    // FFT first trace
    checkCudaErrors(cufftExecR2C(plan, d_matrixref, deviceFFTOutput_ref));

    ptrs.plan = plan;
    ptrs.plan2 = plan2;

    return ptrs;
}


void executeCUDA(short int * h_traces, config conf, pointers ptrs) {

    //int threshold = 4;
    const int DATASIZE = 2 * conf.windowSize;
    const int BATCHSIZE = ((conf.N - conf.windowSize) / conf.jumpSize);

    // COPY HOST TRACE TO DEVICE MEMORY
    printf("%d, %d\n", conf.N, conf.numRecords);
    checkCudaErrors(cudaMemcpy(ptrs.d_traces, h_traces, conf.N * conf.numRecords * sizeof(short int), cudaMemcpyHostToDevice));


    for (int i = 0; i < conf.numRecords; i++) {
        printf("%d", i);
        ptrs.d_trace_current = &(ptrs.d_traces[i*DATASIZE/2*BATCHSIZE]);

        // PAD EACH WINDOW WITH ZEROES  --> PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
        cudaMemset(&ptrs.d_matrix, 0, DATASIZE * BATCHSIZE * sizeof(cufftReal));

        ShortIntArray2FloatMatrixAndZeros << < BATCHSIZE, DATASIZE / 2 >> > (conf.windowSize, conf.jumpSize, ptrs.d_trace_current, ptrs.d_matrix);

        // FFT EACH WINDOW              -->  PODE-SE FAZER AOS TRACES TODOS DE UMA VEZ...
        checkCudaErrors(cufftExecR2C(ptrs.plan, ptrs.d_matrix, ptrs.deviceFFTOutput));

        // CONVOLVE
        conjugateAndMultiply << < BATCHSIZE, DATASIZE / 2 + 1 >> >(conf.windowSize, ptrs.deviceFFTOutput, ptrs.deviceFFTOutput_ref, ptrs.deviceConvOutput);

        // IFFT
        checkCudaErrors(cufftExecC2R(ptrs.plan2, ptrs.deviceConvOutput, ptrs.d_matrixtemp));

        // FIND MAX SHIFT WITHIN SHORT WINDOW
        getMax << < BATCHSIZE, 1 >> > (conf.probeWindowSize / 2, conf.jumpSize, ptrs.deviceMaxArray, ptrs.deviceMaxArrayInd, ptrs.d_matrixtemp);

        // UPDATE REFERENCE
        updateRef <<< BATCHSIZE, DATASIZE/2 + 1 >>> (ptrs.deviceMaxArrayInd, ptrs.deviceFFTOutput_ref, ptrs.deviceFFTOutput);

        // COPY RESULTS FROM DEVICE TO HOST
        checkCudaErrors(cudaMemcpy(ptrs.hostMaxArray, ptrs.deviceMaxArray, BATCHSIZE * sizeof(float), cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(ptrs.hostMaxArrayInd, ptrs.deviceMaxArrayInd, BATCHSIZE * sizeof(int), cudaMemcpyDeviceToHost));

    }

}


void finalizeCUDA(int N, pointers ptrs) {
    // PRINT RESULTS TO FILE
    FILE *f = fopen("C:/Users/FOCUS-DAS/Desktop/file.txt", "w");
    if (f == NULL)
    {
        printf("Error opening file!\n");
        exit(1);
    }

    for (int i = 0; i < N; i++) {
        //printf("%d:\t %f\t+ j %f\n", i, (h_matrix[i].x), (h_matrix[i].y));
        //fprintf(f, "%f\t%f\n",hostFFTOutput[i].x, hostFFTOutput[i].y);
        //fprintf(f, "%f\n",ptrs.realTest[i]);
        fprintf(f, "%d\n", ptrs.hostMaxArrayInd[i]);
        //fprintf(f, "%d\t%d\n", h_trace[i],h_trace2[i]);
    }
    fclose(f);

    cudaDeviceReset();
}