Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <math.h>
// Recorded timings: naive kernel 652.052 us, shared-memory kernel 1.942 ms.
// CPU: 5 ms.
// memcpy host-to-device: 10.368 us, 5 invocations, 86.008 kB in total.
// memcpy device-to-host: 3.904 us, 2 invocations, 32.768 kB in total.
// Total: 450.74 ms.
// Compile-time switches: each enabled macro makes main() print the corresponding vector.
//#define COMP_SW__PRINT_SIGNAL
//#define COMP_SW__PRINT_TEMPLATE
//#define COMP_SW__PRINT_TEMPLATE_PADD
//#define COMP_SW__PRINT_MEDII_PADD
#define COMP_SW__PRINT_RESULT_CPU
#define COMP_SW__PRINT_RESULT_GPU

// Problem size: number of signal samples and number of template samples.
#define DIM_SIGNAL (2559u)
#define DIM_TEMPLATE (512u)
// Number of template positions inside the signal (one correlation value per position).
#define DIM_XCORR (DIM_SIGNAL - DIM_TEMPLATE + 1)

// The launch geometry below yields NO_BLOCKS * NO_THREADS_BLOCK threads. That product equals
// DIM_XCORR only when DIM_SIGNAL % DIM_TEMPLATE == DIM_TEMPLATE - 1, so the sizes are
// validated at compile time instead of silently launching too few threads.
#if ((DIM_SIGNAL % DIM_TEMPLATE) != (DIM_TEMPLATE - 1))
#error "Alege o alta dimensiune a semnalului / template-ului care satisface conditia."
#endif

// One thread per template position; every block handles DIM_TEMPLATE consecutive positions.
#define NO_THREADS_BLOCK DIM_TEMPLATE
#define NO_BLOCKS (DIM_SIGNAL / DIM_TEMPLATE)
// Returns one pseudo-random value in [min, max], derived from a single rand() call.
// Seeding the generator (srand) is left to the caller.
double getRandom(double min, double max)
{
    // How many rand() steps correspond to one unit of the requested interval.
    const double stepsPerUnit = RAND_MAX / (max - min);
    return min + ((double)rand() / stepsPerUnit);
}
//_______________________________________________________________________
// Template matching with normalized cross-correlation --- sequential (CPU)
//_______________________________________________________________________
// For every shift t in [0, dimSig - dimTem] writes
//   xcorr[t] = sum(s_i * t_i) / (sqrt(sum(s_i^2)) * sqrt(sum(t_i^2)))
// where s_i = sig[t + i] - medSig and t_i = tem[i] - medTem, i in [0, dimTem).
//
// Parameters:
//   xcorr  - output, at least dimSig - dimTem + 1 elements
//   sig    - signal, dimSig elements
//   tem    - template, dimTem elements
//   medSig - value subtracted from every signal sample
//   medTem - value subtracted from every template sample
//   dimTem - number of template samples
//   dimSig - number of signal samples
//
// When the template is longer than the signal no window fits and xcorr is left untouched.
void CPU_xCorr(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
{
    // Guard against unsigned wrap-around of dimSig - dimTem.
    if (dimTem > dimSig)
    {
        return;
    }

    // The template term of the denominator does not depend on the shift: compute it once.
    double sumNumitor_T = 0.0;
    for (unsigned int i = 0; i < dimTem; ++i)
    {
        const double l_T = tem[i] - medTem;
        sumNumitor_T += l_T * l_T;
    }
    const double normTem = sqrt(sumNumitor_T);

    const unsigned int lastShift = dimSig - dimTem;
    for (unsigned int t = 0; t <= lastShift; ++t)
    {
        double sumNumarator = 0.0;
        double sumNumitor_S = 0.0;
        for (unsigned int i = 0; i < dimTem; ++i)
        {
            const double l_S = sig[i + t] - medSig;
            const double l_T = tem[i] - medTem;
            sumNumarator += l_S * l_T;
            sumNumitor_S += l_S * l_S;
        }
        xcorr[t] = sumNumarator / (sqrt(sumNumitor_S) * normTem);
    }
}
//_______________________________________________________________________
// Template matching with normalized cross-correlation --- parallel (GPU)
//_______________________________________________________________________
// Naive kernel: one thread per template position, every operand read from global memory.
//
// Launch layout: 1-D grid of 1-D blocks. Thread g = blockDim.x * blockIdx.x + threadIdx.x
// computes xcorr[g]. Threads whose window would run past the end of the signal return
// without reading or writing, so the grid may be rounded up freely.
// Shared memory: none.
//
// Parameters:
//   xcorr  - output, at least dimSig - dimTem + 1 elements
//   sig    - signal, dimSig elements
//   tem    - template, dimTem elements
//   medSig - value subtracted from every signal sample
//   medTem - value subtracted from every template sample
//   dimTem - number of template samples
//   dimSig - number of signal samples
__global__ void naive__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double medSig, double medTem, unsigned int dimTem, unsigned int dimSig)
{
    const unsigned int thrd_offs = blockDim.x * blockIdx.x + threadIdx.x;

    // Valid positions are 0 .. dimSig - dimTem; the first test prevents unsigned wrap-around.
    if (dimTem > dimSig || thrd_offs > dimSig - dimTem)
    {
        return;
    }

    double sumNumarator = 0.0;
    double sumNumitor_S = 0.0;
    double sumNumitor_T = 0.0;
    for (unsigned int i = 0; i < dimTem; ++i)
    {
        const double l_S = sig[thrd_offs + i] - medSig;
        const double l_T = tem[i] - medTem;
        sumNumarator += l_S * l_T;
        sumNumitor_S += l_S * l_S;
        sumNumitor_T += l_T * l_T;
    }
    xcorr[thrd_offs] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
}
// Shared-memory kernel: each block stages the signal window it needs and the template in
// shared memory, then every thread computes one correlation value from the staged data.
//
// Launch layout: 1-D grid of 1-D blocks with blockDim.x <= DIM_TEMPLATE (main() launches
// with blockDim.x == DIM_TEMPLATE). Thread g = blockDim.x * blockIdx.x + threadIdx.x
// computes xcorr[g].
// Shared memory: (3 * DIM_TEMPLATE - 1) doubles, statically allocated.
//
// Preconditions (the kernel receives no lengths, so the caller must guarantee them):
//   - sig holds at least blockDim.x * (gridDim.x - 1) + 2 * DIM_TEMPLATE - 1 elements
//   - tem holds at least DIM_TEMPLATE elements
//   - xcorr, medSig and medTem hold at least gridDim.x * blockDim.x elements
//
// Parameters:
//   xcorr  - output, one value per thread
//   sig    - signal samples
//   tem    - template samples (only the first DIM_TEMPLATE are read)
//   medSig - per-thread value subtracted from every signal sample
//   medTem - per-thread value subtracted from every template sample
__global__ void shamem__GPU_xCorr_Template_Matching(double* xcorr, double* sig, double* tem, double* medSig, double* medTem)
{
    __shared__ double sham_sig[2 * DIM_TEMPLATE - 1];
    __shared__ double sham_tem[DIM_TEMPLATE];

    const unsigned int block_offs = blockDim.x * blockIdx.x;
    const unsigned int thrd_offs = block_offs + threadIdx.x;

    // Stage the contiguous signal window [block_offs, block_offs + 2 * DIM_TEMPLATE - 1).
    // The strided loop fills the whole tile for any blockDim.x and never indexes past it.
    for (unsigned int k = threadIdx.x; k < 2 * DIM_TEMPLATE - 1; k += blockDim.x)
    {
        sham_sig[k] = sig[block_offs + k];
    }
    // Stage the template.
    for (unsigned int k = threadIdx.x; k < DIM_TEMPLATE; k += blockDim.x)
    {
        sham_tem[k] = tem[k];
    }

    // Each thread only ever uses its own mean entries, so registers are sufficient.
    const double l_medSig = medSig[thrd_offs];
    const double l_medTem = medTem[thrd_offs];

    // All staged data must be visible before any thread reads another thread's stores.
    __syncthreads();

    double sumNumarator = 0.0;
    double sumNumitor_S = 0.0;
    double sumNumitor_T = 0.0;
    // Terms are accumulated in index order, the same order used by the CPU reference
    // and by the naive kernel.
    for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
    {
        const double l_S = sham_sig[threadIdx.x + i] - l_medSig;
        const double l_T = sham_tem[i] - l_medTem;
        sumNumarator += l_S * l_T;
        sumNumitor_S += l_S * l_S;
        sumNumitor_T += l_T * l_T;
    }
    xcorr[thrd_offs] = sumNumarator / (sqrt(sumNumitor_S) * sqrt(sumNumitor_T));
}
// Runs a CUDA runtime call; on failure prints `what` plus the CUDA error text and jumps to
// the Error label of the enclosing function (which must declare `cudaStatus`).
#define XCORR_CUDA_CHECK(call, what)                                              \
    do {                                                                          \
        cudaStatus = (call);                                                      \
        if (cudaStatus != cudaSuccess) {                                          \
            fprintf(stderr, "%s (%s)\n", (what), cudaGetErrorString(cudaStatus)); \
            goto Error;                                                           \
        }                                                                         \
    } while (0)

// Returns true when |a[i] - b[i]| <= tol for every i in [0, n).
static bool xcorrVectorsMatch(const double* a, const double* b, unsigned int n, double tol)
{
    for (unsigned int i = 0; i < n; ++i)
    {
        // The negated comparison also reports a mismatch when either value is NaN.
        if (!(fabs(a[i] - b[i]) <= tol))
        {
            return false;
        }
    }
    return true;
}

// Generates a random signal and template, computes the normalized cross-correlation with
// the shared-memory kernel, the CPU reference and the naive kernel, prints timings and
// compares both GPU results against the CPU result.
// Returns EXIT_SUCCESS when the naive kernel matches the CPU reference, EXIT_FAILURE on
// any allocation/CUDA error or mismatch of the naive kernel. A mismatch of the
// shared-memory kernel is reported but is not fatal.
int main(void)
{
    // Every initialized local is declared here, ahead of the first `goto`, so that no jump
    // to the cleanup labels crosses an initialization.
    const dim3 dimGrid(NO_BLOCKS);
    const dim3 dimBlock(NO_THREADS_BLOCK);
    const unsigned int paddedCount = DIM_TEMPLATE * NO_BLOCKS;
    const double minn = -10.0;
    const double maxx = 10.0;
    // Host and device may round differently (e.g. fused multiply-add), so results are
    // compared with an absolute tolerance instead of bit-exact equality.
    const double tolerance = 1e-9;

    int exitCode = EXIT_FAILURE;
    cudaError_t cudaStatus = cudaSuccess;
    cudaError_t launchStatus = cudaSuccess;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    clock_t cpu_start;
    float elapsedTime = 0.0f;
    double meanSignal = 0.0;
    double meanTemplate = 0.0;
    double *dev_signal = NULL, *dev_templa = NULL, *dev_xcorr = NULL;
    double *dev_medSignal = NULL, *dev_medTemplate = NULL;
    double *hos_signal = NULL, *hos_templa = NULL;
    double *hos_xcorr = NULL, *hos_xcorr_compare = NULL;
    double hos_medTemplate[DIM_TEMPLATE * NO_BLOCKS];
    double hos_medSignal[DIM_TEMPLATE * NO_BLOCKS];

    srand((unsigned int)time(NULL));

    printf("\ndim_template: %u\n", DIM_TEMPLATE);
    printf("\ndim_threads_per_block: %u\n", NO_THREADS_BLOCK);
    printf("\ndim_grid: %u\n", NO_BLOCKS);
    printf("\ndim_xcorr: %u\n", DIM_XCORR);

    //_______________________________________________________________________
    // Allocate host and device buffers: signal, template, results, means.
    //_______________________________________________________________________
    hos_signal = (double*)malloc(sizeof(double) * DIM_SIGNAL);
    hos_templa = (double*)malloc(sizeof(double) * paddedCount);
    hos_xcorr = (double*)malloc(sizeof(double) * DIM_XCORR);
    hos_xcorr_compare = (double*)malloc(sizeof(double) * DIM_XCORR);
    if (hos_signal == NULL || hos_templa == NULL || hos_xcorr == NULL || hos_xcorr_compare == NULL)
    {
        fprintf(stderr, "malloc failed! - host buffers\n");
        goto Error;
    }

    XCORR_CUDA_CHECK(cudaSetDevice(0), "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
    XCORR_CUDA_CHECK(cudaEventCreate(&start), "cudaEventCreate failed! - start");
    XCORR_CUDA_CHECK(cudaEventCreate(&stop), "cudaEventCreate failed! - stop");
    XCORR_CUDA_CHECK(cudaMalloc((void**)&dev_signal, sizeof(double) * DIM_SIGNAL), "cudaMalloc failed! - device signal");
    XCORR_CUDA_CHECK(cudaMalloc((void**)&dev_templa, sizeof(double) * paddedCount), "cudaMalloc failed! - device template");
    XCORR_CUDA_CHECK(cudaMalloc((void**)&dev_xcorr, sizeof(double) * DIM_XCORR), "cudaMalloc failed! - device xcorr");
    XCORR_CUDA_CHECK(cudaMalloc((void**)&dev_medTemplate, sizeof(double) * paddedCount), "cudaMalloc failed! - device medTemplate");
    XCORR_CUDA_CHECK(cudaMalloc((void**)&dev_medSignal, sizeof(double) * paddedCount), "cudaMalloc failed! - device medSignal");

    //_______________________________________________________________________
    // Fill the signal with random values in [minn, maxx] and compute its mean.
    //_______________________________________________________________________
    for (unsigned int i = 0; i < DIM_SIGNAL; i++)
    {
        hos_signal[i] = getRandom(minn, maxx);
        meanSignal += hos_signal[i];
    }
    meanSignal /= (double)DIM_SIGNAL;
    printf("\nmedia semnal: %f\n", meanSignal);

    //_______________________________________________________________________
    // Fill the template with random values in [-1, 1] and compute its mean.
    //_______________________________________________________________________
    for (unsigned int i = 0; i < DIM_TEMPLATE; i++)
    {
        hos_templa[i] = getRandom(-1.0, 1.0);
        meanTemplate += hos_templa[i];
    }
    meanTemplate /= (double)DIM_TEMPLATE;
    printf("\nmedia template: %f\n", meanTemplate);

#ifdef COMP_SW__PRINT_SIGNAL
    // Print the signal.
    {
        printf("SIGNAL: RAND(%f, %f) :\n\n", minn, maxx);
        for (unsigned int i = 0; i < DIM_SIGNAL; ++i)
        {
            if (i % 10 == 0) printf("\n");
            printf("%f ", hos_signal[i]);
        }
        printf("\n\n");
    }
#endif
#ifdef COMP_SW__PRINT_TEMPLATE
    // Print the template.
    {
        printf("TEMPLATE: RAND(-1, 1) :\n\n");
        for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
        {
            if (i % 10 == 0) printf("\n");
            printf("%f ", hos_templa[i]);
        }
        printf("\n\n");
    }
#endif

    // Repeat the template once per block, so every thread can index it by global id.
    for (unsigned int i = DIM_TEMPLATE; i < paddedCount; i++)
    {
        hos_templa[i] = hos_templa[i % DIM_TEMPLATE];
    }
    // Replicate both means once per thread for the same reason.
    for (unsigned int i = 0; i < paddedCount; i++)
    {
        hos_medSignal[i] = meanSignal;
        hos_medTemplate[i] = meanTemplate;
    }

#ifdef COMP_SW__PRINT_TEMPLATE_PADD
    // Print the replicated template.
    {
        printf("TEMPLATE padded: RAND(-1, 1) :\n\n");
        for (unsigned int i = 0; i < paddedCount; ++i)
        {
            if (i % 10 == 0) printf("\n");
            printf("%f ", hos_templa[i]);
        }
        printf("\n\n");
    }
#endif
#ifdef COMP_SW__PRINT_MEDII_PADD
    // Print the replicated means (each label is followed by its own array).
    {
        printf("Medii: Signal:\n\n");
        for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
        {
            if (i % 10 == 0) printf("\n");
            printf("%f ", hos_medSignal[i]);
        }
        printf("\n\n");
        printf("Medii: Template:\n\n");
        for (unsigned int i = 0; i < DIM_TEMPLATE; ++i)
        {
            if (i % 10 == 0) printf("\n");
            printf("%f ", hos_medTemplate[i]);
        }
        printf("\n\n");
    }
#endif

    //_______________________________________________________________________
    // Upload the inputs. The result buffer is written by the kernels, so it is not uploaded.
    //_______________________________________________________________________
    XCORR_CUDA_CHECK(cudaMemcpy(dev_signal, hos_signal, sizeof(double) * DIM_SIGNAL, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, signal");
    XCORR_CUDA_CHECK(cudaMemcpy(dev_templa, hos_templa, sizeof(double) * paddedCount, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, template");
    XCORR_CUDA_CHECK(cudaMemcpy(dev_medTemplate, hos_medTemplate, sizeof(double) * paddedCount, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, medTemplate");
    XCORR_CUDA_CHECK(cudaMemcpy(dev_medSignal, hos_medSignal, sizeof(double) * paddedCount, cudaMemcpyHostToDevice), "cudaMemcpy failed! - to device, medSignal");
    printf("II transfer");

    //_______________________________________________________________________
    // Shared-memory kernel: launch, time, download, print.
    //_______________________________________________________________________
    XCORR_CUDA_CHECK(cudaEventRecord(start, 0), "cudaEventRecord failed! - start");
    shamem__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, dev_medSignal, dev_medTemplate);
    launchStatus = cudaGetLastError();
    // Record the stop event before any host-side printing so the print is not timed.
    XCORR_CUDA_CHECK(cudaEventRecord(stop, 0), "cudaEventRecord failed! - stop");
    printf("\nCudaLastError:\n");
    printf("%s", cudaGetErrorString(launchStatus));
    printf("\n\n");
    if (launchStatus != cudaSuccess)
    {
        goto Error;
    }
    XCORR_CUDA_CHECK(cudaEventSynchronize(stop), "cudaEventSynchronize failed! - shared-memory kernel");
    XCORR_CUDA_CHECK(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed! - shared-memory kernel");
    XCORR_CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime failed! - shared-memory kernel");
    printf("time[KERNEL_SHAMEM]: %f\n\n", elapsedTime);
    XCORR_CUDA_CHECK(cudaMemcpy(hos_xcorr, dev_xcorr, sizeof(double) * DIM_XCORR, cudaMemcpyDeviceToHost), "cudaMemcpy failed! - to host, xcorr");

#ifdef COMP_SW__PRINT_RESULT_GPU
    for (unsigned int i = 0; i < DIM_XCORR; ++i)
    {
        if (i % 10 == 0) printf("\n");
        printf("%f ", hos_xcorr[i]);
    }
    printf("\n_______________________________________________________________________\n\n");
#endif

    //_______________________________________________________________________
    // CPU reference: compute, time, print.
    //_______________________________________________________________________
    cpu_start = clock();
    CPU_xCorr(hos_xcorr_compare, hos_signal, hos_templa, meanSignal, meanTemplate, DIM_TEMPLATE, DIM_SIGNAL);
    printf("\ntime[CPU]: %f\n", ((double)(clock() - cpu_start)) / CLOCKS_PER_SEC);

#ifdef COMP_SW__PRINT_RESULT_CPU
    for (unsigned int i = 0; i < DIM_XCORR; ++i)
    {
        if (i % 10 == 0) printf("\n");
        printf("%f ", hos_xcorr_compare[i]);
    }
#endif

    // A mismatch of the shared-memory kernel is reported, then the naive kernel is tried.
    if (!xcorrVectorsMatch(hos_xcorr, hos_xcorr_compare, DIM_XCORR, tolerance))
    {
        printf("\n\n\nNOT EQUAL\n\n\n");
    }

    //_______________________________________________________________________
    // Naive kernel: launch, time, download, compare.
    //_______________________________________________________________________
    printf("trying old_ver\n\n");
    XCORR_CUDA_CHECK(cudaEventRecord(start, 0), "cudaEventRecord failed! - start");
    naive__GPU_xCorr_Template_Matching <<< dimGrid, dimBlock >>>(dev_xcorr, dev_signal, dev_templa, meanSignal, meanTemplate, DIM_TEMPLATE, DIM_SIGNAL);
    XCORR_CUDA_CHECK(cudaGetLastError(), "kernel launch failed! - naive kernel");
    XCORR_CUDA_CHECK(cudaEventRecord(stop, 0), "cudaEventRecord failed! - stop");
    XCORR_CUDA_CHECK(cudaEventSynchronize(stop), "cudaEventSynchronize failed! - naive kernel");
    XCORR_CUDA_CHECK(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed! - naive kernel");
    XCORR_CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime failed! - naive kernel");
    printf("time[KERNEL_NAIVE]: %f\n\n", elapsedTime);
    XCORR_CUDA_CHECK(cudaMemcpy(hos_xcorr, dev_xcorr, sizeof(double) * DIM_XCORR, cudaMemcpyDeviceToHost), "cudaMemcpy failed! - to host, xcorr");

    if (!xcorrVectorsMatch(hos_xcorr, hos_xcorr_compare, DIM_XCORR, tolerance))
    {
        printf("\n\n\nNOT EQUAL\n\n\n");
        goto Error;
    }
    printf("\n\n\nRESULTS EQUAL\n\n\n");
    exitCode = EXIT_SUCCESS;
    goto Cleanup;

    //_______________________________________________________________________
    // Error handling and resource release.
    //_______________________________________________________________________
Error:
    printf("Eroare!\n\n");
Cleanup:
    // Host buffers come from malloc() and must be released with free(), not cudaFree().
    free(hos_xcorr);
    printf("freed hos_xcorr!\n\n");
    free(hos_xcorr_compare);
    printf("freed hos_xcorr_compare!\n\n");
    free(hos_signal);
    printf("freed hos_signal!\n\n");
    free(hos_templa);
    printf("freed hos_templa!\n\n");
    // Device pointers start as NULL, so releasing them is safe on every path.
    cudaFree(dev_xcorr);
    printf("freed dev_xcorr!\n\n");
    cudaFree(dev_signal);
    printf("freed dev_signal!\n\n");
    cudaFree(dev_templa);
    printf("freed dev_templa!\n\n");
    cudaFree(dev_medSignal);
    printf("freed dev_medSignal!\n\n");
    cudaFree(dev_medTemplate);
    printf("freed dev_medTemplate!\n\n");
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    return exitCode;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement