Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cuda_runtime.h>
- #include <cstdio>
- #include <cstdlib>
- #include <ctime>
- #include "../../../../Desktop/code/GPUTimer.h"
// Host-side element-wise sum: out[k] = in[k] + in2[k] for k in [0, SIZE).
//
// in, in2 : input arrays, each read at indices 0 .. SIZE-1
// out     : destination array, written at indices 0 .. SIZE-1
// SIZE    : number of elements to process; 0 leaves out untouched
void sumaVector(const float in[], const float* in2, float* out, size_t SIZE) {
    const float* lhs = in;
    const float* rhs = in2;
    float* dst = out;

    // Walk the three arrays in lock-step, front to back.
    for (size_t remaining = SIZE; remaining > 0; --remaining) {
        *dst++ = *lhs++ + *rhs++;
    }
}
// Device kernel: element-wise sum, out[i] = in[i] + in2[i].
//
// Launch layout: 1-D grid of 1-D blocks, one element per thread. The grid is
// allowed to contain more threads than SIZE (this happens whenever the block
// count is rounded up); threads whose index falls at or beyond SIZE do nothing.
// Uses no shared memory.
//
// in, in2 : input buffers, read at indices 0 .. SIZE-1
// out     : output buffer, written at indices 0 .. SIZE-1
// SIZE    : number of elements to process
__global__ void sumaVectorG(const float* in, const float* in2, float* out, size_t SIZE) {
    // Widen before multiplying so the product is not computed (and possibly
    // wrapped) in 32-bit unsigned arithmetic.
    const size_t i = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

    // Tail guard: without it the surplus threads of the last block would read
    // and write past the end of the buffers.
    if (i < SIZE) {
        out[i] = in[i] + in2[i];
    }
}
// CPU reference path used by the benchmark: forwards its arguments to
// sumaVector, which stores in[k] + in2[k] into out[k] for the first SIZE
// elements.
void sumaVectorCPU(float* in, float* in2, float* out, size_t SIZE) {
    sumaVector(in, in2, out, SIZE);

    // Debug aid, disabled: prints every result next to its two operands.
    /*
    printf("Resultados en CPU\n");
    for (size_t k = 0; k < SIZE; ++k) {
        printf("h_out[%zu]= %.f ------ h_in[%zu]= %.f--------h_in2[%zu]= %.f\n",
               k, out[k], k, in[k], k, in2[k]);
    }
    */
}
// Terminates the program with a diagnostic when a CUDA runtime call did not
// return cudaSuccess. `what` names the step that was being attempted.
static void sumaVectorCheckCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes out[i] = in[i] + in2[i] on the GPU for i in [0, SIZE).
//
// Allocates three device buffers of BYTES bytes, copies both inputs to the
// device, launches sumaVectorG, copies the result back into `out`, and frees
// the device buffers. Any CUDA failure is reported on stderr and ends the
// program.
//
// in, in2 : host input arrays of at least SIZE floats
// out     : host output array of at least SIZE floats
// SIZE    : number of elements; 0 is a no-op
// BYTES   : size in bytes of each buffer; must cover SIZE floats
void sumaVectorGPU(const float* in, const float* in2, float* out, size_t SIZE, size_t BYTES) {
    // Nothing to compute, and a launch with zero blocks is an invalid
    // configuration, so bail out before touching the device.
    if (SIZE == 0) {
        return;
    }

    // The kernel touches SIZE floats in buffers that are BYTES long.
    if (BYTES / sizeof(float) < SIZE) {
        fprintf(stderr, "sumaVectorGPU: BYTES is too small for SIZE elements\n");
        exit(EXIT_FAILURE);
    }

    float* d_in = NULL;
    float* d_in2 = NULL;
    float* d_out = NULL;

    sumaVectorCheckCuda(cudaMalloc(&d_in, BYTES), "cudaMalloc(d_in)");
    sumaVectorCheckCuda(cudaMalloc(&d_in2, BYTES), "cudaMalloc(d_in2)");
    sumaVectorCheckCuda(cudaMalloc(&d_out, BYTES), "cudaMalloc(d_out)");

    sumaVectorCheckCuda(cudaMemcpy(d_in, in, BYTES, cudaMemcpyHostToDevice),
                        "host-to-device copy of in");
    sumaVectorCheckCuda(cudaMemcpy(d_in2, in2, BYTES, cudaMemcpyHostToDevice),
                        "host-to-device copy of in2");

    // Block size is a multiple of the 32-thread warp size.
    const size_t THREADS = 256;
    // Integer ceiling division; avoids float rounding and cannot overflow.
    const size_t BLOCKS = SIZE / THREADS + (SIZE % THREADS != 0 ? 1 : 0);

    sumaVectorG<<<static_cast<unsigned int>(BLOCKS), static_cast<unsigned int>(THREADS)>>>(
        d_in, d_in2, d_out, SIZE);
    // A kernel launch returns nothing; configuration errors surface here.
    sumaVectorCheckCuda(cudaGetLastError(), "sumaVectorG launch");

    // Blocking copy: waits for the kernel and reports any execution error.
    sumaVectorCheckCuda(cudaMemcpy(out, d_out, BYTES, cudaMemcpyDeviceToHost),
                        "device-to-host copy of out");

    sumaVectorCheckCuda(cudaFree(d_in), "cudaFree(d_in)");
    sumaVectorCheckCuda(cudaFree(d_in2), "cudaFree(d_in2)");
    sumaVectorCheckCuda(cudaFree(d_out), "cudaFree(d_out)");
}
// Returns true when the first SIZE elements of CPU and GPU are pairwise equal
// under exact float comparison, false at the first mismatch. SIZE == 0 yields
// true.
bool compare(const float* CPU, const float* GPU, const size_t SIZE) {
    size_t idx = 0;

    // Advance while the elements agree; stops early on the first difference.
    while (idx < SIZE && CPU[idx] == GPU[idx]) {
        ++idx;
    }

    // Reaching the end means no mismatch was found.
    return idx == SIZE;
}
// Benchmark driver: fills two host vectors, runs the CPU and GPU vector sums
// TRIALS times each, prints the average time per call for both, and reports
// whether the two results match.
//
// Returns 0 after a completed run, EXIT_FAILURE if host allocation fails.
int main()
{
    const size_t SIZE = 64;
    const size_t BYTES = sizeof(float) * SIZE;
    const long long TRIALS = 100000000;

    float* h_in = (float*)malloc(BYTES);
    float* h_in2 = (float*)malloc(BYTES);
    float* h_out = (float*)malloc(BYTES);
    float* h_outD = (float*)malloc(BYTES);

    // Any of the allocations may fail; free(NULL) is a no-op, so releasing
    // all four unconditionally is safe.
    if (h_in == NULL || h_in2 == NULL || h_out == NULL || h_outD == NULL) {
        fprintf(stderr, "Host allocation of %llu bytes failed\n", (unsigned long long)BYTES);
        free(h_in);
        free(h_in2);
        free(h_out);
        free(h_outD);
        return EXIT_FAILURE;
    }

    // rand() is never seeded here, so the inputs are the same on every run.
    for (size_t i = 0; i < SIZE; i++)
    {
        h_in[i] = static_cast<float>(i);
        h_in2[i] = static_cast<float>(i + rand() % 20);
    }

    // CPU timing: processor time over all trials, averaged per call (seconds).
    clock_t t = clock();
    for (long long trial = 0; trial < TRIALS; ++trial) {
        sumaVectorCPU(h_in, h_in2, h_out, SIZE);
    }
    t = clock() - t;
    const double cpuAverage =
        static_cast<double>(t) / (static_cast<double>(TRIALS) * CLOCKS_PER_SEC);
    printf("CPU - average time elapsed: %f\n", cpuAverage);

    // GPU timing: each sumaVectorGPU call includes host/device transfers as
    // well as the kernel, so this measures the whole round trip.
    // NOTE(review): the /1000 presumably converts milliseconds from
    // GpuTimer::Elapsed() to seconds - confirm against GPUTimer.h.
    GpuTimer timer;
    timer.Start();
    for (long long trial = 0; trial < TRIALS; ++trial) {
        sumaVectorGPU(h_in, h_in2, h_outD, SIZE, BYTES);
    }
    timer.Stop();
    printf("GPU- average time elapsed: %f\n", timer.Elapsed() / (TRIALS * 1000));

    if (compare(h_out, h_outD, SIZE)) {
        printf("Ha sido todo un exito\n");
    }
    else {
        printf("Ha sido todo un FRACASO........ABSOLUTOOOO\n");
    }

    free(h_in);
    free(h_in2);
    free(h_out);
    free(h_outD);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement