Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- omp_set_num_threads(system->getPUCount());
- #pragma omp parallel
- {
- unsigned int cpu_thread_id = omp_get_thread_num();
- unsigned int num_cpu_threads = omp_get_num_threads();
- PU pu = listOfPUs[cpu_thread_id];
- //threads are active until all data is processed
- while (finish_0 < N) {
- //the my_start and my_finish are private to a specific device.
- int my_start = 0;
- int my_finish = 0;
- #pragma omp critical (chunkdetermination_0)
- {
- start_0 = finish_0;
- finish_0 = start_0 + pu.getChunkSize();
- if(finish_0 > N)
- finish_0 = N;
- my_start = start_0;
- my_finish = finish_0;
- }
- if(pu.getType() == GPU) {
- int myN = my_finish-my_start;
- CudaSafeCall(cudaSetDevice(pu.getId()));
- unsigned int nbytes_per_kernel = sizeof(double)*myN;
- //memory allocation
- CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
- CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
- CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
- CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
- //data transfer
- CudaSafeCall(cudaMemcpy(d_a, a+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
- CudaSafeCall(cudaMemcpy(d_c, c+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
- //block and grid values
- dim3 gpu_threads(128);
- dim3 gpu_blocks(myN/gpu_threads.x);
- if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
- //execute kernel
- kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
- //data transfer device to host
- CudaSafeCall(cudaMemcpy(c+my_start+1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
- //sycnhronize devices
- CudaSafeCall(cudaDeviceSynchronize());
- // //free device memory
- CudaSafeCall(cudaFree(d_a));
- CudaSafeCall(cudaFree(d_c));
- }
// Hybrid heterogeneous executor: copies array `a` into array `c` element-wise,
// dynamically distributing chunks of the global range [0, N) across all
// processing units (GPUs, host CPU cores, MIC coprocessors) reported by the
// system descriptor. One OpenMP controller thread is spawned per PU; each
// thread repeatedly claims the next chunk and processes it on its device.
//
// Parameters:
//   a - host input array (read only)
//   c - host output array (receives the contents of `a`)
//
// Project symbols used: N (total element count), system (PU registry),
// kernel_0 (GPU copy kernel), CudaSafeCall (CUDA error-check wrapper).
//
// Fixes vs. the pasted original:
//   * restored the function's opening brace (lost in the paste)
//   * restored the '\n'/'\t' escape sequences stripped from printf strings
//   * d_a/d_c were declared before the parallel region and thus shared across
//     controller threads — a data race with 2+ GPUs; now private to the branch
//   * added cudaGetLastError() after the launch so bad launch configurations
//     are not silently ignored
//   * Intel LEO array sections are [start:LENGTH], not [start:end]; the
//     offload clauses now transfer exactly the chunk being processed
void hybrid_function_0 (double *a, double *c)
{
    // start_0/finish_0 delimit the most recently claimed chunk. They are
    // shared by all controller threads and written only inside the named
    // critical section below.
    int start_0 = 0;
    int finish_0 = 0;

    vector<PU> listOfPUs = system->getPUs();
    printf("Num devices = %d\n", system->getPUCount());

    // One controller thread per processing unit. A thread whose PU is a GPU
    // or MIC drives that accelerator; the host-CPU thread processes its
    // chunks with a nested parallel-for.
    omp_set_num_threads(system->getPUCount());
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        PU pu = listOfPUs[cpu_thread_id];

        // Keep claiming chunks until the whole range is exhausted.
        // NOTE(review): this unsynchronized read of the shared finish_0 is a
        // data race under the OpenMP memory model; at worst a thread takes an
        // extra trip and claims an empty chunk, but an atomic read (or moving
        // the test into the critical section) would be cleaner — confirm.
        while (finish_0 < N) {
            // Bounds of the chunk claimed this iteration; private per thread.
            int my_start = 0;
            int my_finish = 0;

            // Chunk claiming is serialized so no two devices ever process the
            // same data.
            #pragma omp critical (chunkdetermination_0)
            {
                start_0 = finish_0;
                finish_0 = start_0 + pu.getChunkSize();
                if (finish_0 > N)
                    finish_0 = N;
                my_start = start_0;
                my_finish = finish_0;
            }

            if (pu.getType() == GPU) {
                int myN = my_finish - my_start;
                printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n",
                       cpu_thread_id, pu.getId(), pu.getTypeAsString(),
                       my_start, my_finish, sizeof(double) * myN / 1000);

                CudaSafeCall(cudaSetDevice(pu.getId()));
                unsigned int nbytes_per_kernel = sizeof(double) * myN;

                // Device buffers are private to this controller thread (they
                // were shared file/function-scope pointers before — a race).
                // NOTE(review): a malloc/free pair per chunk is expensive;
                // consider one max-chunk-sized allocation per device instead.
                double *d_a = NULL;
                double *d_c = NULL;
                CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
                CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));

                // Host -> device transfer of this chunk.
                // NOTE(review): the +1 offset suggests 1-based host arrays —
                // confirm against the allocation site; the CPU/MIC branches
                // below use the 0-based range [my_start, my_finish).
                CudaSafeCall(cudaMemcpy(d_a, a + my_start + 1, nbytes_per_kernel, cudaMemcpyHostToDevice));
                CudaSafeCall(cudaMemcpy(d_c, c + my_start + 1, nbytes_per_kernel, cudaMemcpyHostToDevice));

                // Launch configuration: 128 threads/block, ceil(myN/128) blocks.
                dim3 gpu_threads(128);
                dim3 gpu_blocks((myN + gpu_threads.x - 1) / gpu_threads.x);

                kernel_0<<<gpu_blocks, gpu_threads>>>(d_a, d_c, myN);
                // Catch launch-configuration errors immediately...
                CudaSafeCall(cudaGetLastError());
                // ...and execution errors before the results are read back.
                CudaSafeCall(cudaDeviceSynchronize());

                // Device -> host transfer of the processed chunk.
                CudaSafeCall(cudaMemcpy(c + my_start + 1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));

                CudaSafeCall(cudaFree(d_a));
                CudaSafeCall(cudaFree(d_c));
            }
            // Execute on the host CPU: nested parallel-for over this chunk
            // using the CPU's core count.
            else if (pu.getType() == CPU) {
                omp_set_num_threads(pu.getNumCores());
                #pragma omp parallel for
                for (int i = my_start; i < my_finish; i++)
                {
                    c[i] = a[i];
                }
            }
            // Execute on a MIC coprocessor via Intel LEO offload. Array
            // sections are [start:length], so the extent is the chunk size.
            else if (pu.getType() == MIC) {
                #pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish-my_start]) in(c[my_start:my_finish-my_start]) out(c[my_start:my_finish-my_start])
                {
                    #pragma omp parallel for
                    for (int i = my_start; i < my_finish; i++)
                    {
                        c[i] = a[i];
                    }
                }
            }
        }
    }
}
Add Comment
Please, Sign In to add comment