Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- omp_set_num_threads(system->getPUCount());
- #pragma omp parallel
- {
- unsigned int cpu_thread_id = omp_get_thread_num();
- unsigned int num_cpu_threads = omp_get_num_threads();
- PU pu = listOfPUs[cpu_thread_id];
- //threads are active until all data is processed
- while (finish_0 < N) {
- //the my_start and my_finish are private to a specific device.
- int my_start = 0;
- int my_finish = 0;
- #pragma omp critical (chunkdetermination_0)
- {
- start_0 = finish_0;
- finish_0 = start_0 + pu.getChunkSize();
- if(finish_0 > N)
- finish_0 = N;
- my_start = start_0;
- my_finish = finish_0;
- }
- if(pu.getType() == GPU) {
- int myN = my_finish-my_start;
- CudaSafeCall(cudaSetDevice(pu.getId()));
- unsigned int nbytes_per_kernel = sizeof(double)*myN;
- //memory allocation
- CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
- CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
- CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
- CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
- //data transfer
- CudaSafeCall(cudaMemcpy(d_a, a+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
- CudaSafeCall(cudaMemcpy(d_c, c+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
- //block and grid values
- dim3 gpu_threads(128);
- dim3 gpu_blocks(myN/gpu_threads.x);
- if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
- //execute kernel
- kernel_0<<<gpu_blocks,gpu_threads>>>( d_a, d_c, myN);
- //data transfer device to host
- CudaSafeCall(cudaMemcpy(c+my_start+1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
- //sycnhronize devices
- CudaSafeCall(cudaDeviceSynchronize());
- // //free device memory
- CudaSafeCall(cudaFree(d_a));
- CudaSafeCall(cudaFree(d_c));
- }
// Hybrid heterogeneous executor: copies array `a` into array `c` element-wise,
// dynamically distributing chunks of the global range [0, N) across all
// processing units (GPUs, host CPU cores, MIC coprocessors) reported by the
// system descriptor. One OpenMP controller thread is spawned per PU; each
// thread repeatedly claims the next chunk and processes it on its device.
//
// Parameters:
//   a - host input array (read only)
//   c - host output array (receives the contents of `a`)
//
// Project symbols used: N (total element count), system (PU registry),
// kernel_0 (GPU copy kernel), CudaSafeCall (CUDA error-check wrapper).
//
// Fixes vs. the pasted original:
//   * restored the function's opening brace (lost in the paste)
//   * restored the '\n'/'\t' escape sequences stripped from printf strings
//   * d_a/d_c were declared before the parallel region and thus shared across
//     controller threads — a data race with 2+ GPUs; now private to the branch
//   * added cudaGetLastError() after the launch so bad launch configurations
//     are not silently ignored
//   * Intel LEO array sections are [start:LENGTH], not [start:end]; the
//     offload clauses now transfer exactly the chunk being processed
void hybrid_function_0 (double *a, double *c)
{
    // start_0/finish_0 delimit the most recently claimed chunk. They are
    // shared by all controller threads and written only inside the named
    // critical section below.
    int start_0 = 0;
    int finish_0 = 0;

    vector<PU> listOfPUs = system->getPUs();
    printf("Num devices = %d\n", system->getPUCount());

    // One controller thread per processing unit. A thread whose PU is a GPU
    // or MIC drives that accelerator; the host-CPU thread processes its
    // chunks with a nested parallel-for.
    omp_set_num_threads(system->getPUCount());
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        PU pu = listOfPUs[cpu_thread_id];

        // Keep claiming chunks until the whole range is exhausted.
        // NOTE(review): this unsynchronized read of the shared finish_0 is a
        // data race under the OpenMP memory model; at worst a thread takes an
        // extra trip and claims an empty chunk, but an atomic read (or moving
        // the test into the critical section) would be cleaner — confirm.
        while (finish_0 < N) {
            // Bounds of the chunk claimed this iteration; private per thread.
            int my_start = 0;
            int my_finish = 0;

            // Chunk claiming is serialized so no two devices ever process the
            // same data.
            #pragma omp critical (chunkdetermination_0)
            {
                start_0 = finish_0;
                finish_0 = start_0 + pu.getChunkSize();
                if (finish_0 > N)
                    finish_0 = N;
                my_start = start_0;
                my_finish = finish_0;
            }

            if (pu.getType() == GPU) {
                int myN = my_finish - my_start;
                printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%lu KB)\n",
                       cpu_thread_id, pu.getId(), pu.getTypeAsString(),
                       my_start, my_finish, sizeof(double) * myN / 1000);

                CudaSafeCall(cudaSetDevice(pu.getId()));
                unsigned int nbytes_per_kernel = sizeof(double) * myN;

                // Device buffers are private to this controller thread (they
                // were shared file/function-scope pointers before — a race).
                // NOTE(review): a malloc/free pair per chunk is expensive;
                // consider one max-chunk-sized allocation per device instead.
                double *d_a = NULL;
                double *d_c = NULL;
                CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
                CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));

                // Host -> device transfer of this chunk.
                // NOTE(review): the +1 offset suggests 1-based host arrays —
                // confirm against the allocation site; the CPU/MIC branches
                // below use the 0-based range [my_start, my_finish).
                CudaSafeCall(cudaMemcpy(d_a, a + my_start + 1, nbytes_per_kernel, cudaMemcpyHostToDevice));
                CudaSafeCall(cudaMemcpy(d_c, c + my_start + 1, nbytes_per_kernel, cudaMemcpyHostToDevice));

                // Launch configuration: 128 threads/block, ceil(myN/128) blocks.
                dim3 gpu_threads(128);
                dim3 gpu_blocks((myN + gpu_threads.x - 1) / gpu_threads.x);

                kernel_0<<<gpu_blocks, gpu_threads>>>(d_a, d_c, myN);
                // Catch launch-configuration errors immediately...
                CudaSafeCall(cudaGetLastError());
                // ...and execution errors before the results are read back.
                CudaSafeCall(cudaDeviceSynchronize());

                // Device -> host transfer of the processed chunk.
                CudaSafeCall(cudaMemcpy(c + my_start + 1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));

                CudaSafeCall(cudaFree(d_a));
                CudaSafeCall(cudaFree(d_c));
            }
            // Execute on the host CPU: nested parallel-for over this chunk
            // using the CPU's core count.
            else if (pu.getType() == CPU) {
                omp_set_num_threads(pu.getNumCores());
                #pragma omp parallel for
                for (int i = my_start; i < my_finish; i++)
                {
                    c[i] = a[i];
                }
            }
            // Execute on a MIC coprocessor via Intel LEO offload. Array
            // sections are [start:length], so the extent is the chunk size.
            else if (pu.getType() == MIC) {
                #pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish-my_start]) in(c[my_start:my_finish-my_start]) out(c[my_start:my_finish-my_start])
                {
                    #pragma omp parallel for
                    for (int i = my_start; i < my_finish; i++)
                    {
                        c[i] = a[i];
                    }
                }
            }
        }
    }
}
Add Comment
Please, Sign In to add comment