//headers needed by this excerpt; the PU/system runtime, N, and the
//GPU/CPU/MIC type constants are assumed to be declared by the surrounding project
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <omp.h>
#include <cuda_runtime.h>

using std::vector;
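
//the paste does not define CudaSafeCall; a minimal sketch of the usual
//error-checking wrapper pattern it presumably follows (an assumption, not
//the original definition):
#define CudaSafeCall(call)                                              \
    do {                                                                \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                    cudaGetErrorString(err), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

//kernel_0 is also not included in the paste; a sketch consistent with the
//CPU fallback below (c[i] = a[i]) would be:
__global__ void kernel_0(double *d_a, double *d_c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                      //guard threads in the rounded-up grid
        d_c[i] = d_a[i];
}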

void hybrid_function_0 (double *a, double *c)
{
    //device variable declarations
    double *d_a;
    double *d_c;

    //start and finish determine the chunk size of a device
    int start_0 = 0;
    int finish_0 = 0;

    vector<PU> listOfPUs = system->getPUs();

    printf("Num devices = %d\n", system->getPUCount());
    omp_set_num_threads(system->getPUCount());
    //one CPU thread controls one GPU device; the rest of the CPU threads are used to process data.
    //d_a and d_c are listed as private so that concurrent device threads do not race on the pointers
    #pragma omp parallel private(d_a, d_c)
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        PU pu = listOfPUs[cpu_thread_id];

        //threads are active until all data is processed
        while (finish_0 < N) {
            //my_start and my_finish are private to a specific device
            int my_start = 0;
            int my_finish = 0;

            //the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data
            #pragma omp critical (chunkdetermination_0)
            {
                start_0 = finish_0;
                finish_0 = start_0 + pu.getChunkSize();

                if (finish_0 > N)
                    finish_0 = N;

                my_start = start_0;
                my_finish = finish_0;
            }
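
            //example (assuming every PU reports a chunk size of 256 and N = 1000):
            //successive threads claim [0,256), [256,512), [512,768), then [768,1000);
            //the last chunk is clamped to N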

            //devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
            if (pu.getType() == GPU) {

                int myN = my_finish - my_start;

                printf("device_id\t%d\tpu_id\t%d\ttype\t%s\tprocessing\t%d-%d (%zu KB)\n",
                       cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish,
                       sizeof(double)*myN/1000);
                CudaSafeCall(cudaSetDevice(pu.getId()));

                unsigned int nbytes_per_kernel = sizeof(double)*myN;

                //memory allocation
                CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
                CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));

                CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
                CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));

                //data transfer
                CudaSafeCall(cudaMemcpy(d_a, a+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
                CudaSafeCall(cudaMemcpy(d_c, c+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));

                //block and grid values
                dim3 gpu_threads(128);
                dim3 gpu_blocks(myN/gpu_threads.x);
                if (myN % gpu_threads.x != 0) gpu_blocks.x += 1;
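                //the two lines above compute ceil(myN / 128.0) blocks, so the grid covers every element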

                //execute kernel
                kernel_0<<<gpu_blocks, gpu_threads>>>(d_a, d_c, myN);

                //data transfer device to host
                CudaSafeCall(cudaMemcpy(c+my_start+1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));

                //synchronize the device
                CudaSafeCall(cudaDeviceSynchronize());

                //free device memory
                CudaSafeCall(cudaFree(d_a));
                CudaSafeCall(cudaFree(d_c));
            }
            //execute on host
            else if (pu.getType() == CPU) {
                omp_set_num_threads(pu.getNumCores());
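                //note: this inner region is nested inside the outer parallel region;
                //it only fans out to multiple threads if nested parallelism is enabled
                //(e.g., omp_set_nested(1) or OMP_NESTED=true)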
                #pragma omp parallel for
                for (int i = my_start; i < my_finish; i++)
                {
                    c[i] = a[i];
                }
            }
            //execute on MIC
            else if (pu.getType() == MIC) {
                //Intel LEO array sections are [start : length], so the section length is my_finish-my_start
                #pragma offload target(mic: cpu_thread_id) in(a[my_start : my_finish-my_start]) in(c[my_start : my_finish-my_start]) out(c[my_start : my_finish-my_start])
                {
                    #pragma omp parallel for
                    for (int i = my_start; i < my_finish; i++)
                    {
                        c[i] = a[i];
                    }
                }
            }
        }
    }
}
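
//a minimal, hypothetical driver for this excerpt (not part of the original paste);
//it assumes N and the system/PU runtime are initialized elsewhere by the framework
int main()
{
    //the GPU path reads from a+my_start+1, so one extra trailing element is allocated
    double *a = new double[N + 1];
    double *c = new double[N + 1];
    for (int i = 0; i <= N; i++) a[i] = (double)i;

    hybrid_function_0(a, c);

    delete[] a;
    delete[] c;
    return 0;
}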