daily pastebin goal
7%
SHARE
TWEET

Untitled

a guest Jun 14th, 2018 51 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
// NOTE(review): this span is a truncated duplicate of the interior of
// hybrid_function_0 (the complete function appears below) — a paste
// artifact. It is not compilable on its own: the enclosing function header
// is missing and the while/parallel scopes never close. Kept byte-for-byte;
// see the full function below for the reviewed version.
  1. omp_set_num_threads(system->getPUCount());
  2. #pragma omp parallel
  3. {
  4.     unsigned int cpu_thread_id = omp_get_thread_num();
  5.     unsigned int num_cpu_threads = omp_get_num_threads();
  6.    
  7. PU pu = listOfPUs[cpu_thread_id];
  8.  
  9.     //threads are active until all data is processed
  10.     while (finish_0 < N) {
  11.         //the my_start and my_finish are private to a specific device.
  12.         int my_start = 0;
  13.         int my_finish = 0;
  14.    
  15. #pragma omp critical (chunkdetermination_0)
  16. {
  17.     start_0 = finish_0;
  18.     finish_0 = start_0 + pu.getChunkSize();
  19.  
  20.     if(finish_0 > N)
  21.         finish_0 = N;
  22.  
  23.     my_start = start_0;
  24.     my_finish = finish_0;
  25. }
  26.    
  27. if(pu.getType() == GPU) {
  28.  
  29.             int myN = my_finish-my_start;
  30.  
  31.             CudaSafeCall(cudaSetDevice(pu.getId()));
  32.  
  33.             unsigned int nbytes_per_kernel = sizeof(double)*myN;
  34.  
  35.             //memory allocation
  36.             CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
  37.             CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
  38.  
  39.             CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
  40.             CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
  41.             //data transfer
  42.             CudaSafeCall(cudaMemcpy(d_a, a+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
  43.             CudaSafeCall(cudaMemcpy(d_c, c+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
  44.  
  45.  
  46.             //block and grid values
  47.             dim3 gpu_threads(128);
  48.             dim3 gpu_blocks(myN/gpu_threads.x);
  49.             if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
  50.  
  51.             //execute kernel
  52.             kernel_0<<<gpu_blocks,gpu_threads>>>( d_a,  d_c, myN);
  53.  
  54.             //data transfer device to host
  55.             CudaSafeCall(cudaMemcpy(c+my_start+1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
  56.  
  57.             //synchronize devices
  58.             CudaSafeCall(cudaDeviceSynchronize());
  59.  
  60.             // //free device memory
  61.             CudaSafeCall(cudaFree(d_a));
  62.             CudaSafeCall(cudaFree(d_c));
  63.         }
  64.    
  65. void hybrid_function_0 (double *a, double *c)
  66.    
  67. //device variable declarations
  68. double *d_a;
  69. double *d_c;
  70.  
  71. //start and finish determine the chunk size of a device
  72. int start_0     = 0;
  73. int finish_0    = 0;
  74.  
  75. vector<PU> listOfPUs = system->getPUs();
  76.  
  77. printf("Num devices = %dn", system->getPUCount());
  78. omp_set_num_threads(system->getPUCount());
  79. //one thread of the CPU controls one GPU device. The rest of CPU threads will be used to process data.
  80. #pragma omp parallel
  81. {
  82.     unsigned int cpu_thread_id = omp_get_thread_num();
  83.     unsigned int num_cpu_threads = omp_get_num_threads();
  84.  
  85.     PU pu = listOfPUs[cpu_thread_id];
  86.  
  87.     //threads are active until all data is processed
  88.     while (finish_0 < N) {
  89.         //the my_start and my_finish are private to a specific device.
  90.         int my_start = 0;
  91.         int my_finish = 0;
  92.  
  93.         //the determination of chunks should be performed sequentially, in order to avoid two or more devices processing the same data.
  94.         #pragma omp critical (chunkdetermination_0)
  95.         {
  96.             start_0 = finish_0;
  97.             finish_0 = start_0 + pu.getChunkSize();
  98.  
  99.             if(finish_0 > N)
  100.                 finish_0 = N;
  101.  
  102.             my_start = start_0;
  103.             my_finish = finish_0;
  104.         }
  105.  
  106.         //devices with id less than nDevices are GPU devices. The host CPU has id = nDevices
  107.         if(pu.getType() == GPU) {
  108.  
  109.             int myN = my_finish-my_start;
  110.  
  111.             printf("device_idt%dtpu_idt%dttypet%stprocessingt%d-%d (%lu KB)n", cpu_thread_id, pu.getId(), pu.getTypeAsString(), my_start, my_finish, sizeof(double)*myN/1000);
  112.             CudaSafeCall(cudaSetDevice(pu.getId()));
  113.  
  114.             unsigned int nbytes_per_kernel = sizeof(double)*myN;
  115.  
  116.             //memory allocation
  117.             CudaSafeCall(cudaMalloc((void**)&d_a, nbytes_per_kernel));
  118.             CudaSafeCall(cudaMalloc((void**)&d_c, nbytes_per_kernel));
  119.  
  120.             CudaSafeCall(cudaMemset(d_a, 0, nbytes_per_kernel));
  121.             CudaSafeCall(cudaMemset(d_c, 0, nbytes_per_kernel));
  122.             //data transfer
  123.             CudaSafeCall(cudaMemcpy(d_a, a+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
  124.             CudaSafeCall(cudaMemcpy(d_c, c+my_start+1, nbytes_per_kernel, cudaMemcpyHostToDevice));
  125.  
  126.  
  127.             //block and grid values
  128.             dim3 gpu_threads(128);
  129.             dim3 gpu_blocks(myN/gpu_threads.x);
  130.             if( myN % gpu_threads.x != 0 ) gpu_blocks.x+=1;
  131.  
  132.             //execute kernel
  133.             kernel_0<<<gpu_blocks,gpu_threads>>>( d_a,  d_c, myN);
  134.  
  135.             //data transfer device to host
  136.             CudaSafeCall(cudaMemcpy(c+my_start+1, d_c, nbytes_per_kernel, cudaMemcpyDeviceToHost));
  137.  
  138.             //sycnhronize devices
  139.             CudaSafeCall(cudaDeviceSynchronize());
  140.  
  141.             // //free device memory
  142.             CudaSafeCall(cudaFree(d_a));
  143.             CudaSafeCall(cudaFree(d_c));
  144.         }
  145.         //execute on host
  146.         else if (pu.getType() == CPU) {
  147.             omp_set_num_threads(pu.getNumCores());
  148.             #pragma omp parallel for
  149.             for (int  i = my_start; i < my_finish; i++)
  150.             {
  151.                 c[i] = a[i];
  152.             }
  153.         }
  154.         //execute on MIC
  155.         else if (pu.getType() == MIC) {
  156.             #pragma offload target(mic: cpu_thread_id) in(a[my_start:my_finish]) in(c[my_start:my_finish])  out(c[my_start:my_finish])
  157.             {
  158.                 #pragma omp parallel for
  159.                 for (int  i = my_start; i < my_finish; i++)
  160.                 {
  161.                     c[i] = a[i];
  162.                 }
  163.             }
  164.         }
  165.     }
  166. }
  167. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top