Advertisement
phystota

temp

Dec 6th, 2024
27
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 1.09 KB | None | 0 0
  // Multi-stream pipeline: the batch is split into NUM_STREAMS chunks and, for
  // each chunk, the host-to-device copy, the convolution launch and the
  // device-to-host copy are queued on that chunk's own stream. Operations in
  // one stream run in issue order, so each chunk's copy-in -> compute ->
  // copy-out chain stays ordered while different chunks are free to overlap.
  #define NUM_STREAMS 4 // Number of CUDA streams

  // Report a failed CUDA runtime call and abort instead of silently carrying on
  // with a broken context. Uses fprintf/abort, so <cstdio> and <cstdlib> must be
  // in scope. An existing project-wide CUDA_CHECK takes precedence.
  #ifndef CUDA_CHECK
  #define CUDA_CHECK(call)                                                     \
      do {                                                                     \
          cudaError_t cuda_check_err_ = (call);                                \
          if (cuda_check_err_ != cudaSuccess) {                                \
              fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                      cudaGetErrorString(cuda_check_err_));                    \
              abort();                                                         \
          }                                                                    \
      } while (0)
  #endif

  cudaStream_t streams[NUM_STREAMS];

  // Create CUDA streams
  for (int i = 0; i < NUM_STREAMS; i++) {
      CUDA_CHECK(cudaStreamCreate(&streams[i]));
  }

  // Divide Batch into chunks and assign to streams
  for (int i = 0; i < NUM_STREAMS; i++) {
      // Calculate batch_start, batch_size, offsets, etc.
      // NOTE(review): cudaMemcpyAsync takes its size in BYTES, while the
      // offsets below are applied through pointer arithmetic (i.e. in
      // elements) -- confirm the *_chunk_size and *_offset values use the
      // matching units.
      // NOTE(review): the host buffers are presumably page-locked (per their
      // "_pinned" names); pageable memory would prevent the copies from
      // overlapping with other work -- verify at the allocation site.

      // Asynchronously copy input data to device
      CUDA_CHECK(cudaMemcpyAsync(*device_input_ptr + input_offset,
                                 host_input_pinned + input_offset,
                                 input_chunk_size, cudaMemcpyHostToDevice,
                                 streams[i]));

      // Launch kernels in the stream
      conv_forward_gpu_part(*device_output_ptr, *device_input_ptr, *device_mask_ptr,
                            batch_start, batch_size, Map_out, Channel, Height, Width, K, streams[i]);
      // Launches do not return a status; a bad launch configuration only
      // shows up through cudaGetLastError().
      CUDA_CHECK(cudaGetLastError());

      // Asynchronously copy output data back to host
      CUDA_CHECK(cudaMemcpyAsync(host_output_pinned + output_offset,
                                 *device_output_ptr + output_offset,
                                 output_chunk_size, cudaMemcpyDeviceToHost,
                                 streams[i]));
  }

  // Synchronize all streams, then release them
  for (int i = 0; i < NUM_STREAMS; i++) {
      // Blocks until everything queued on this stream has finished; errors
      // raised while the queued work was executing are reported here.
      CUDA_CHECK(cudaStreamSynchronize(streams[i]));
      // The streams are not used again in this snippet, so free them here.
      // NOTE(review): drop this call if later code still queues work on them.
      CUDA_CHECK(cudaStreamDestroy(streams[i]));
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement