Advertisement
Guest User

Untitled

a guest
Feb 27th, 2017
241
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 19.50 KB | None | 0 0
  1. I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
  2. I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
  3. I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
  4. I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
  5. I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
  6. ==91215== NVPROF is profiling process 91215, command: python alexnet_benchmark.py
  7. I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
  8. name: TITAN X (Pascal)
  9. major: 6 minor: 1 memoryClockRate (GHz) 1.531
  10. pciBusID 0000:04:00.0
  11. Total memory: 11.90GiB
  12. Free memory: 11.74GiB
  13. W tensorflow/stream_executor/cuda/cuda_driver.cc:590] creating context when one is currently active; existing: 0x2f1e9b0
  14. I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 1 with properties:
  15. name: Tesla K40c
  16. major: 3 minor: 5 memoryClockRate (GHz) 0.745
  17. pciBusID 0000:84:00.0
  18. Total memory: 11.17GiB
  19. Free memory: 11.09GiB
  20. I tensorflow/core/common_runtime/gpu/gpu_device.cc:777] Peer access not supported between device ordinals 0 and 1
  21. I tensorflow/core/common_runtime/gpu/gpu_device.cc:777] Peer access not supported between device ordinals 1 and 0
  22. I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 1
  23. I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y N
  24. I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 1: N Y
  25. I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:04:00.0)
  26. I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40c, pci bus id: 0000:84:00.0)
  27. ==91215== Profiling application: python alexnet_benchmark.py
  28. ==91215== Profiling result:
  29. Time(%) Time Calls Avg Min Max Name
  30. 18.90% 1.60256s 1448 1.1067ms 813.35us 6.0754ms cudnn_maxwell_gcgemm_64x64_tn_batched
  31. 13.27% 1.12468s 5170 217.54us 27.170us 585.95us void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<float, int=32, int=8>(float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<float, int=32, int=8>*)
  32. 8.95% 758.56ms 2438 311.14us 196.27us 565.62us void fft2d_r2c_16x16<float, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int)
  33. 5.50% 465.84ms 221 2.1079ms 2.0377ms 2.6011ms maxwell_scudnn_128x64_large_nn
  34. 5.01% 424.98ms 902 471.16us 240.65us 2.1123ms void fft2d_r2c_32x32<float, bool=1>(float2*, float const *, int, int, int, int, int, int, int, int, int)
  35. 4.31% 365.17ms 1332 274.16us 17.344us 1.5425ms void flip_filter<float, float>(float*, float const *, int, int, int, int)
  36. 4.12% 348.94ms 1100 317.22us 133.13us 671.80us void tensorflow::BiasNHWCKernel<float>(int, float const *, float const , tensorflow::BiasNHWCKernel<float>*, int)
  37. 4.06% 344.01ms 112 3.0715ms 2.8623ms 3.3058ms maxwell_scudnn_128x64_stridedB_splitK_medium_nn
  38. 4.01% 340.27ms 1219 279.14us 176.10us 473.81us void fft2d_c2r_16x16<float, bool=0, bool=1>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float)
  39. 3.76% 318.48ms 1100 289.53us 126.09us 593.15us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, long>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const , float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const > const , Eigen::GpuDevice>, long>(float, int=1)
  40. 2.92% 247.11ms 330 748.82us 725.34us 908.01us void tensorflow::functor::PadInputCustomKernelNHWC<float, int=4>(int, float const *, tensorflow::functor::Dimension<int=4>, tensorflow::functor::PadInputCustomKernelNHWC<float, int=4>*, float const *, float const *)
  41. 2.88% 244.24ms 451 541.55us 208.84us 1.3232ms void fft2d_c2r_32x32<float, bool=0, bool=1>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float)
  42. 2.83% 240.06ms 550 436.47us 186.57us 907.11us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, long>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_product_op<float const , float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorConversionOp<float, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_cmp_op<float const , float const , Eigen::internal::ComparisonName>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const , Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const >, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, long>, int=16, Eigen::MakePointer> const > const > const > const > const > const , Eigen::GpuDevice>, long>(float, int=1)
  43. 2.67% 226.48ms 330 686.30us 231.40us 1.1047ms void cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=1>>(cudnnTensorStruct, float const *, cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=1>>, float const , cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=1>>, float const , cudnn::detail::pooling_bw_kernel_max<float, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=1>>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float)
  44. 2.65% 224.91ms 116 1.9389ms 1.7271ms 3.1765ms maxwell_scudnn_winograd_128x128_tile228n_nt
  45. 2.56% 217.34ms 660 329.30us 108.58us 614.65us void tensorflow::_GLOBAL__N__66_tmpxft_0000200b_00000000_9_maxpooling_op_gpu_cu_compute_52_cpp1_ii_af505f36::MaxPoolForwardNHWC<float>(int, float const *, int, int, int, int, int, int, int, int, int, int, int, tensorflow::_GLOBAL__N__66_tmpxft_0000200b_00000000_9_maxpooling_op_gpu_cu_compute_52_cpp1_ii_af505f36::MaxPoolForwardNHWC<float>*, __int64*)
  46. 2.46% 208.23ms 330 631.00us 612.86us 723.01us void tensorflow::functor::SwapDimension1And2InTensor3<float>(int, float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension1And2InTensor3<float>*)
  47. 2.00% 169.85ms 222 765.08us 735.14us 920.33us cudnn_maxwell_cgemm_64x64_tn_batched
  48. 1.69% 143.55ms 1210 118.63us 864ns 363.12us [CUDA memcpy DtoH]
  49. 1.47% 124.57ms 550 226.49us 92.516us 506.90us void tensorflow::BiasGradNHWC_SharedAtomics<float>(int, float const *, tensorflow::BiasGradNHWC_SharedAtomics<float>*, int)
  50. 0.77% 65.374ms 2090 31.279us 3.0720us 89.924us void tensorflow::functor::SwapDimension0And2InTensor3<float>(int, float const *, tensorflow::functor::Dimension<int=3>, tensorflow::functor::SwapDimension0And2InTensor3<float>*)
  51. 0.76% 64.379ms 334 192.75us 63.939us 277.23us void setTensor4d_kernel<float, float, int=16, int=16>(cudnnTensor4dStruct, float*, float)
  52. 0.37% 30.965ms 4 7.7412ms 5.3534ms 9.8128ms void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1>*, kernel_grad_params, int, int, float, int)
  53. 0.28% 23.488ms 3 7.8293ms 4.6348ms 12.187ms void cudnn::detail::implicit_convolve_sgemm<float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0>*, float const *, kernel_conv_params, int, float, float)
  54. 0.25% 21.196ms 3 7.0652ms 4.1175ms 10.374ms void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int)
  55. 0.22% 18.361ms 6 3.0601ms 2.5114ms 3.7605ms maxwell_scudnn_128x128_stridedB_splitK_small_nn
  56. 0.15% 12.868ms 2 6.4340ms 6.3960ms 6.4719ms maxwell_scudnn_128x64_stridedB_splitK_small_nn
  57. 0.13% 11.059ms 3 3.6863ms 3.0853ms 4.5920ms maxwell_scudnn_128x128_small_nn
  58. 0.13% 10.793ms 1 10.793ms 10.793ms 10.793ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=0>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=0>*, kernel_conv_params, int, int, float, float)
  59. 0.11% 9.4712ms 2 4.7356ms 3.2551ms 6.2161ms maxwell_scudnn_128x64_stridedB_small_nn
  60. 0.11% 9.1170ms 2 4.5585ms 3.5786ms 5.5384ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float)
  61. 0.09% 7.5156ms 5 1.5031ms 586.30us 2.9270ms void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const *, float*, int)
  62. 0.08% 7.1280ms 1 7.1280ms 7.1280ms 7.1280ms maxwell_scudnn_128x64_small_nn
  63. 0.07% 6.0585ms 2 3.0292ms 2.4185ms 3.6400ms maxwell_scudnn_128x128_stridedB_small_nn
  64. 0.07% 5.7305ms 1 5.7305ms 5.7305ms 5.7305ms void cudnn::detail::implicit_convolve_sgemm<float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0>*, float const *, kernel_conv_params, int, float, float)
  65. 0.06% 5.2956ms 1 5.2956ms 5.2956ms 5.2956ms void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int)
  66. 0.06% 5.2647ms 1 5.2647ms 5.2647ms 5.2647ms void cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0>(int, int, int, float const *, int, cudnn::detail::implicit_convolve_sgemm<float, int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0>*, float const *, kernel_conv_params, int, float, float)
  67. 0.06% 5.0949ms 1 5.0949ms 5.0949ms 5.0949ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float)
  68. 0.05% 4.5246ms 1 4.5246ms 4.5246ms 4.5246ms void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=0>(int, int, int, float const *, int, float const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, int=3, int=3, int=3, int=0, bool=0>*, kernel_conv_params, int, int, float, float)
  69. 0.05% 3.9295ms 1 3.9295ms 3.9295ms 3.9295ms void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int)
  70. 0.04% 3.2639ms 116 28.136us 2.2720us 438.90us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_right<float, float, Eigen::internal::scalar_product_op<float, float>>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, int>(float, int=1)
  71. 0.03% 2.9212ms 116 25.183us 19.841us 34.657us void cudnn::winograd::generateWinogradTilesKernel<int=0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)
  72. 0.01% 506.58us 11 46.052us 1.6000us 434.00us [CUDA memcpy DtoD]
  73. 0.01% 498.81us 6 83.134us 2.1120us 438.07us void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, int=1, int=1, int>, int=16, Eigen::MakePointer>, Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_right<float, float, Eigen::internal::scalar_sum_op<float, float>>, Eigen::TensorMap<Eigen::Tensor<float const , int=1, int=1, int>, int=16, Eigen::MakePointer> const > const > const , Eigen::GpuDevice>, int>(float, int=1)
  74. 0.01% 443.03us 229 1.9340us 1.8240us 3.2000us cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
  75. 0.00% 395.98us 1 395.98us 395.98us 395.98us void tensorflow::functor::FillPhiloxRandomKernelLaunch<tensorflow::random::NormalDistribution<tensorflow::random::PhiloxRandom, float>>(tensorflow::random::PhiloxRandom, tensorflow::random::PhiloxRandomResultElementType*, __int64, tensorflow::functor::FillPhiloxRandomKernelLaunch<tensorflow::random::NormalDistribution<tensorflow::random::PhiloxRandom, float>>)
  76. 0.00% 394.99us 5 78.998us 14.848us 136.42us void tensorflow::functor::FillPhiloxRandomKernelLaunch<tensorflow::random::TruncatedNormalDistribution<tensorflow::random::SingleSampleAdapter<tensorflow::random::PhiloxRandom>, float>>(tensorflow::random::PhiloxRandom, tensorflow::random::PhiloxRandomResultElementType*, __int64, tensorflow::functor::FillPhiloxRandomKernelLaunch<tensorflow::random::TruncatedNormalDistribution<tensorflow::random::SingleSampleAdapter<tensorflow::random::PhiloxRandom>, float>>)
  77. 0.00% 391.96us 562 697ns 576ns 6.1440us [CUDA memset]
  78. 0.00% 250.98us 120 2.0910us 1.8880us 2.2720us cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams)
  79. 0.00% 225.48us 124 1.8180us 1.6960us 2.1120us cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams)
  80. 0.00% 223.02us 120 1.8580us 1.3440us 11.008us void cudnn::maxwell::gemm::setOutputKernel<float>(int, float*, float)
  81. 0.00% 12.032us 8 1.5040us 1.2480us 2.3040us [CUDA memcpy HtoD]
  82.  
  83. ==91215== API calls:
  84. Time(%) Time Calls Avg Min Max Name
  85. 63.82% 7.19684s 221 32.565ms 339.51us 51.307ms cuCtxSynchronize
  86. 9.98% 1.12557s 2 562.79ms 556.85ms 568.72ms cuDevicePrimaryCtxRetain
  87. 9.04% 1.01963s 21882 46.596us 8.7490us 652.66ms cudaLaunch
  88. 6.53% 736.70ms 492101 1.4970us 705ns 827.32us cuEventQuery
  89. 5.41% 609.86ms 8 76.233ms 22.279us 609.14ms cudaStreamCreateWithFlags
  90. 3.14% 354.64ms 64 5.5412ms 1.4091ms 21.912ms cuEventSynchronize
  91. 0.63% 71.197ms 150919 471ns 280ns 766.28us cudaSetupArgument
  92. 0.24% 26.535ms 2 13.267ms 9.1017ms 17.433ms cuMemAlloc
  93. 0.20% 23.006ms 1210 19.012us 8.8600us 133.16us cuMemcpyDtoHAsync
  94. 0.18% 20.677ms 9742 2.1220us 1.0350us 25.143us cudaStreamWaitEvent
  95. 0.18% 19.789ms 7376 2.6820us 1.2110us 28.776us cudaEventRecord
  96. 0.16% 18.386ms 21882 840ns 334ns 767.97us cudaConfigureCall
  97. 0.11% 12.298ms 550 22.359us 13.816us 56.188us cuMemsetD32Async
  98. 0.09% 9.8094ms 3 3.2698ms 1.4747ms 4.2015ms cuMemHostAlloc
  99. 0.06% 6.7944ms 10107 672ns 274ns 762.78us cudaGetLastError
  100. 0.05% 5.1881ms 2562 2.0250us 870ns 14.234us cuEventRecord
  101. 0.04% 4.0525ms 660 6.1400us 2.8600us 811.73us cudaStreamQuery
  102. 0.02% 2.3163ms 1217 1.9030us 1.0010us 14.494us cuStreamWaitEvent
  103. 0.02% 2.2673ms 4 566.82us 514.32us 618.66us cuMemGetInfo
  104. 0.02% 1.8821ms 376 5.0050us 211ns 186.80us cuDeviceGetAttribute
  105. 0.02% 1.7974ms 338 5.3170us 1.9140us 19.423us cudaEventCreate
  106. 0.01% 1.1840ms 338 3.5020us 1.7090us 16.352us cudaEventDestroy
  107. 0.01% 1.0757ms 2 537.86us 505.23us 570.48us cuDeviceTotalMem
  108. 0.01% 1.0731ms 3 357.69us 14.289us 531.29us cudaMalloc
  109. 0.01% 803.24us 2 401.62us 384.83us 418.41us cudaGetDeviceProperties
  110. 0.01% 691.32us 4 172.83us 158.26us 192.44us cuDeviceTotalMem
  111. 0.00% 491.16us 2 245.58us 171.56us 319.60us cuDeviceGetProperties
  112. 0.00% 388.12us 8 48.515us 22.529us 168.51us cuStreamCreate
  113. 0.00% 282.75us 6 47.125us 36.269us 64.555us cuDeviceGetName
  114. 0.00% 210.05us 146 1.4380us 628ns 6.6940us cuEventCreate
  115. 0.00% 202.05us 10 20.204us 15.019us 34.114us cudaMemsetAsync
  116. 0.00% 193.47us 11 17.587us 14.836us 29.834us cudaMemcpyAsync
  117. 0.00% 147.38us 146 1.0090us 537ns 10.067us cuEventDestroy
  118. 0.00% 137.38us 24 5.7240us 2.8640us 14.948us cudaBindTexture
  119. 0.00% 136.40us 7 19.485us 15.689us 31.919us cuMemcpyHtoDAsync
  120. 0.00% 128.81us 2 64.402us 62.592us 66.213us cuMemsetD32
  121. 0.00% 114.25us 64 1.7850us 1.4630us 5.6450us cuEventElapsedTime
  122. 0.00% 78.697us 24 3.2790us 500ns 9.5080us cuCtxSetCurrent
  123. 0.00% 71.042us 8 8.8800us 4.2280us 24.102us cuStreamDestroy
  124. 0.00% 62.617us 8 7.8270us 1.8660us 32.014us cuStreamQuery
  125. 0.00% 46.035us 24 1.9180us 1.0550us 3.3500us cudaUnbindTexture
  126. 0.00% 35.458us 24 1.4770us 934ns 9.4170us cudaEventCreateWithFlags
  127. 0.00% 26.015us 1 26.015us 26.015us 26.015us cudaMemcpy
  128. 0.00% 18.299us 21 871ns 546ns 1.7640us cudaDeviceGetAttribute
  129. 0.00% 9.9600us 10 996ns 276ns 3.1900us cuDeviceGetCount
  130. 0.00% 9.1240us 2 4.5620us 3.1170us 6.0070us cudaGetDevice
  131. 0.00% 7.3820us 10 738ns 401ns 1.5590us cuDeviceGet
  132. 0.00% 4.7260us 4 1.1810us 496ns 2.3530us cuDriverGetVersion
  133. 0.00% 4.2060us 2 2.1030us 1.8410us 2.3650us cuDeviceGetPCIBusId
  134. 0.00% 3.8840us 2 1.9420us 1.5750us 2.3090us cuDevicePrimaryCtxSetFlags
  135. 0.00% 3.2630us 2 1.6310us 1.5480us 1.7150us cuInit
  136. 0.00% 2.8190us 8 352ns 263ns 503ns cuCtxGetDevice
  137. 0.00% 1.8750us 1 1.8750us 1.8750us 1.8750us cudaGetDeviceCount
  138. 0.00% 1.7210us 2 860ns 829ns 892ns cuDeviceComputeCapability
  139. 0.00% 1.6040us 4 401ns 299ns 623ns cuDeviceCanAccessPeer
  140. 0.00% 1.0870us 2 543ns 363ns 724ns cuCtxGetCurrent
  141. 0.00% 985ns 2 492ns 486ns 499ns cudaFree
  142.  
  143. real 0m21.865s
  144. user 0m20.820s
  145. sys 0m7.352s
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement