Advertisement
Guest User

MicroHH GPU

a guest
Sep 30th, 2014
261
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.39 KB | None | 0 0
  1. ==26084== Profiling result:
  2. Time(%) Time Calls Avg Min Max Name
  3. 10.72% 2.44564s 628 3.8943ms 3.8246ms 3.9915ms pres_4_hdma(double*, double*, double*, double*, double*, double*, double*, double*, int, int, int)
  4. 9.80% 2.23568s 157 14.240ms 14.178ms 15.649ms advec_4_advecw(double*, double*, double*, double*, double*, double, double, int, int, int, int, int, int, int, int)
  5. 9.04% 2.06149s 314 6.5653ms 6.3450ms 6.7574ms diff_4_diffc(double*, double const *, double const *, double const *, double, double, double, int, int, int, int, int, int, int, int)
  6. 7.96% 1.81504s 157 11.561ms 11.549ms 12.363ms advec_4_advecu(double*, double*, double*, double*, double*, double, double, int, int, int, int, int, int, int, int)
  7. 7.03% 1.60231s 157 10.206ms 10.187ms 10.544ms advec_4_advecv(double*, double*, double*, double*, double*, double, double, int, int, int, int, int, int, int, int)
  8. 4.97% 1.13340s 157 7.2191ms 7.2000ms 7.3375ms pres_4_presin(double*, double const *, double const *, double const *, double const *, double const *, double const *, double const *, double, double, double, int, int, int, int, int, int, int, int, int, int)
  9. 4.48% 1.02251s 157 6.5128ms 6.3021ms 6.6558ms diff_4_diffw(double*, double const *, double const *, double const *, double, double, double, int, int, int, int, int, int, int, int)
  10. 3.67% 836.99ms 157 5.3311ms 5.3254ms 5.3348ms pres_4_presout(double*, double*, double*, double const *, double const *, double, double, int, int, int, int, int, int, int, int)
  11. 3.64% 831.11ms 628 1.3234ms 1.3088ms 1.4022ms pres_4_solvein(double const *, double const *, double const *, double const *, double const *, double const *, double const *, double const *, double*, double*, double*, double*, double*, double*, double*, double*, double const *, double const *, int, int, int, int, int, int, int)
  12. 3.24% 738.23ms 40192 18.367us 13.388us 25.221us void dpRealComplex::preprocessC2C_kernelMem<double, fftAxii_t=1>(Complex<double>*, double const *, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, Complex, callbackt)
  13. 2.98% 679.75ms 40192 16.912us 4.2640us 29.388us void spRealComplex::unpackC2R_kernel<double>(double*, Complex<spRealComplex::unpackC2R_kernel<double>> const *, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord, callbackt)
  14. 2.88% 657.31ms 40192 16.354us 8.8730us 27.591us void spRealComplex::packR2C_kernel<double>(Complex<double>*, Complex const *, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord, callbackt)
  15. 2.36% 539.03ms 107 5.0377ms 5.0253ms 5.2811ms advec_4_calccfl(double*, double const *, double const *, double const *, double const *, double, double, int, int, int, int, int, int, int, int)
  16. 2.35% 535.67ms 40192 13.327us 8.8700us 16.755us void dpRealComplex::postprocessC2C_kernelMem<double, fftAxii_t=1>(Complex<double>*, double const *, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, Complex, callbackt)
  17. 2.08% 474.27ms 40192 11.800us 7.2700us 17.056us pres_4_complex_double_y(double2*, double*, int, int, bool)
  18. 1.99% 453.58ms 40192 11.285us 9.4860us 13.574us pres_4_complex_double_x(double2*, double*, int, int, bool)
  19. 1.52% 345.84ms 20096 17.209us 16.089us 18.627us void dpRadix0032B::kernel1Mem<fftDirection_t=1>(Complex<double>*, Complex const *, unsigned int, unsigned int, unsigned int, divisor_t, Complex const *, Complex const *, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, unsigned int, double, int, int)
  20. 1.36% 311.08ms 20096 15.479us 14.633us 17.032us void dpRadix0032B::kernel1Mem<fftDirection_t=-1>(Complex<double>*, Complex const *, unsigned int, unsigned int, unsigned int, divisor_t, Complex const *, Complex const *, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, unsigned int, double, int, int)
  21. 1.33% 303.90ms 156 1.9480ms 1.9214ms 1.9723ms void rk3_kernel2<int=0>(double*, double*, double, int, int, int, int, int, int, int, int)
  22. 1.33% 303.73ms 156 1.9470ms 1.9200ms 1.9709ms void rk3_kernel2<int=2>(double*, double*, double, int, int, int, int, int, int, int, int)
  23. 1.33% 303.71ms 156 1.9468ms 1.9206ms 1.9706ms void rk3_kernel2<int=1>(double*, double*, double, int, int, int, int, int, int, int, int)
  24. 1.33% 303.64ms 157 1.9340ms 1.9177ms 1.9571ms force_flux_step1(double*, double*, double const *, double const *, double const *, int, int, int, int, int, int, int, int)
  25. 1.33% 303.26ms 20096 15.090us 11.791us 18.072us void dpRadix0003A::kernel1Mem<fftDirection_t=-1>(Complex<double>*, Complex const *, unsigned int, unsigned int, unsigned int, divisor_t, Complex const *, Complex const *, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, unsigned int, double, int, int)
  26. 1.17% 267.80ms 20096 13.326us 10.330us 15.064us void dpRadix0003A::kernel1Mem<fftDirection_t=1>(Complex<double>*, Complex const *, unsigned int, unsigned int, unsigned int, divisor_t, Complex const *, Complex const *, coordDivisors_t, Coord<unsigned int>, Coord, unsigned int, unsigned int, double, int, int)
  27. 1.16% 263.50ms 314 839.16us 827.18us 855.05us void deviceReduceInterior<int=0, int=128>(double const *, double*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)
  28. 1.09% 249.56ms 20096 12.418us 11.631us 13.097us void dpVector0128C::kernelTex<fftDirection_t=-1>(Complex<double>*, unsigned int, unsigned int, unsigned int, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord)
  29. 1.08% 247.34ms 20096 12.307us 11.472us 12.954us void dpVector0128C::kernelTex<fftDirection_t=1>(Complex<double>*, unsigned int, unsigned int, unsigned int, unsigned int, coordDivisors_t, Coord<unsigned int>, Coord)
  30. 0.93% 211.11ms 1099 192.10us 185.88us 202.18us grid_cyclic_x(double*, int, int, int, int, int, int, int, int, int, int)
  31. 0.90% 205.55ms 53 3.8783ms 3.8507ms 3.9194ms pres_4_calcdivergence(double*, double*, double*, double*, double*, double, double, int, int, int, int, int, int, int, int)
  32. 0.78% 178.43ms 628 284.12us 278.50us 308.09us pres_4_solveputback(double*, double const *, int, int, int, int, int)
  33. 0.69% 157.21ms 157 1.0013ms 994.28us 1.0301ms force_flux_step2(double*, double, int, int, int, int, int, int, int, int)
  34. 0.69% 156.86ms 20096 7.8050us 6.2970us 8.5560us pres_4_normalize(double*, int, int, double)
  35. 0.66% 149.94ms 157 955.06us 952.35us 957.78us pres_4_solveout(double*, double*, int, int, int, int, int, int, int, int, int, int)
  36. 0.59% 134.28ms 160 839.22us 829.58us 853.09us void deviceReduceInterior<int=1, int=128>(double const *, double*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)
  37. 0.53% 121.54ms 543 223.84us 1.8550us 10.650ms [CUDA memcpy DtoH]
  38. 0.51% 116.02ms 86 1.3491ms 1.2480us 10.503ms [CUDA memcpy HtoD]
  39. 0.31% 70.885ms 1099 64.499us 62.536us 66.645us grid_cyclic_y(double*, int, int, int, int, int, int, int, int, int, int)
  40. 0.04% 9.9609ms 314 31.722us 29.992us 33.400us boundary_setgctop_4th(double*, int, double*, double*, double*, int, int, int, int)
  41. 0.04% 9.8655ms 314 31.418us 30.258us 32.735us boundary_setgcbot_4th(double*, int, double*, double*, double*, int, int, int, int)
  42. 0.02% 4.2120ms 157 26.827us 26.238us 27.546us boundary_setgcbotw_4th(double*, int, int, int, int)
  43. 0.02% 4.1904ms 157 26.690us 26.110us 27.442us boundary_setgctopw_4th(double*, int, int, int, int)
  44. 0.01% 2.6596ms 314 8.4690us 7.9610us 9.3020us void deviceReduceAll<int=0, int=128>(double const *, double*, unsigned int, unsigned int, double)
  45. 0.01% 2.6289ms 157 16.744us 15.991us 17.480us pres_4_gcwt(double*, int, int, int, int, int, int, int, int)
  46. 0.01% 1.3540ms 160 8.4620us 8.0470us 9.1790us void deviceReduceAll<int=1, int=128>(double const *, double*, unsigned int, unsigned int, double)
  47. 0.00% 783.23us 314 2.4940us 2.4720us 2.7190us void deviceReduceAll<int=0, int=64>(double const *, double*, unsigned int, unsigned int, double)
  48. 0.00% 397.31us 160 2.4830us 2.4680us 2.5660us void deviceReduceAll<int=1, int=64>(double const *, double*, unsigned int, unsigned int, double)
  49. 0.00% 202.87us 157 1.2920us 1.2470us 2.5280us [CUDA memcpy DtoD]
  50.  
  51. ==26084== API calls:
  52. Time(%) Time Calls Avg Min Max Name
  53. 45.75% 12.3440s 668 18.479ms 10.000us 72.615ms cudaMemcpy
  54. 23.79% 6.41859s 80384 79.849us 5.0000us 21.959ms cudaThreadSynchronize
  55. 14.28% 3.85301s 390782 9.8590us 6.0000us 4.4410ms cudaLaunch
  56. 6.41% 1.72871s 3477776 497ns 0ns 317.00us cudaSetupArgument
  57. 2.10% 566.94ms 88 6.4425ms 12.000us 217.39ms cudaHostAlloc
  58. 1.67% 450.13ms 281344 1.5990us 1.0000us 1.5580ms cudaFuncSetCacheConfig
  59. 0.92% 247.60ms 118 2.0983ms 93.000us 18.148ms cudaMemcpy2D
  60. 0.88% 236.51ms 81 2.9199ms 1.0000us 231.13ms cudaFree
  61. 0.84% 225.56ms 390782 577ns 0ns 315.00us cudaConfigureCall
  62. 0.73% 197.16ms 1 197.16ms 197.16ms 197.16ms cudaDeviceReset
  63. 0.71% 191.18ms 40192 4.7560us 4.0000us 251.00us cudaBindTexture
  64. 0.59% 158.42ms 281344 563ns 0ns 312.00us cudaPeekAtLastError
  65. 0.58% 157.68ms 281608 559ns 0ns 312.00us cudaGetLastError
  66. 0.45% 120.53ms 88 1.3696ms 15.000us 10.697ms cudaFreeHost
  67. 0.27% 71.719ms 40192 1.7840us 1.0000us 179.00us cudaUnbindTexture
  68. 0.04% 9.6110ms 92 104.47us 4.0000us 636.00us cudaMalloc
  69. 0.00% 1.2450ms 4 311.25us 302.00us 317.00us cudaGetDeviceProperties
  70. 0.00% 766.00us 166 4.6140us 0ns 156.00us cuDeviceGetAttribute
  71. 0.00% 98.000us 2 49.000us 45.000us 53.000us cuDeviceTotalMem
  72. 0.00% 85.000us 2 42.500us 42.000us 43.000us cuDeviceGetName
  73. 0.00% 41.000us 24 1.7080us 1.0000us 4.0000us cudaGetDevice
  74. 0.00% 5.0000us 3 1.6660us 1.0000us 2.0000us cuDeviceGetCount
  75. 0.00% 2.0000us 3 666ns 0ns 1.0000us cuDeviceGet
  76. 0.00% 2.0000us 1 2.0000us 2.0000us 2.0000us cuDriverGetVersion
  77. 0.00% 1.0000us 1 1.0000us 1.0000us 1.0000us cuInit
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement