Advertisement
Guest User

CUTLASS tests: clang vs. nvcc on sm_60

a guest
Dec 11th, 2017
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.77 KB | None | 0 0
  1.  
  2. ==========================================
  3. ## bin/dgemm_nn_sm35-60_clang_9.0
  4. ==========================================
  5.  
  6. ------------------------------------------------------------
  7. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  8.  
  9. Avg runtime: 53.706 ms, total flops: 240576888832, GFLOP/s: 4479.52
  10.  
  11. Final wave_efficiency 1.0000, tiling_efficiency 8.0000
  12. Invoking kernel<<<(448, 256, 1), (1.y,64.x), 0, 0>>>(), 13 SM occupancy, 4096 split_k
  13. Avg runtime: 162.993 ms, total flops: 240576888832, GFLOP/s: 1476.00
  14.  
  15. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  16. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 7 SM occupancy, 4096 split_k
  17. Avg runtime: 103.393 ms, total flops: 240576888832, GFLOP/s: 2326.83
  18.  
  19. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  20. Invoking kernel<<<(112, 64, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  21. Avg runtime: 68.634 ms, total flops: 240576888832, GFLOP/s: 3505.24
  22.  
  23. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  24. Invoking kernel<<<(56, 128, 1), (1.y,128.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  25. Avg runtime: 83.297 ms, total flops: 240576888832, GFLOP/s: 2888.18
  26.  
  27. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  28. Invoking kernel<<<(224, 32, 1), (1.y,128.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  29. Avg runtime: 59.055 ms, total flops: 240576888832, GFLOP/s: 4073.79
  30.  
  31. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  32. Invoking kernel<<<(112, 32, 1), (1.y,128.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  33. Avg runtime: 56.380 ms, total flops: 240576888832, GFLOP/s: 4267.04
  34.  
  35.  
  36.  
  37.  
  38. ==========================================
  39. ## bin/dgemm_nn_sm35-60_nvcc_9.0
  40. ==========================================
  41.  
  42. ------------------------------------------------------------
  43. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  44.  
  45. Avg runtime: 53.666 ms, total flops: 240576888832, GFLOP/s: 4482.85
  46.  
  47. Final wave_efficiency 1.0000, tiling_efficiency 8.0000
  48. Invoking kernel<<<(448, 256, 1), (1.y,64.x), 0, 0>>>(), 13 SM occupancy, 4096 split_k
  49. Avg runtime: 107.486 ms, total flops: 240576888832, GFLOP/s: 2238.21
  50.  
  51. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  52. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 7 SM occupancy, 4096 split_k
  53. Avg runtime: 61.130 ms, total flops: 240576888832, GFLOP/s: 3935.50
  54.  
  55. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  56. Invoking kernel<<<(112, 64, 1), (1.y,256.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  57. Avg runtime: 59.699 ms, total flops: 240576888832, GFLOP/s: 4029.82
  58.  
  59. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  60. Invoking kernel<<<(56, 128, 1), (1.y,128.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  61. Avg runtime: 57.979 ms, total flops: 240576888832, GFLOP/s: 4149.35
  62.  
  63. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  64. Invoking kernel<<<(224, 32, 1), (1.y,128.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  65. Avg runtime: 57.273 ms, total flops: 240576888832, GFLOP/s: 4200.53
  66.  
  67. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  68. Invoking kernel<<<(112, 32, 1), (1.y,128.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  69. Avg runtime: 55.135 ms, total flops: 240576888832, GFLOP/s: 4363.44
  70.  
  71.  
  72.  
  73.  
  74. ==========================================
  75. ## bin/igemm_nn_sm35-60_clang_9.0
  76. ==========================================
  77.  
  78. ------------------------------------------------------------
  79. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  80.  
  81. CUDA error 30 [gemm.cu, 199]: unknown error
  82. CUDA error 30 [gemm.cu, 253]: unknown error
  83. CUDA error 30 [gemm.cu, 253]: unknown error
  84. CUDA error 30 [gemm.cu, 275]: unknown error
  85. CUDA error 30 [gemm.cu, 275]: unknown error
  86. CUDA error 30 [gemm.cu, 275]: unknown error
  87. CUDA error 30 [gemm.cu, 275]: unknown error
  88. CUDA error 30 [gemm.cu, 275]: unknown error
  89. CUDA error 30 [gemm.cu, 275]: unknown error
  90. CUDA error 30 [gemm.cu, 275]: unknown error
  91. CUDA error 30 [gemm.cu, 275]: unknown error
  92. CUDA error 30 [gemm.cu, 275]: unknown error
  93. CUDA error 30 [gemm.cu, 275]: unknown error
  94. Avg runtime: 0.000 ms, total flops: 240576888832, GFLOP/s: 544784609.36
  95.  
  96. Final wave_efficiency 1.0000, tiling_efficiency 10.6667
  97. Invoking kernel<<<(448, 128, 1), (1.y,32.x), 0, 0>>>(), 28 SM occupancy, 4096 split_k
  98. Avg runtime: 54.287 ms, total flops: 240576888832, GFLOP/s: 4431.56
  99.  
  100. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  101. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 20 SM occupancy, 4096 split_k
  102. Avg runtime: 48.548 ms, total flops: 240576888832, GFLOP/s: 4955.47
  103.  
  104. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  105. Invoking kernel<<<(112, 64, 1), (1.y,128.x), 0, 0>>>(), 7 SM occupancy, 4096 split_k
  106. Avg runtime: 41.715 ms, total flops: 240576888832, GFLOP/s: 5767.18
  107.  
  108. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  109. Invoking kernel<<<(56, 64, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  110. Avg runtime: 40.127 ms, total flops: 240576888832, GFLOP/s: 5995.43
  111.  
  112. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  113. Invoking kernel<<<(112, 32, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  114. Avg runtime: 41.407 ms, total flops: 240576888832, GFLOP/s: 5810.05
  115.  
  116. Final wave_efficiency 1.0000, tiling_efficiency 64.0000
  117. Invoking kernel<<<(56, 32, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  118. Avg runtime: 36.196 ms, total flops: 240576888832, GFLOP/s: 6646.46
  119.  
  120.  
  121.  
  122.  
  123. ==========================================
  124. ## bin/igemm_nn_sm35-60_nvcc_9.0
  125. ==========================================
  126.  
  127. ------------------------------------------------------------
  128. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  129.  
  130. CUDA error 30 [gemm.cu, 199]: unknown error
  131. CUDA error 30 [gemm.cu, 253]: unknown error
  132. CUDA error 30 [gemm.cu, 253]: unknown error
  133. CUDA error 30 [gemm.cu, 275]: unknown error
  134. CUDA error 30 [gemm.cu, 275]: unknown error
  135. CUDA error 30 [gemm.cu, 275]: unknown error
  136. CUDA error 30 [gemm.cu, 275]: unknown error
  137. CUDA error 30 [gemm.cu, 275]: unknown error
  138. CUDA error 30 [gemm.cu, 275]: unknown error
  139. CUDA error 30 [gemm.cu, 275]: unknown error
  140. CUDA error 30 [gemm.cu, 275]: unknown error
  141. CUDA error 30 [gemm.cu, 275]: unknown error
  142. CUDA error 30 [gemm.cu, 275]: unknown error
  143. Avg runtime: 0.000 ms, total flops: 240576888832, GFLOP/s: 1503605593.18
  144.  
  145. Final wave_efficiency 1.0000, tiling_efficiency 10.6667
  146. Invoking kernel<<<(448, 128, 1), (1.y,32.x), 0, 0>>>(), 28 SM occupancy, 4096 split_k
  147. Avg runtime: 50.958 ms, total flops: 240576888832, GFLOP/s: 4721.12
  148.  
  149. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  150. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 20 SM occupancy, 4096 split_k
  151. Avg runtime: 49.173 ms, total flops: 240576888832, GFLOP/s: 4892.42
  152.  
  153. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  154. Invoking kernel<<<(112, 64, 1), (1.y,128.x), 0, 0>>>(), 8 SM occupancy, 4096 split_k
  155. Avg runtime: 44.977 ms, total flops: 240576888832, GFLOP/s: 5348.92
  156.  
  157. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  158. Invoking kernel<<<(56, 64, 1), (1.y,256.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  159. Avg runtime: 42.101 ms, total flops: 240576888832, GFLOP/s: 5714.31
  160.  
  161. Final wave_efficiency 1.0000, tiling_efficiency 42.6667
  162. Invoking kernel<<<(112, 32, 1), (1.y,256.x), 0, 0>>>(), 3 SM occupancy, 4096 split_k
  163. Avg runtime: 41.762 ms, total flops: 240576888832, GFLOP/s: 5760.67
  164.  
  165. Final wave_efficiency 1.0000, tiling_efficiency 64.0000
  166. Invoking kernel<<<(56, 32, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  167. Avg runtime: 36.201 ms, total flops: 240576888832, GFLOP/s: 6645.63
  168.  
  169.  
  170.  
  171.  
  172. ==========================================
  173. ## bin/sgemm_nn_sm35-60_clang_9.0
  174. ==========================================
  175.  
  176. ------------------------------------------------------------
  177. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  178.  
  179. Avg runtime: 26.767 ms, total flops: 240576888832, GFLOP/s: 8987.94
  180.  
  181. Final wave_efficiency 1.0000, tiling_efficiency 8.0000
  182. Invoking kernel<<<(448, 256, 1), (1.y,64.x), 0, 0>>>(), 14 SM occupancy, 4096 split_k
  183. Avg runtime: 84.903 ms, total flops: 240576888832, GFLOP/s: 2833.56
  184.  
  185. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  186. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 16 SM occupancy, 4096 split_k
  187. Avg runtime: 43.737 ms, total flops: 240576888832, GFLOP/s: 5500.53
  188.  
  189. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  190. Invoking kernel<<<(112, 64, 1), (1.y,64.x), 0, 0>>>(), 8 SM occupancy, 4096 split_k
  191. Avg runtime: 30.143 ms, total flops: 240576888832, GFLOP/s: 7981.22
  192.  
  193. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  194. Invoking kernel<<<(56, 128, 1), (1.y,128.x), 0, 0>>>(), 5 SM occupancy, 4096 split_k
  195. Avg runtime: 34.104 ms, total flops: 240576888832, GFLOP/s: 7054.30
  196.  
  197. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  198. Invoking kernel<<<(224, 32, 1), (1.y,128.x), 0, 0>>>(), 5 SM occupancy, 4096 split_k
  199. Avg runtime: 30.888 ms, total flops: 240576888832, GFLOP/s: 7788.80
  200.  
  201. Final wave_efficiency 1.0000, tiling_efficiency 64.0000
  202. Invoking kernel<<<(56, 32, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  203. Avg runtime: 27.776 ms, total flops: 240576888832, GFLOP/s: 8661.31
  204.  
  205.  
  206.  
  207.  
  208. ==========================================
  209. ## bin/sgemm_nn_sm35-60_nvcc_9.0
  210. ==========================================
  211.  
  212. ------------------------------------------------------------
  213. 7168x4096x4096, GEMM_nn, 29360128 C elements, 10 timing iterations
  214.  
  215. Avg runtime: 26.773 ms, total flops: 240576888832, GFLOP/s: 8985.93
  216.  
  217. Final wave_efficiency 1.0000, tiling_efficiency 8.0000
  218. Invoking kernel<<<(448, 256, 1), (1.y,64.x), 0, 0>>>(), 24 SM occupancy, 4096 split_k
  219. Avg runtime: 98.990 ms, total flops: 240576888832, GFLOP/s: 2430.31
  220.  
  221. Final wave_efficiency 1.0000, tiling_efficiency 16.0000
  222. Invoking kernel<<<(224, 128, 1), (1.y,64.x), 0, 0>>>(), 16 SM occupancy, 4096 split_k
  223. Avg runtime: 35.311 ms, total flops: 240576888832, GFLOP/s: 6813.08
  224.  
  225. Final wave_efficiency 1.0000, tiling_efficiency 32.0000
  226. Invoking kernel<<<(112, 64, 1), (1.y,64.x), 0, 0>>>(), 8 SM occupancy, 4096 split_k
  227. Avg runtime: 27.999 ms, total flops: 240576888832, GFLOP/s: 8592.41
  228.  
  229. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  230. Invoking kernel<<<(56, 128, 1), (1.y,128.x), 0, 0>>>(), 6 SM occupancy, 4096 split_k
  231. Avg runtime: 29.564 ms, total flops: 240576888832, GFLOP/s: 8137.48
  232.  
  233. Final wave_efficiency 1.0000, tiling_efficiency 25.6000
  234. Invoking kernel<<<(224, 32, 1), (1.y,128.x), 0, 0>>>(), 6 SM occupancy, 4096 split_k
  235. Avg runtime: 30.145 ms, total flops: 240576888832, GFLOP/s: 7980.67
  236.  
  237. Final wave_efficiency 1.0000, tiling_efficiency 64.0000
  238. Invoking kernel<<<(56, 32, 1), (1.y,256.x), 0, 0>>>(), 2 SM occupancy, 4096 split_k
  239. Avg runtime: 27.201 ms, total flops: 240576888832, GFLOP/s: 8844.31
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement