Guest User

Untitled

a guest
Jul 21st, 2018
73
0
Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
text 26.86 KB | None | 0 0
  1. .text
  2. .hsa_code_object_version 2,1
  3. .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel0 — gfx803 (GCN3) HSA kernel, code object v2.
; Symbol name suggests one stage of a compiler-fused conv2d+ReLU
; pipeline (looks like TVM-generated output — TODO confirm). This stage
; does no arithmetic on the data value itself: each active lane either
; loads one dword from the source pointer or keeps 0.0 (v4), then
; stores that dword to the destination pointer — i.e. it appears to
; build a zero-padded copy of the input tensor (TODO confirm).
;
; Register/ABI facts from the descriptor below:
;   user_sgpr_count = 6: s[0:3] = private segment buffer,
;                        s[4:5] = kernarg segment pointer.
;   s6 = workgroup id X (enable_sgpr_workgroup_id_x = 1).
;   v0 = work-item id X (enable_vgpr_workitem_id = 0 -> X only).
; Kernarg block is 16 bytes: +0x0 -> s[6:7] (flat store base),
;   +0x8 -> s[2:3] in BB0_3 (flat load base).
; NOTE(review): the "1.", "2.", ... prefixes on every line are Pastebin
; scrape artifacts; the file will not assemble until they are stripped.
;-----------------------------------------------------------------------
  4. .globl fuse_conv2d_relu_kernel0
  5. .p2align 8
  6. .type fuse_conv2d_relu_kernel0,@function
  7. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel0
  8. fuse_conv2d_relu_kernel0:
; Kernel descriptor (amd_kernel_code_t). Per the AMDGPU backend spec,
; wavefront_size is a log2 field: 6 -> 2^6 = 64 lanes. The granulated
; VGPR/SGPR counts are likewise encoded in allocation granules.
  9. .amd_kernel_code_t
  10. amd_code_version_major = 1
  11. amd_code_version_minor = 1
  12. amd_machine_kind = 1
  13. amd_machine_version_major = 8
  14. amd_machine_version_minor = 0
  15. amd_machine_version_stepping = 3
  16. kernel_code_entry_byte_offset = 256
  17. kernel_code_prefetch_byte_size = 0
  18. max_scratch_backing_memory_byte_size = 0
  19. granulated_workitem_vgpr_count = 1
  20. granulated_wavefront_sgpr_count = 1
  21. priority = 0
  22. float_mode = 192
  23. priv = 0
  24. enable_dx10_clamp = 1
  25. debug_mode = 0
  26. enable_ieee_mode = 1
  27. enable_sgpr_private_segment_wave_byte_offset = 0
  28. user_sgpr_count = 6
  29. enable_trap_handler = 1
  30. enable_sgpr_workgroup_id_x = 1
  31. enable_sgpr_workgroup_id_y = 0
  32. enable_sgpr_workgroup_id_z = 0
  33. enable_sgpr_workgroup_info = 0
  34. enable_vgpr_workitem_id = 0
  35. enable_exception_msb = 0
  36. granulated_lds_size = 0
  37. enable_exception = 0
  38. enable_sgpr_private_segment_buffer = 1
  39. enable_sgpr_dispatch_ptr = 0
  40. enable_sgpr_queue_ptr = 0
  41. enable_sgpr_kernarg_segment_ptr = 1
  42. enable_sgpr_dispatch_id = 0
  43. enable_sgpr_flat_scratch_init = 0
  44. enable_sgpr_private_segment_size = 0
  45. enable_sgpr_grid_workgroup_count_x = 0
  46. enable_sgpr_grid_workgroup_count_y = 0
  47. enable_sgpr_grid_workgroup_count_z = 0
  48. enable_ordered_append_gds = 0
  49. private_element_size = 1
  50. is_ptr64 = 1
  51. is_dynamic_callstack = 0
  52. is_debug_enabled = 0
  53. is_xnack_enabled = 0
  54. workitem_private_segment_byte_size = 0
  55. workgroup_group_segment_byte_size = 0
  56. gds_segment_byte_size = 0
  57. kernarg_segment_byte_size = 16
  58. workgroup_fbarrier_count = 0
  59. wavefront_sgpr_count = 10
  60. workitem_vgpr_count = 7
  61. reserved_vgpr_first = 0
  62. reserved_vgpr_count = 0
  63. reserved_sgpr_first = 0
  64. reserved_sgpr_count = 0
  65. debug_wavefront_private_segment_offset_sgpr = 0
  66. debug_private_segment_buffer_sgpr = 0
  67. kernarg_segment_alignment = 4
  68. group_segment_alignment = 4
  69. private_segment_alignment = 4
  70. wavefront_size = 6
  71. call_convention = -1
  72. runtime_loader_kernel_symbol = 0
  73. .end_amd_kernel_code_t
; Bounds guard: lane stays active iff block_id*256 + tid < 0x25bd8
; (154584); otherwise exec is cleared and we jump to BB0_5 (end).
  74. v_sub_u32_e32 v1, vcc, 0x25bd8, v0
  75. s_lshl_b32 s0, s6, 8
  76. v_cmp_lt_i32_e32 vcc, s0, v1
  77. s_and_saveexec_b64 s[0:1], vcc
  78. s_cbranch_execz BB0_5
  79. BB0_1:
; v1 = block_id*28 + tid; then v0 = v1 mod 0xe4 (228): the
; mul_hi(0x8fb823ef) + add + sar 7 + add-sign-bit sequence is the
; standard signed magic-number division (here for divisor 228 —
; TODO confirm divisor). Only lanes whose remainder is < 0xe2 (226)
; continue; the rest exit via the exec mask.
  80. s_mul_i32 s0, s6, 28
  81. v_add_u32_e32 v1, vcc, s0, v0
  82. v_mov_b32_e32 v0, 0x8fb823ef
  83. v_mul_hi_i32 v0, v1, v0
  84. s_movk_i32 s0, 0xe4
  85. v_add_u32_e32 v0, vcc, v0, v1
  86. v_lshrrev_b32_e32 v2, 31, v0
  87. v_ashrrev_i32_e32 v0, 7, v0
  88. v_add_u32_e32 v0, vcc, v2, v0
  89. v_mul_lo_i32 v0, v0, s0
  90. v_mov_b32_e32 v2, 0xe2
  91. v_subrev_u32_e32 v0, vcc, v0, v1
  92. v_cmp_lt_i32_e32 vcc, v0, v2
  93. s_and_saveexec_b64 s[2:3], vcc
  94. s_cbranch_execz BB0_5
  95. BB0_2:
; Decompose the linear index with more magic-number divisions (the
; multipliers 0x28b30361, 0x55555556, 0x487ede05, 0x6c880903 encode
; fixed divisors — e.g. 0x55555556 is the classic divide-by-3; the
; exact tensor dims are not decoded here). v4 = 0.0 is the default
; (padding) value. The predicate assembled in s[2:3]
; (0 < v0, v0 != 0xe1, and an index-1 < 0xe0 window test) selects
; lanes that actually read memory — a zero-padding gather, presumably
; preparing the convolution input (TODO confirm).
  96. s_mul_i32 s6, s6, s0
  97. v_add_u32_e32 v1, vcc, s6, v1
  98. v_mov_b32_e32 v2, 0x28b30361
  99. v_mov_b32_e32 v4, 0x8fb823ef
  100. v_mul_hi_i32 v4, v1, v4
  101. v_mul_hi_i32 v2, v1, v2
  102. s_movk_i32 s0, 0xe2
  103. s_load_dwordx2 s[6:7], s[4:5], 0x0
  104. v_add_u32_e32 v4, vcc, v4, v1
  105. v_lshrrev_b32_e32 v3, 31, v2
  106. v_ashrrev_i32_e32 v2, 13, v2
  107. v_add_u32_e32 v2, vcc, v2, v3
  108. v_mov_b32_e32 v3, 0x55555556
  109. v_lshrrev_b32_e32 v6, 31, v4
  110. v_ashrrev_i32_e32 v4, 7, v4
  111. v_mul_hi_i32 v3, v2, v3
  112. v_add_u32_e32 v4, vcc, v6, v4
  113. v_mov_b32_e32 v6, 0x487ede05
  114. v_mul_hi_i32 v6, v4, v6
  115. v_lshrrev_b32_e32 v5, 31, v3
  116. v_add_u32_e32 v3, vcc, v3, v5
  117. v_mul_lo_i32 v3, v3, 3
  118. v_lshrrev_b32_e32 v5, 31, v6
  119. v_ashrrev_i32_e32 v6, 6, v6
  120. v_add_u32_e32 v5, vcc, v6, v5
  121. v_mov_b32_e32 v6, 0x6c880903
  122. v_mul_lo_i32 v5, v5, s0
  123. v_mul_hi_i32 v6, v1, v6
  124. v_subrev_u32_e32 v3, vcc, v3, v2
  125. v_cmp_lt_i32_e64 s[2:3], 0, v0
  126. v_subrev_u32_e32 v1, vcc, v5, v4
  127. v_lshrrev_b32_e32 v2, 31, v6
  128. v_add_u32_sdwa v2, vcc, sext(v6), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
  129. v_add_u32_e32 v5, vcc, -1, v1
  130. v_mov_b32_e32 v6, 0xe0
  131. v_mov_b32_e32 v4, 0xe1
  132. v_cmp_lt_u32_e32 vcc, v5, v6
  133. v_cmp_ne_u32_e64 s[0:1], v0, v4
  134. s_and_b64 s[2:3], s[2:3], vcc
  135. v_mov_b32_e32 v4, 0
  136. s_and_b64 s[2:3], s[0:1], s[2:3]
  137. s_and_saveexec_b64 s[0:1], s[2:3]
  138. s_cbranch_execz BB0_4
  139. BB0_3:
; In-range lanes only: load one dword into v4 from the source buffer
; (kernarg+0x8 -> s[2:3]) at the computed element index; note the
; constant bias 0xffffff1f = -225 applied before the *4 byte scaling
; (presumably undoing the padding offset — TODO confirm).
  140. s_load_dwordx2 s[2:3], s[4:5], 0x8
  141. s_mov_b32 s4, 0xc400
  142. v_mul_lo_i32 v4, v3, s4
  143. s_mov_b32 s4, 0x24c00
  144. v_mul_lo_i32 v5, v2, s4
  145. s_movk_i32 s4, 0xe0
  146. v_mul_lo_i32 v6, v1, s4
  147. v_add_u32_e32 v5, vcc, v5, v0
  148. v_add_u32_e32 v5, vcc, v5, v6
  149. v_add_u32_e32 v4, vcc, v4, v5
  150. v_add_u32_e32 v4, vcc, 0xffffff1f, v4
  151. v_ashrrev_i32_e32 v5, 31, v4
  152. v_lshlrev_b64 v[4:5], 2, v[4:5]
  153. s_waitcnt lgkmcnt(0)
  154. v_mov_b32_e32 v6, s3
  155. v_add_u32_e32 v4, vcc, s2, v4
  156. v_addc_u32_e32 v5, vcc, v6, v5, vcc
  157. flat_load_dword v4, v[4:5]
  158. BB0_4:
; Re-converge; compute the destination element index (per-dimension
; strides 0xc948, 0x25bd8, 0xe4 elements) and store v4 — the loaded
; value, or 0.0 for padded positions — through dst = s[6:7]
; (kernarg+0x0). vmcnt(0) waits for the conditional load to land.
  159. s_or_b64 exec, exec, s[0:1]
  160. s_mov_b32 s0, 0xc948
  161. v_mul_lo_i32 v3, v3, s0
  162. s_mov_b32 s0, 0x25bd8
  163. v_mul_lo_i32 v2, v2, s0
  164. s_movk_i32 s0, 0xe4
  165. v_mul_lo_i32 v1, v1, s0
  166. v_add_u32_e32 v0, vcc, v2, v0
  167. s_waitcnt lgkmcnt(0)
  168. v_mov_b32_e32 v2, s7
  169. v_add_u32_e32 v0, vcc, v0, v1
  170. v_add_u32_e32 v0, vcc, v3, v0
  171. v_ashrrev_i32_e32 v1, 31, v0
  172. v_lshlrev_b64 v[0:1], 2, v[0:1]
  173. v_add_u32_e32 v0, vcc, s6, v0
  174. v_addc_u32_e32 v1, vcc, v2, v1, vcc
  175. s_waitcnt vmcnt(0)
  176. flat_store_dword v[0:1], v4
  177. BB0_5:
  178. s_endpgm
  179. .Lfunc_end0:
  180. .size fuse_conv2d_relu_kernel0, .Lfunc_end0-fuse_conv2d_relu_kernel0
  181.  
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel1 — gfx803 (GCN3) HSA kernel, code object v2.
; Second stage of the fused pipeline. Kernarg block is 8 bytes: a
; single 64-bit pointer, loaded into s[0:1] in BB1_1. Each active
; lane computes ONE flat address v[0:1] into that buffer, conditionally
; loads a dword from it into v2 (default v2 = 0.0 when the window test
; in vcc fails), then stores v2 back to the SAME address — i.e. it
; rewrites in-window elements with themselves and zeroes out-of-window
; elements in place (presumably boundary cleanup of the buffer built
; by kernel0 — TODO confirm).
;
; ABI (from the descriptor): s[4:5] = kernarg pointer, s6 = workgroup
; id X, v0 = work-item id X; wavefront_size = 6 is log2 (64 lanes).
; Lanes are grouped 4-per-element: v1 = tid >> 2 selects the element,
; v7 = tid & 3 is OR'd into the low bits of the element offset.
;-----------------------------------------------------------------------
  182. .globl fuse_conv2d_relu_kernel1
  183. .p2align 8
  184. .type fuse_conv2d_relu_kernel1,@function
  185. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel1
  186. fuse_conv2d_relu_kernel1:
; Kernel descriptor — same layout/conventions as kernel0 above.
  187. .amd_kernel_code_t
  188. amd_code_version_major = 1
  189. amd_code_version_minor = 1
  190. amd_machine_kind = 1
  191. amd_machine_version_major = 8
  192. amd_machine_version_minor = 0
  193. amd_machine_version_stepping = 3
  194. kernel_code_entry_byte_offset = 256
  195. kernel_code_prefetch_byte_size = 0
  196. max_scratch_backing_memory_byte_size = 0
  197. granulated_workitem_vgpr_count = 1
  198. granulated_wavefront_sgpr_count = 1
  199. priority = 0
  200. float_mode = 192
  201. priv = 0
  202. enable_dx10_clamp = 1
  203. debug_mode = 0
  204. enable_ieee_mode = 1
  205. enable_sgpr_private_segment_wave_byte_offset = 0
  206. user_sgpr_count = 6
  207. enable_trap_handler = 1
  208. enable_sgpr_workgroup_id_x = 1
  209. enable_sgpr_workgroup_id_y = 0
  210. enable_sgpr_workgroup_id_z = 0
  211. enable_sgpr_workgroup_info = 0
  212. enable_vgpr_workitem_id = 0
  213. enable_exception_msb = 0
  214. granulated_lds_size = 0
  215. enable_exception = 0
  216. enable_sgpr_private_segment_buffer = 1
  217. enable_sgpr_dispatch_ptr = 0
  218. enable_sgpr_queue_ptr = 0
  219. enable_sgpr_kernarg_segment_ptr = 1
  220. enable_sgpr_dispatch_id = 0
  221. enable_sgpr_flat_scratch_init = 0
  222. enable_sgpr_private_segment_size = 0
  223. enable_sgpr_grid_workgroup_count_x = 0
  224. enable_sgpr_grid_workgroup_count_y = 0
  225. enable_sgpr_grid_workgroup_count_z = 0
  226. enable_ordered_append_gds = 0
  227. private_element_size = 1
  228. is_ptr64 = 1
  229. is_dynamic_callstack = 0
  230. is_debug_enabled = 0
  231. is_xnack_enabled = 0
  232. workitem_private_segment_byte_size = 0
  233. workgroup_group_segment_byte_size = 0
  234. gds_segment_byte_size = 0
  235. kernarg_segment_byte_size = 8
  236. workgroup_fbarrier_count = 0
  237. wavefront_sgpr_count = 9
  238. workitem_vgpr_count = 8
  239. reserved_vgpr_first = 0
  240. reserved_vgpr_count = 0
  241. reserved_sgpr_first = 0
  242. reserved_sgpr_count = 0
  243. debug_wavefront_private_segment_offset_sgpr = 0
  244. debug_private_segment_buffer_sgpr = 0
  245. kernarg_segment_alignment = 4
  246. group_segment_alignment = 4
  247. private_segment_alignment = 4
  248. wavefront_size = 6
  249. call_convention = -1
  250. runtime_loader_kernel_symbol = 0
  251. .end_amd_kernel_code_t
; Bounds guard: active iff block_id*64 + (tid >> 2) < 0x96f6 (38646);
; otherwise fall through to BB1_4 (s_endpgm).
  252. v_lshrrev_b32_e32 v1, 2, v0
  253. v_sub_u32_e32 v2, vcc, 0x96f6, v1
  254. s_lshl_b32 s0, s6, 6
  255. v_cmp_lt_i32_e32 vcc, s0, v2
  256. s_and_saveexec_b64 s[2:3], vcc
  257. s_cbranch_execz BB1_4
  258. BB1_1:
; Index decomposition via magic-number divisions (multipliers
; 0x28b30361, 0x8fb823ef, 0x55555556, 0x487ede05, 0x6c880903 — the
; same divisor family as kernel0; 0x55555556 is divide-by-3, and the
; explicit multiplies by 3, 57, 0xe2, 0xe4, 0xc948, 0x25bd8 give the
; per-dimension strides). Note s_mulk_i32 s6, 0xffc7 is s6 *= -57
; (sign-extended k). The final element offset gets (tid & 3) OR'd in,
; and vcc holds the window test (column*4 < 0xe2 - (tid & 3)) that
; gates the load below. The buffer pointer arrives in s[0:1]
; (kernarg+0x0); v2 = 0.0 is the default store value.
  259. v_add_u32_e32 v1, vcc, s0, v1
  260. v_mov_b32_e32 v2, 0x28b30361
  261. v_mul_hi_i32 v2, v1, v2
  262. v_mov_b32_e32 v4, 0x8fb823ef
  263. v_mul_hi_i32 v6, v1, v4
  264. s_mulk_i32 s6, 0xffc7
  265. v_lshrrev_b32_e32 v3, 31, v2
  266. v_ashrrev_i32_e32 v2, 11, v2
  267. v_add_u32_e32 v2, vcc, v2, v3
  268. v_mov_b32_e32 v3, 0x55555556
  269. v_mul_hi_i32 v3, v2, v3
  270. v_and_b32_e32 v7, 3, v0
  271. s_movk_i32 s2, 0xe2
  272. s_mov_b32 s3, 0xc948
  273. v_lshrrev_b32_e32 v5, 31, v3
  274. v_add_u32_e32 v3, vcc, v3, v5
  275. v_add_u32_e32 v5, vcc, v6, v1
  276. v_lshrrev_b32_e32 v6, 31, v5
  277. v_ashrrev_i32_e32 v5, 5, v5
  278. v_add_u32_e32 v5, vcc, v6, v5
  279. v_mov_b32_e32 v6, 0x487ede05
  280. v_mul_lo_i32 v3, v3, 3
  281. v_mul_hi_i32 v6, v5, v6
  282. s_load_dwordx2 s[0:1], s[4:5], 0x0
  283. v_subrev_u32_e32 v2, vcc, v3, v2
  284. v_lshrrev_b32_e32 v3, 31, v6
  285. v_ashrrev_i32_e32 v6, 6, v6
  286. v_add_u32_e32 v3, vcc, v6, v3
  287. v_add_u32_e32 v6, vcc, s6, v1
  288. v_mul_hi_i32 v4, v6, v4
  289. v_mul_lo_i32 v3, v3, s2
  290. v_mul_lo_i32 v2, v2, s3
  291. s_movk_i32 s3, 0xe4
  292. v_add_u32_e32 v0, vcc, v4, v6
  293. v_lshrrev_b32_e32 v4, 31, v0
  294. v_ashrrev_i32_e32 v0, 5, v0
  295. v_add_u32_e32 v0, vcc, v4, v0
  296. v_mul_lo_i32 v0, v0, 57
  297. v_mov_b32_e32 v4, 0x6c880903
  298. v_mul_hi_i32 v4, v1, v4
  299. v_subrev_u32_e32 v3, vcc, v3, v5
  300. v_subrev_u32_e32 v0, vcc, v0, v6
  301. v_mul_lo_i32 v3, v3, s3
  302. v_lshlrev_b32_e32 v0, 2, v0
  303. v_lshrrev_b32_e32 v6, 31, v4
  304. v_ashrrev_i32_e32 v4, 14, v4
  305. v_add_u32_e32 v4, vcc, v4, v6
  306. v_or_b32_e32 v0, v0, v7
  307. v_mov_b32_e32 v6, 0x25bd8
  308. v_mad_i32_i24 v0, v4, v6, v0
  309. v_add_u32_e32 v0, vcc, v0, v3
  310. v_add_u32_e32 v0, vcc, v2, v0
  311. v_mul_lo_i32 v2, v5, 57
  312. v_sub_u32_e32 v3, vcc, s2, v7
  313. v_subrev_u32_e32 v1, vcc, v2, v1
  314. v_lshlrev_b32_e32 v1, 2, v1
  315. v_cmp_lt_i32_e32 vcc, v1, v3
  316. v_ashrrev_i32_e32 v1, 31, v0
  317. v_lshlrev_b64 v[0:1], 2, v[0:1]
  318. s_waitcnt lgkmcnt(0)
  319. v_mov_b32_e32 v2, s1
  320. v_add_u32_e64 v0, s[0:1], s0, v0
  321. v_addc_u32_e64 v1, s[0:1], v2, v1, s[0:1]
  322. v_mov_b32_e32 v2, 0
  323. s_and_saveexec_b64 s[0:1], vcc
  324. BB1_2:
; Conditional gather: lanes passing the window test load the current
; element; the rest keep v2 = 0.0. (No execz skip here — the load is
; simply predicated by exec.)
  325. flat_load_dword v2, v[0:1]
  326. BB1_3:
; Re-converge and write v2 back to the SAME address v[0:1] — in-place
; zeroing of out-of-window elements (TODO confirm intent).
  327. s_or_b64 exec, exec, s[0:1]
  328. s_waitcnt vmcnt(0) lgkmcnt(0)
  329. flat_store_dword v[0:1], v2
  330. BB1_4:
  331. s_endpgm
  332. .Lfunc_end1:
  333. .size fuse_conv2d_relu_kernel1, .Lfunc_end1-fuse_conv2d_relu_kernel1
  334.  
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel2 — gfx803 (GCN3) HSA kernel, code object v2.
; The main compute stage: an LDS-tiled multiply-accumulate loop
; followed by a bias-add and ReLU (the conv2d+relu of the symbol name).
;
; Kernargs (32 bytes, s[4:5] = kernarg pointer):
;   +0x00 -> s[2:3]   : buffer preloaded once into LDS by lanes 0..26
;                       (weights or activations — TODO confirm which).
;   +0x08 -> s[12:13] : buffer streamed through LDS each outer
;                       iteration (BB2_4).
;   +0x10 -> s[8:9]   : output buffer (all flat_store results).
;   +0x18 -> s[10:11] : per-output vector loaded with flat_load_dwordx4
;                       and ADDED to the accumulators before the ReLU
;                       (bias, presumably — TODO confirm).
; s6 = workgroup id X, v0 = work-item id X, 64-lane waves
; (wavefront_size = 6 is log2). LDS = 5376 bytes per workgroup.
; Each work-item produces 16 outputs, accumulated in v0..v15.
;-----------------------------------------------------------------------
  335. .globl fuse_conv2d_relu_kernel2
  336. .p2align 8
  337. .type fuse_conv2d_relu_kernel2,@function
  338. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel2
  339. fuse_conv2d_relu_kernel2:
; Kernel descriptor — note workgroup_group_segment_byte_size = 5376
; (the LDS tile) and the larger register budget (34 VGPRs, 21 SGPRs).
  340. .amd_kernel_code_t
  341. amd_code_version_major = 1
  342. amd_code_version_minor = 1
  343. amd_machine_kind = 1
  344. amd_machine_version_major = 8
  345. amd_machine_version_minor = 0
  346. amd_machine_version_stepping = 3
  347. kernel_code_entry_byte_offset = 256
  348. kernel_code_prefetch_byte_size = 0
  349. max_scratch_backing_memory_byte_size = 0
  350. granulated_workitem_vgpr_count = 8
  351. granulated_wavefront_sgpr_count = 2
  352. priority = 0
  353. float_mode = 192
  354. priv = 0
  355. enable_dx10_clamp = 1
  356. debug_mode = 0
  357. enable_ieee_mode = 1
  358. enable_sgpr_private_segment_wave_byte_offset = 0
  359. user_sgpr_count = 6
  360. enable_trap_handler = 1
  361. enable_sgpr_workgroup_id_x = 1
  362. enable_sgpr_workgroup_id_y = 0
  363. enable_sgpr_workgroup_id_z = 0
  364. enable_sgpr_workgroup_info = 0
  365. enable_vgpr_workitem_id = 0
  366. enable_exception_msb = 0
  367. granulated_lds_size = 0
  368. enable_exception = 0
  369. enable_sgpr_private_segment_buffer = 1
  370. enable_sgpr_dispatch_ptr = 0
  371. enable_sgpr_queue_ptr = 0
  372. enable_sgpr_kernarg_segment_ptr = 1
  373. enable_sgpr_dispatch_id = 0
  374. enable_sgpr_flat_scratch_init = 0
  375. enable_sgpr_private_segment_size = 0
  376. enable_sgpr_grid_workgroup_count_x = 0
  377. enable_sgpr_grid_workgroup_count_y = 0
  378. enable_sgpr_grid_workgroup_count_z = 0
  379. enable_ordered_append_gds = 0
  380. private_element_size = 1
  381. is_ptr64 = 1
  382. is_dynamic_callstack = 0
  383. is_debug_enabled = 0
  384. is_xnack_enabled = 0
  385. workitem_private_segment_byte_size = 0
  386. workgroup_group_segment_byte_size = 5376
  387. gds_segment_byte_size = 0
  388. kernarg_segment_byte_size = 32
  389. workgroup_fbarrier_count = 0
  390. wavefront_sgpr_count = 21
  391. workitem_vgpr_count = 34
  392. reserved_vgpr_first = 0
  393. reserved_vgpr_count = 0
  394. reserved_sgpr_first = 0
  395. reserved_sgpr_count = 0
  396. debug_wavefront_private_segment_offset_sgpr = 0
  397. debug_private_segment_buffer_sgpr = 0
  398. kernarg_segment_alignment = 4
  399. group_segment_alignment = 4
  400. private_segment_alignment = 4
  401. wavefront_size = 6
  402. call_convention = -1
  403. runtime_loader_kernel_symbol = 0
  404. .end_amd_kernel_code_t
; Load the three pointer kernargs used everywhere; only lanes with
; tid < 27 execute the one-time LDS preload in BB2_1.
  405. s_load_dwordx2 s[12:13], s[4:5], 0x8
  406. s_load_dwordx2 s[8:9], s[4:5], 0x10
  407. s_load_dwordx2 s[10:11], s[4:5], 0x18
  408. v_cmp_gt_i32_e32 vcc, 27, v0
  409. s_and_saveexec_b64 s[0:1], vcc
  410. s_cbranch_execz BB2_2
  411. BB2_1:
; One-time LDS staging (lanes 0..26): gather 16 dwords from
; kernarg+0x0 at element offsets base, base+27, base+54, ... (27-apart
; strides, biases 0x5e8/0x654/0x57c bytes for the tail) and write them
; into LDS with ds_write2_b32 pairs at offsets 0..0x5e8+. The base
; index mixes a divide-by-9 of tid (0x38e38e39 mul_hi is the unsigned
; magic for /9) with a block-id term (/7 via 0x92492493) — TODO
; confirm the exact tile geometry.
  412. v_mov_b32_e32 v1, 0x92492493
  413. v_mul_hi_i32 v1, s6, v1
  414. v_mov_b32_e32 v2, 0x38e38e39
  415. v_mul_hi_u32 v2, v0, v2
  416. s_load_dwordx2 s[2:3], s[4:5], 0x0
  417. v_add_u32_e32 v1, vcc, s6, v1
  418. v_lshrrev_b32_e32 v3, 31, v1
  419. v_ashrrev_i32_e32 v1, 6, v1
  420. v_add_u32_e32 v1, vcc, v3, v1
  421. v_mul_lo_i32 v1, v1, 48
  422. v_lshrrev_b32_e32 v2, 1, v2
  423. v_mul_u32_u24_e32 v3, 9, v2
  424. s_waitcnt lgkmcnt(0)
  425. v_mov_b32_e32 v4, s3
  426. v_add_u32_e32 v1, vcc, v1, v2
  427. v_mul_lo_i32 v1, v1, 9
  428. v_subrev_u32_e32 v2, vcc, v3, v0
  429. v_mov_b32_e32 v6, s3
  430. s_movk_i32 s4, 0x5e8
  431. v_add_u32_e32 v1, vcc, v1, v2
  432. v_ashrrev_i32_e32 v2, 31, v1
  433. v_lshlrev_b64 v[2:3], 2, v[1:2]
  434. v_add_u32_e32 v2, vcc, s2, v2
  435. v_addc_u32_e32 v3, vcc, v4, v3, vcc
  436. v_add_u32_e32 v12, vcc, 27, v1
  437. v_ashrrev_i32_e32 v13, 31, v12
  438. v_add_u32_e32 v4, vcc, 0xd8, v1
  439. v_add_u32_e32 v5, vcc, 0xbd, v1
  440. v_add_u32_e32 v7, vcc, 0xa2, v1
  441. v_add_u32_e32 v8, vcc, 0x87, v1
  442. v_add_u32_e32 v9, vcc, 0x6c, v1
  443. v_add_u32_e32 v10, vcc, 0x51, v1
  444. v_add_u32_e32 v11, vcc, 54, v1
  445. v_lshlrev_b64 v[12:13], 2, v[12:13]
  446. v_add_u32_e32 v12, vcc, s2, v12
  447. v_addc_u32_e32 v13, vcc, v6, v13, vcc
  448. v_add_u32_e32 v17, vcc, 0xf3, v1
  449. v_ashrrev_i32_e32 v18, 31, v17
  450. v_add_u32_e32 v14, vcc, 0x144, v1
  451. v_add_u32_e32 v15, vcc, 0x129, v1
  452. v_add_u32_e32 v16, vcc, 0x10e, v1
  453. v_lshlrev_b64 v[17:18], 2, v[17:18]
  454. v_add_u32_e32 v19, vcc, s2, v17
  455. v_ashrrev_i32_e32 v17, 31, v16
  456. v_mov_b32_e32 v1, s3
  457. v_addc_u32_e32 v20, vcc, v1, v18, vcc
  458. v_lshlrev_b64 v[16:17], 2, v[16:17]
  459. v_add_u32_e32 v21, vcc, s2, v16
  460. v_ashrrev_i32_e32 v16, 31, v15
  461. v_addc_u32_e32 v22, vcc, v1, v17, vcc
  462. v_lshlrev_b64 v[15:16], 2, v[15:16]
  463. v_add_u32_e32 v17, vcc, s2, v15
  464. v_ashrrev_i32_e32 v15, 31, v14
  465. v_addc_u32_e32 v18, vcc, v1, v16, vcc
  466. v_lshlrev_b64 v[14:15], 2, v[14:15]
  467. v_ashrrev_i32_e32 v6, 31, v5
  468. v_add_u32_e32 v14, vcc, s2, v14
  469. v_addc_u32_e32 v15, vcc, v1, v15, vcc
  470. v_lshlrev_b64 v[5:6], 2, v[5:6]
  471. flat_load_dword v1, v[19:20]
  472. flat_load_dword v19, v[21:22]
  473. flat_load_dword v18, v[17:18]
  474. flat_load_dword v20, v[14:15]
  475. flat_load_dword v21, v[12:13]
  476. v_add_u32_e32 v12, vcc, s2, v5
  477. v_ashrrev_i32_e32 v5, 31, v4
  478. v_mov_b32_e32 v13, s3
  479. v_addc_u32_e32 v13, vcc, v13, v6, vcc
  480. v_lshlrev_b64 v[4:5], 2, v[4:5]
  481. v_mov_b32_e32 v6, s3
  482. v_add_u32_e32 v4, vcc, s2, v4
  483. v_addc_u32_e32 v5, vcc, v6, v5, vcc
  484. v_add_u32_e32 v14, vcc, s4, v2
  485. v_addc_u32_e32 v15, vcc, 0, v3, vcc
  486. s_movk_i32 s4, 0x654
  487. v_add_u32_e32 v16, vcc, s4, v2
  488. flat_load_dword v22, v[12:13]
  489. v_addc_u32_e32 v17, vcc, 0, v3, vcc
  490. flat_load_dword v24, v[4:5]
  491. flat_load_dword v4, v[2:3]
  492. flat_load_dword v16, v[16:17]
  493. flat_load_dword v17, v[14:15]
  494. v_lshlrev_b32_e32 v23, 2, v0
  495. s_mov_b32 m0, -1
  496. v_ashrrev_i32_e32 v12, 31, v11
  497. s_waitcnt vmcnt(2) lgkmcnt(2)
  498. ds_write2_b32 v23, v4, v21 offset1:27
  499. v_lshlrev_b64 v[4:5], 2, v[11:12]
  500. v_ashrrev_i32_e32 v11, 31, v10
  501. v_add_u32_e32 v4, vcc, s2, v4
  502. v_addc_u32_e32 v5, vcc, v6, v5, vcc
  503. v_lshlrev_b64 v[10:11], 2, v[10:11]
  504. v_add_u32_e32 v12, vcc, s2, v10
  505. v_ashrrev_i32_e32 v10, 31, v9
  506. v_addc_u32_e32 v13, vcc, v6, v11, vcc
  507. v_lshlrev_b64 v[9:10], 2, v[9:10]
  508. v_add_u32_e32 v14, vcc, s2, v9
  509. v_ashrrev_i32_e32 v9, 31, v8
  510. v_addc_u32_e32 v15, vcc, v6, v10, vcc
  511. v_lshlrev_b64 v[8:9], 2, v[8:9]
  512. v_add_u32_e32 v10, vcc, s2, v8
  513. v_ashrrev_i32_e32 v8, 31, v7
  514. v_addc_u32_e32 v11, vcc, v6, v9, vcc
  515. v_lshlrev_b64 v[6:7], 2, v[7:8]
  516. v_add_u32_e32 v6, vcc, s2, v6
  517. v_mov_b32_e32 v8, s3
  518. v_addc_u32_e32 v7, vcc, v8, v7, vcc
  519. flat_load_dword v4, v[4:5]
  520. flat_load_dword v5, v[12:13]
  521. flat_load_dword v8, v[14:15]
  522. flat_load_dword v9, v[10:11]
  523. flat_load_dword v6, v[6:7]
  524. s_movk_i32 s2, 0x57c
  525. s_waitcnt vmcnt(3) lgkmcnt(3)
  526. ds_write2_b32 v23, v4, v5 offset0:54 offset1:81
  527. s_waitcnt vmcnt(1) lgkmcnt(2)
  528. ds_write2_b32 v23, v8, v9 offset0:108 offset1:135
  529. s_waitcnt vmcnt(0) lgkmcnt(2)
  530. ds_write2_b32 v23, v6, v22 offset0:162 offset1:189
  531. ds_write2_b32 v23, v24, v1 offset0:216 offset1:243
  532. v_add_u32_e32 v1, vcc, 0x438, v23
  533. ds_write2_b32 v1, v19, v18 offset1:27
  534. v_add_u32_e32 v1, vcc, s2, v2
  535. v_addc_u32_e32 v2, vcc, 0, v3, vcc
  536. flat_load_dword v1, v[1:2]
  537. v_add_u32_e32 v2, vcc, 0x510, v23
  538. s_waitcnt vmcnt(0) lgkmcnt(0)
  539. ds_write2_b32 v2, v20, v1 offset1:27
  540. v_add_u32_e32 v1, vcc, 0x5e8, v23
  541. ds_write2_b32 v1, v17, v16 offset1:27
  542. BB2_2:
; All lanes: compute per-lane output coordinates (magic-number
; divisions — 0x1f7047dd, 0x92492493, 0x24924925 — plus strides 0x70,
; 0xe0, 57), build the predicate s[0:1] that gates the per-iteration
; global->LDS copy in BB2_4, set v21/v22 = this lane's LDS write/read
; bases (0x6c0 region), zero the 16 accumulators v0..v15, and init
; the outer loop: s[2:3] = 0 (counts to 3), s4 = 0 (LDS read offset,
; += 36 per outer iteration).
  543. s_or_b64 exec, exec, s[0:1]
  544. v_mov_b32_e32 v1, 0x1f7047dd
  545. v_mul_hi_u32 v1, v0, v1
  546. v_mov_b32_e32 v2, 0xe4
  547. v_cmp_lt_i32_e64 s[0:1], v0, v2
  548. s_movk_i32 s2, 0x70
  549. v_subrev_u32_e32 v2, vcc, v1, v0
  550. v_lshrrev_b32_e32 v2, 1, v2
  551. v_add_u32_e32 v1, vcc, v1, v2
  552. v_mov_b32_e32 v2, 0x92492493
  553. v_mul_hi_i32 v2, s6, v2
  554. v_mov_b32_e32 v5, 0x24924925
  555. v_lshrrev_b32_e32 v1, 5, v1
  556. v_sub_u32_e32 v3, vcc, 0xe2, v1
  557. v_add_u32_e32 v2, vcc, s6, v2
  558. v_lshrrev_b32_e32 v4, 31, v2
  559. v_ashrrev_i32_e32 v2, 6, v2
  560. v_add_u32_e32 v17, vcc, v4, v2
  561. v_mul_lo_i32 v2, v17, s2
  562. v_lshrrev_b32_e32 v4, 5, v0
  563. v_mul_hi_u32 v18, v4, v5
  564. s_movk_i32 s4, 0x6c0
  565. v_sub_u32_e32 v19, vcc, s6, v2
  566. v_lshlrev_b32_e32 v2, 1, v19
  567. v_cmp_lt_i32_e64 s[2:3], v2, v3
  568. v_mul_u32_u24_e32 v3, 0xe0, v18
  569. v_subrev_u32_e32 v16, vcc, v3, v0
  570. v_mul_u32_u24_e32 v3, 57, v1
  571. v_add_u32_e32 v1, vcc, v2, v1
  572. v_mul_lo_i32 v1, v1, 57
  573. v_subrev_u32_e32 v2, vcc, v3, v0
  574. s_and_b64 s[0:1], s[0:1], s[2:3]
  575. v_mov_b32_e32 v12, 0
  576. v_add_u32_e32 v20, vcc, v1, v2
  577. v_lshlrev_b32_e32 v1, 4, v0
  578. v_add_u32_e32 v21, vcc, s4, v1
  579. v_lshlrev_b32_e32 v1, 2, v18
  580. v_add_u32_e32 v0, vcc, v1, v0
  581. v_lshlrev_b32_e32 v0, 2, v0
  582. v_add_u32_e32 v22, vcc, s4, v0
  583. s_mov_b64 s[2:3], 0
  584. s_mov_b32 s4, 0
; Zero the 16 fp32 accumulators (one per output this lane produces).
  585. v_mov_b32_e32 v13, 0
  586. v_mov_b32_e32 v14, 0
  587. v_mov_b32_e32 v15, 0
  588. v_mov_b32_e32 v11, 0
  589. v_mov_b32_e32 v10, 0
  590. v_mov_b32_e32 v9, 0
  591. v_mov_b32_e32 v8, 0
  592. v_mov_b32_e32 v7, 0
  593. v_mov_b32_e32 v6, 0
  594. v_mov_b32_e32 v5, 0
  595. v_mov_b32_e32 v4, 0
  596. v_mov_b32_e32 v3, 0
  597. v_mov_b32_e32 v2, 0
  598. v_mov_b32_e32 v1, 0
  599. v_mov_b32_e32 v0, 0
  600. BB2_3:
; Outer loop head (3 iterations): barrier so the whole workgroup sees
; a consistent LDS tile before the next refill.
  601. s_waitcnt lgkmcnt(0)
  602. s_barrier
  603. s_and_saveexec_b64 s[6:7], s[0:1]
  604. s_cbranch_execz BB2_5
  605. BB2_4:
; Predicated lanes: stream 4 dwords (flat_load_dwordx4) from
; kernarg+0x8 at element offset (v20 + s2*0x3252)*4 and write them
; into LDS at this lane's base v21 (ds_write2_b64).
  606. s_mul_i32 s5, s2, 0x3252
  607. v_add_u32_e32 v23, vcc, s5, v20
  608. v_lshlrev_b32_e32 v23, 2, v23
  609. v_ashrrev_i32_e32 v24, 31, v23
  610. v_lshlrev_b64 v[23:24], 2, v[23:24]
  611. v_mov_b32_e32 v25, s13
  612. v_add_u32_e32 v23, vcc, s12, v23
  613. v_addc_u32_e32 v24, vcc, v25, v24, vcc
  614. flat_load_dwordx4 v[23:26], v[23:24]
  615. s_mov_b32 m0, -1
  616. s_waitcnt vmcnt(0) lgkmcnt(0)
  617. ds_write2_b64 v21, v[23:24], v[25:26] offset1:1
  618. BB2_5:
; Re-converge, barrier (LDS refill visible to all), then run the
; inner MAC loop. s5 counts 0 -> 36 in steps of 12 (3 iterations).
  619. s_or_b64 exec, exec, s[6:7]
  620. s_waitcnt lgkmcnt(0)
  621. s_barrier
  622. s_mov_b32 s5, 0
  623. v_mov_b32_e32 v23, v22
  624. BB2_6:
; Inner loop body: v[30:31]/v33 are three activation values read from
; LDS at v23; the ds_read2_b32 pairs at 27-apart offsets from v32
; (s4+s5 and the 0x438/0x4a4/0x510/0x57c/0x5e8/0x654 banks) are the
; tap values; each of the 16 accumulators gets three v_mac_f32 MACs
; per iteration. The interleaved s_waitcnt lgkmcnt(N) throttles use of
; each ds_read result as it lands.
  625. s_add_i32 s6, s4, s5
  626. s_mov_b32 m0, -1
  627. v_mov_b32_e32 v32, s6
  628. ds_read2_b32 v[24:25], v32 offset1:1
  629. ds_read_b32 v33, v23 offset:8
  630. ds_read2_b32 v[26:27], v32 offset0:27 offset1:28
  631. ds_read2_b32 v[28:29], v32 offset0:54 offset1:55
  632. ds_read2_b32 v[30:31], v23 offset1:1
  633. s_add_i32 s7, s6, 0x438
  634. s_add_i32 s14, s6, 0x4a4
  635. s_add_i32 s15, s6, 0x510
  636. s_add_i32 s16, s6, 0x57c
  637. s_waitcnt lgkmcnt(0)
  638. v_mac_f32_e32 v12, v30, v24
  639. v_mac_f32_e32 v13, v30, v26
  640. v_mac_f32_e32 v14, v30, v28
  641. v_mac_f32_e32 v12, v31, v25
  642. v_mac_f32_e32 v13, v31, v27
  643. v_mac_f32_e32 v14, v31, v29
  644. ds_read2_b32 v[24:25], v32 offset0:81 offset1:82
  645. ds_read2_b32 v[26:27], v32 offset0:108 offset1:109
  646. ds_read2_b32 v[28:29], v32 offset0:135 offset1:136
  647. s_add_i32 s17, s6, 0x5e8
  648. s_add_i32 s18, s6, 0x654
  649. s_waitcnt lgkmcnt(2)
  650. v_mac_f32_e32 v15, v30, v24
  651. s_waitcnt lgkmcnt(1)
  652. v_mac_f32_e32 v11, v30, v26
  653. s_waitcnt lgkmcnt(0)
  654. v_mac_f32_e32 v10, v30, v28
  655. v_mac_f32_e32 v15, v31, v25
  656. v_mac_f32_e32 v11, v31, v27
  657. v_mac_f32_e32 v10, v31, v29
  658. ds_read2_b32 v[24:25], v32 offset0:162 offset1:163
  659. ds_read2_b32 v[26:27], v32 offset0:189 offset1:190
  660. ds_read2_b32 v[28:29], v32 offset0:216 offset1:217
; Inner loop bookkeeping: s5 += 12; loop while s5 != 36 (SCC is
; consumed by the s_cbranch_scc1 far below at the loop tail).
  661. s_add_i32 s5, s5, 12
  662. s_cmp_lg_u32 s5, 36
  663. s_waitcnt lgkmcnt(2)
  664. v_mac_f32_e32 v9, v30, v24
  665. s_waitcnt lgkmcnt(1)
  666. v_mac_f32_e32 v8, v30, v26
  667. s_waitcnt lgkmcnt(0)
  668. v_mac_f32_e32 v7, v30, v28
  669. v_mov_b32_e32 v24, s7
  670. v_mov_b32_e32 v26, s14
  671. v_mac_f32_e32 v9, v31, v25
  672. v_mac_f32_e32 v8, v31, v27
  673. v_mac_f32_e32 v7, v31, v29
  674. ds_read2_b32 v[24:25], v24 offset1:1
  675. ds_read2_b32 v[26:27], v26 offset1:1
  676. ds_read2_b32 v[28:29], v32 offset0:243 offset1:244
  677. v_add_u32_e32 v23, vcc, 0x390, v23
  678. s_waitcnt lgkmcnt(2)
  679. v_mac_f32_e32 v5, v30, v24
  680. s_waitcnt lgkmcnt(1)
  681. v_mac_f32_e32 v4, v30, v26
  682. v_mov_b32_e32 v24, s15
  683. v_mac_f32_e32 v5, v31, v25
  684. v_mac_f32_e32 v4, v31, v27
  685. v_mov_b32_e32 v26, s16
  686. ds_read2_b32 v[24:25], v24 offset1:1
  687. s_waitcnt lgkmcnt(1)
  688. v_mac_f32_e32 v6, v30, v28
  689. v_mov_b32_e32 v28, s17
  690. v_mac_f32_e32 v6, v31, v29
  691. ds_read2_b32 v[26:27], v26 offset1:1
  692. ds_read2_b32 v[28:29], v28 offset1:1
  693. s_waitcnt lgkmcnt(2)
  694. v_mac_f32_e32 v3, v30, v24
  695. v_mov_b32_e32 v24, s18
  696. v_mac_f32_e32 v3, v31, v25
  697. ds_read2_b32 v[24:25], v24 offset1:1
  698. s_waitcnt lgkmcnt(2)
  699. v_mac_f32_e32 v2, v30, v26
  700. v_mac_f32_e32 v2, v31, v27
  701. s_waitcnt lgkmcnt(1)
  702. v_mac_f32_e32 v1, v30, v28
  703. v_mac_f32_e32 v1, v31, v29
  704. s_waitcnt lgkmcnt(0)
  705. v_mac_f32_e32 v0, v30, v24
  706. v_mac_f32_e32 v0, v31, v25
; Third-tap pass: multiply v33 (the offset:8 activation) against the
; remaining LDS columns for all 16 accumulators.
  707. ds_read2_b32 v[24:25], v32 offset0:2 offset1:29
  708. s_waitcnt lgkmcnt(0)
  709. v_mac_f32_e32 v12, v33, v24
  710. v_mac_f32_e32 v13, v33, v25
  711. ds_read2_b32 v[24:25], v32 offset0:56 offset1:83
  712. s_waitcnt lgkmcnt(0)
  713. v_mac_f32_e32 v14, v33, v24
  714. v_mac_f32_e32 v15, v33, v25
  715. ds_read2_b32 v[24:25], v32 offset0:110 offset1:137
  716. s_waitcnt lgkmcnt(0)
  717. v_mac_f32_e32 v11, v33, v24
  718. v_mac_f32_e32 v10, v33, v25
  719. ds_read2_b32 v[24:25], v32 offset0:164 offset1:191
  720. s_waitcnt lgkmcnt(0)
  721. v_mac_f32_e32 v9, v33, v24
  722. v_mac_f32_e32 v8, v33, v25
  723. ds_read2_b32 v[24:25], v32 offset0:218 offset1:245
  724. s_waitcnt lgkmcnt(0)
  725. v_mac_f32_e32 v7, v33, v24
  726. v_add_u32_e32 v24, vcc, 0x440, v32
  727. v_mac_f32_e32 v6, v33, v25
  728. ds_read2_b32 v[24:25], v24 offset1:27
  729. s_waitcnt lgkmcnt(0)
  730. v_mac_f32_e32 v5, v33, v24
  731. v_add_u32_e32 v24, vcc, 0x518, v32
  732. v_mac_f32_e32 v4, v33, v25
  733. ds_read2_b32 v[24:25], v24 offset1:27
  734. s_waitcnt lgkmcnt(0)
  735. v_mac_f32_e32 v3, v33, v24
  736. v_add_u32_e32 v24, vcc, 0x5f0, v32
  737. v_mac_f32_e32 v2, v33, v25
  738. ds_read2_b32 v[24:25], v24 offset1:27
  739. s_waitcnt lgkmcnt(0)
  740. v_mac_f32_e32 v1, v33, v24
  741. v_mac_f32_e32 v0, v33, v25
; Loop tails: inner branch consumes the SCC set back at "s_cmp_lg_u32
; s5, 36"; then the outer counter s[2:3] is advanced (3 iterations,
; LDS read base s4 += 36) before looping to BB2_3.
  742. s_cbranch_scc1 BB2_6
  743. s_add_u32 s2, s2, 1
  744. s_addc_u32 s3, s3, 0
  745. s_add_i32 s4, s4, 36
  746. s_cmp_eq_u64 s[2:3], 3
  747. s_cbranch_scc0 BB2_3
; Epilogue: for each of the 16 accumulators, load a dwordx4 term from
; kernarg+0x18 (bias, presumably), add it (v_add_f32), clamp with
; v_max_f32 vN, 0, x — the ReLU — and flat_store the result to the
; output buffer (kernarg+0x10) at plane offsets 0xc400, 0x18800,
; 0x24c00, ... elements (and late byte offsets 0x27d000/0x2ae000/
; 0x2df000 from v[24:25]). Strides 0x700/0xe0 rebuild the per-lane
; output coordinate computed in BB2_2.
  748. s_movk_i32 s0, 0x700
  749. v_mul_lo_i32 v20, v17, s0
  750. s_movk_i32 s0, 0xe0
  751. v_mov_b32_e32 v22, s11
  752. v_mov_b32_e32 v23, s11
  753. v_add_u32_e32 v19, vcc, v19, v20
  754. v_lshlrev_b32_e32 v20, 4, v17
  755. v_lshlrev_b32_e32 v19, 1, v19
  756. v_ashrrev_i32_e32 v21, 31, v20
  757. v_add_u32_e32 v19, vcc, v18, v19
  758. v_lshlrev_b64 v[17:18], 2, v[20:21]
  759. v_add_u32_e32 v17, vcc, s10, v17
  760. v_mul_lo_i32 v19, v19, s0
  761. v_or_b32_e32 v21, 4, v20
  762. v_addc_u32_e32 v18, vcc, v22, v18, vcc
  763. v_ashrrev_i32_e32 v22, 31, v21
  764. v_lshlrev_b64 v[21:22], 2, v[21:22]
  765. v_add_u32_e32 v21, vcc, s10, v21
  766. v_addc_u32_e32 v22, vcc, v23, v22, vcc
  767. v_add_u32_e32 v23, vcc, v19, v16
  768. v_ashrrev_i32_e32 v24, 31, v23
  769. v_lshlrev_b64 v[24:25], 2, v[23:24]
  770. v_mov_b32_e32 v16, s9
  771. v_add_u32_e32 v24, vcc, s8, v24
  772. v_addc_u32_e32 v25, vcc, v16, v25, vcc
  773. v_add_u32_e32 v26, vcc, 0xc400, v23
  774. v_ashrrev_i32_e32 v27, 31, v26
  775. v_lshlrev_b64 v[26:27], 2, v[26:27]
  776. v_add_u32_e32 v26, vcc, s8, v26
  777. v_addc_u32_e32 v27, vcc, v16, v27, vcc
  778. flat_load_dwordx4 v[16:19], v[17:18]
  779. v_mov_b32_e32 v28, s9
  780. v_mov_b32_e32 v29, s9
  781. s_mov_b32 s0, 0x27d000
  782. s_waitcnt vmcnt(0) lgkmcnt(0)
  783. v_add_f32_e32 v30, v12, v16
  784. v_add_u32_e32 v12, vcc, 0x18800, v23
  785. v_add_f32_e32 v31, v13, v17
  786. v_ashrrev_i32_e32 v13, 31, v12
  787. v_lshlrev_b64 v[12:13], 2, v[12:13]
  788. v_add_u32_e32 v16, vcc, s8, v12
  789. v_addc_u32_e32 v17, vcc, v28, v13, vcc
  790. v_add_u32_e32 v12, vcc, 0x24c00, v23
  791. v_ashrrev_i32_e32 v13, 31, v12
  792. v_lshlrev_b64 v[12:13], 2, v[12:13]
  793. v_add_f32_e32 v32, v14, v18
  794. v_add_u32_e32 v18, vcc, s8, v12
  795. v_add_f32_e32 v33, v15, v19
  796. v_addc_u32_e32 v19, vcc, v29, v13, vcc
  797. v_add_u32_e32 v12, vcc, 0x31000, v23
  798. v_ashrrev_i32_e32 v13, 31, v12
  799. v_lshlrev_b64 v[12:13], 2, v[12:13]
  800. v_add_u32_e32 v28, vcc, s8, v12
  801. v_mov_b32_e32 v14, s9
  802. v_addc_u32_e32 v29, vcc, v14, v13, vcc
; ReLU: max(0, x) on each biased accumulator before its store.
  803. v_max_f32_e32 v12, 0, v30
  804. v_max_f32_e32 v13, 0, v31
  805. flat_store_dword v[24:25], v12
  806. flat_store_dword v[26:27], v13
  807. flat_load_dwordx4 v[12:15], v[21:22]
  808. v_max_f32_e32 v21, 0, v33
  809. s_waitcnt vmcnt(0) lgkmcnt(0)
  810. v_add_f32_e32 v11, v11, v12
  811. v_max_f32_e32 v12, 0, v32
  812. v_max_f32_e32 v11, 0, v11
  813. flat_store_dword v[16:17], v12
  814. flat_store_dword v[18:19], v21
  815. flat_store_dword v[28:29], v11
  816. v_add_u32_e32 v11, vcc, 0x3d400, v23
  817. v_ashrrev_i32_e32 v12, 31, v11
  818. v_lshlrev_b64 v[11:12], 2, v[11:12]
  819. v_add_f32_e32 v10, v10, v13
  820. v_mov_b32_e32 v16, s9
  821. v_add_u32_e32 v11, vcc, s8, v11
  822. v_addc_u32_e32 v12, vcc, v16, v12, vcc
  823. v_max_f32_e32 v10, 0, v10
  824. flat_store_dword v[11:12], v10
  825. v_add_f32_e32 v12, v8, v15
  826. v_add_u32_e32 v8, vcc, 0x49800, v23
  827. v_add_f32_e32 v11, v9, v14
  828. v_ashrrev_i32_e32 v9, 31, v8
  829. v_lshlrev_b64 v[8:9], 2, v[8:9]
  830. v_mov_b32_e32 v10, s9
  831. v_add_u32_e32 v8, vcc, s8, v8
  832. v_addc_u32_e32 v9, vcc, v10, v9, vcc
  833. v_max_f32_e32 v11, 0, v11
  834. v_add_u32_e32 v10, vcc, 0x55c00, v23
  835. flat_store_dword v[8:9], v11
  836. v_ashrrev_i32_e32 v11, 31, v10
  837. v_lshlrev_b64 v[8:9], 2, v[10:11]
  838. v_mov_b32_e32 v10, s9
  839. v_add_u32_e32 v8, vcc, s8, v8
  840. v_addc_u32_e32 v9, vcc, v10, v9, vcc
  841. v_max_f32_e32 v12, 0, v12
  842. flat_store_dword v[8:9], v12
  843. v_or_b32_e32 v8, 8, v20
  844. v_ashrrev_i32_e32 v9, 31, v8
  845. v_lshlrev_b64 v[8:9], 2, v[8:9]
  846. v_mov_b32_e32 v10, s11
  847. v_add_u32_e32 v12, vcc, s10, v8
  848. v_addc_u32_e32 v13, vcc, v10, v9, vcc
  849. v_add_u32_e32 v8, vcc, 0x62000, v23
  850. v_ashrrev_i32_e32 v9, 31, v8
  851. v_lshlrev_b64 v[8:9], 2, v[8:9]
  852. v_add_u32_e32 v16, vcc, s8, v8
  853. v_mov_b32_e32 v10, s9
  854. v_or_b32_e32 v8, 12, v20
  855. v_addc_u32_e32 v17, vcc, v10, v9, vcc
  856. v_ashrrev_i32_e32 v9, 31, v8
  857. v_lshlrev_b64 v[8:9], 2, v[8:9]
  858. v_mov_b32_e32 v10, s11
  859. v_add_u32_e32 v8, vcc, s10, v8
  860. v_addc_u32_e32 v9, vcc, v10, v9, vcc
  861. flat_load_dwordx4 v[8:11], v[8:9]
  862. flat_load_dwordx4 v[12:15], v[12:13]
  863. s_waitcnt vmcnt(1) lgkmcnt(1)
  864. v_add_f32_e32 v3, v3, v8
  865. s_waitcnt vmcnt(0) lgkmcnt(0)
  866. v_add_f32_e32 v7, v7, v12
  867. v_max_f32_e32 v7, 0, v7
  868. flat_store_dword v[16:17], v7
  869. v_add_u32_e32 v16, vcc, 0x6e400, v23
  870. v_ashrrev_i32_e32 v17, 31, v16
  871. v_lshlrev_b64 v[16:17], 2, v[16:17]
  872. v_add_f32_e32 v6, v6, v13
  873. v_mov_b32_e32 v7, s9
  874. v_add_u32_e32 v16, vcc, s8, v16
  875. v_addc_u32_e32 v17, vcc, v7, v17, vcc
  876. v_max_f32_e32 v6, 0, v6
  877. flat_store_dword v[16:17], v6
  878. v_add_u32_e32 v6, vcc, 0x7a800, v23
  879. v_ashrrev_i32_e32 v7, 31, v6
  880. v_lshlrev_b64 v[6:7], 2, v[6:7]
  881. v_add_f32_e32 v5, v5, v14
  882. v_mov_b32_e32 v12, s9
  883. v_add_u32_e32 v6, vcc, s8, v6
  884. v_addc_u32_e32 v7, vcc, v12, v7, vcc
  885. v_max_f32_e32 v5, 0, v5
  886. flat_store_dword v[6:7], v5
  887. v_add_u32_e32 v6, vcc, 0x86c00, v23
  888. v_ashrrev_i32_e32 v7, 31, v6
  889. v_add_u32_e32 v5, vcc, 0x93000, v23
  890. v_lshlrev_b64 v[6:7], 2, v[6:7]
  891. v_add_f32_e32 v4, v4, v15
  892. v_add_u32_e32 v6, vcc, s8, v6
  893. v_max_f32_e32 v4, 0, v4
  894. v_addc_u32_e32 v7, vcc, v12, v7, vcc
  895. flat_store_dword v[6:7], v4
  896. v_ashrrev_i32_e32 v6, 31, v5
  897. v_max_f32_e32 v7, 0, v3
  898. v_lshlrev_b64 v[3:4], 2, v[5:6]
  899. v_mov_b32_e32 v5, s9
  900. v_add_u32_e32 v3, vcc, s8, v3
  901. v_addc_u32_e32 v4, vcc, v5, v4, vcc
  902. v_add_f32_e32 v2, v2, v9
  903. flat_store_dword v[3:4], v7
  904. v_max_f32_e32 v4, 0, v2
  905. v_add_u32_e32 v2, vcc, s0, v24
  906. v_addc_u32_e32 v3, vcc, 0, v25, vcc
  907. v_add_f32_e32 v1, v1, v10
  908. s_mov_b32 s0, 0x2ae000
  909. flat_store_dword v[2:3], v4
  910. v_max_f32_e32 v3, 0, v1
  911. v_add_u32_e32 v1, vcc, s0, v24
  912. v_addc_u32_e32 v2, vcc, 0, v25, vcc
  913. v_add_f32_e32 v0, v0, v11
  914. s_mov_b32 s0, 0x2df000
  915. flat_store_dword v[1:2], v3
  916. v_max_f32_e32 v2, 0, v0
  917. v_add_u32_e32 v0, vcc, s0, v24
  918. v_addc_u32_e32 v1, vcc, 0, v25, vcc
  919. flat_store_dword v[0:1], v2
  920. s_endpgm
  921. .Lfunc_end2:
  922. .size fuse_conv2d_relu_kernel2, .Lfunc_end2-fuse_conv2d_relu_kernel2
Add Comment
Please, Sign In to add comment