Guest User

Untitled

a guest
Jul 21st, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 59.40 KB | None | 0 0
  ; Module preamble: selects the code section and declares the HSA code-object
  ; format version and the target ISA for everything that follows in this file.
  1. .text
  2. .hsa_code_object_version 2,1  ; code object format version 2.1
  3. .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"  ; ISA 8.0.3, vendor "AMD", architecture "AMDGPU"
  ; ---------------------------------------------------------------------------
  ; fuse_conv2d_relu_kernel0
  ;
  ; Notation used in the comments below:
  ;   wg  = workgroup id X  (arrives in s6)
  ;   tid = work-item id X  (arrives in v0)
  ;   idx = wg*256 + tid
  ;
  ; What the code does: a work-item proceeds only if
  ;   wg*256 < 0x25bd8 - tid      (0x25bd8 = 154584)   and
  ;   col < 226, where col = (wg*28 + tid) % 228  (congruent to idx mod 228).
  ; It then splits idx into
  ;   col  = idx % 228
  ;   row  = (idx / 228) % 226
  ;   chan = (idx / 51528) % 3
  ;   img  = idx / 154584
  ; and stores one dword at element index
  ;   img*154584 + chan*51528 + row*228 + col
  ; of the buffer whose 64-bit base is read from kernarg offset 0x0.
  ; The stored value is 0 unless 1 <= col <= 224 and 1 <= row <= 224, in which
  ; case it is the dword at element index
  ;   img*150528 + chan*50176 + row*224 + col - 225
  ; of the buffer whose 64-bit base is read from kernarg offset 0x8.
  ;
  ; NOTE(review): this looks like a zero-padding copy of 3x224x224 data into
  ; 226 rows of stride 228 -- inferred from the constants only; confirm against
  ; whatever generated this kernel. col/row/chan/img are the reviewer's labels.
  ;
  ; Initial registers, as implied by the enable_* flags in the descriptor:
  ;   s[0:3] private segment buffer (overwritten as scratch, never read)
  ;   s[4:5] kernarg segment pointer
  ;   s6     workgroup id X
  ;   v0     work-item id X
  ;
  ; Every division by a constant is done as: multiply-high by a magic number,
  ; optional add-back of the dividend, arithmetic shift, plus the sign bit.
  ; ---------------------------------------------------------------------------
  4. .globl fuse_conv2d_relu_kernel0
  5. .p2align 8  ; align to 2^8 = 256 bytes
  6. .type fuse_conv2d_relu_kernel0,@function
  7. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel0
  8. fuse_conv2d_relu_kernel0:
  9. .amd_kernel_code_t
  10. amd_code_version_major = 1
  11. amd_code_version_minor = 1
  12. amd_machine_kind = 1
  13. amd_machine_version_major = 8
  14. amd_machine_version_minor = 0
  15. amd_machine_version_stepping = 3
  16. kernel_code_entry_byte_offset = 256  ; first instruction is 256 bytes past the start of this descriptor
  17. kernel_code_prefetch_byte_size = 0
  18. max_scratch_backing_memory_byte_size = 0
  19. granulated_workitem_vgpr_count = 1
  20. granulated_wavefront_sgpr_count = 1
  21. priority = 0
  22. float_mode = 192
  23. priv = 0
  24. enable_dx10_clamp = 1
  25. debug_mode = 0
  26. enable_ieee_mode = 1
  27. enable_sgpr_private_segment_wave_byte_offset = 0
  28. user_sgpr_count = 6  ; 4 (private segment buffer) + 2 (kernarg segment pointer), see flags below
  29. enable_trap_handler = 1
  30. enable_sgpr_workgroup_id_x = 1  ; workgroup id X follows the 6 user SGPRs, i.e. s6
  31. enable_sgpr_workgroup_id_y = 0
  32. enable_sgpr_workgroup_id_z = 0
  33. enable_sgpr_workgroup_info = 0
  34. enable_vgpr_workitem_id = 0  ; only work-item id X is supplied (v0)
  35. enable_exception_msb = 0
  36. granulated_lds_size = 0
  37. enable_exception = 0
  38. enable_sgpr_private_segment_buffer = 1
  39. enable_sgpr_dispatch_ptr = 0
  40. enable_sgpr_queue_ptr = 0
  41. enable_sgpr_kernarg_segment_ptr = 1
  42. enable_sgpr_dispatch_id = 0
  43. enable_sgpr_flat_scratch_init = 0
  44. enable_sgpr_private_segment_size = 0
  45. enable_sgpr_grid_workgroup_count_x = 0
  46. enable_sgpr_grid_workgroup_count_y = 0
  47. enable_sgpr_grid_workgroup_count_z = 0
  48. enable_ordered_append_gds = 0
  49. private_element_size = 1
  50. is_ptr64 = 1
  51. is_dynamic_callstack = 0
  52. is_debug_enabled = 0
  53. is_xnack_enabled = 0
  54. workitem_private_segment_byte_size = 0  ; no per-work-item private (scratch) memory requested
  55. workgroup_group_segment_byte_size = 0  ; no group-segment (LDS) memory requested
  56. gds_segment_byte_size = 0
  57. kernarg_segment_byte_size = 16  ; two 8-byte pointers, loaded from offsets 0x0 and 0x8 below
  58. workgroup_fbarrier_count = 0
  59. wavefront_sgpr_count = 10
  60. workitem_vgpr_count = 7  ; the code uses v0..v6
  61. reserved_vgpr_first = 0
  62. reserved_vgpr_count = 0
  63. reserved_sgpr_first = 0
  64. reserved_sgpr_count = 0
  65. debug_wavefront_private_segment_offset_sgpr = 0
  66. debug_private_segment_buffer_sgpr = 0
  67. kernarg_segment_alignment = 4
  68. group_segment_alignment = 4
  69. private_segment_alignment = 4
  70. wavefront_size = 6  ; log2 encoding: 2^6 = 64 work-items per wavefront
  71. call_convention = -1
  72. runtime_loader_kernel_symbol = 0
  73. .end_amd_kernel_code_t
  ; --- Range check: keep lanes with wg*256 < 154584 - tid --------------------
  74. v_sub_u32_e32 v1, vcc, 0x25bd8, v0  ; v1 = 154584 - tid
  75. s_lshl_b32 s0, s6, 8  ; s0 = wg * 256
  76. v_cmp_lt_i32_e32 vcc, s0, v1  ; signed compare: wg*256 < 154584 - tid
  77. s_and_saveexec_b64 s[0:1], vcc  ; s[0:1] = exec, then exec &= vcc
  78. s_cbranch_execz BB0_5  ; no lane left active -> end of program
  ; --- col = (wg*28 + tid) % 228; keep lanes with col < 226 -------------------
  79. BB0_1:
  80. s_mul_i32 s0, s6, 28  ; s0 = wg*28 (clobbers the exec copy saved above; it is not used again)
  81. v_add_u32_e32 v1, vcc, s0, v0  ; v1 = wg*28 + tid
  82. v_mov_b32_e32 v0, 0x8fb823ef  ; magic multiplier for signed division by 228
  83. v_mul_hi_i32 v0, v1, v0  ; high 32 bits of v1 * magic
  84. s_movk_i32 s0, 0xe4  ; s0 = 228
  85. v_add_u32_e32 v0, vcc, v0, v1  ; add the dividend back (the magic is negative as i32)
  86. v_lshrrev_b32_e32 v2, 31, v0  ; v2 = sign bit of the intermediate
  87. v_ashrrev_i32_e32 v0, 7, v0  ; arithmetic shift right by 7
  88. v_add_u32_e32 v0, vcc, v2, v0  ; v0 = v1 / 228
  89. v_mul_lo_i32 v0, v0, s0  ; v0 = (v1 / 228) * 228
  90. v_mov_b32_e32 v2, 0xe2  ; v2 = 226
  91. v_subrev_u32_e32 v0, vcc, v0, v1  ; v0 = v1 - v0 = col
  92. v_cmp_lt_i32_e32 vcc, v0, v2  ; col < 226 ?
  93. s_and_saveexec_b64 s[2:3], vcc  ; s[2:3] = exec, then exec &= vcc
  94. s_cbranch_execz BB0_5  ; no lane left active -> end of program
  ; --- Decompose idx into row / chan / img and build the "load needed" mask ---
  95. BB0_2:
  96. s_mul_i32 s6, s6, s0  ; s6 = wg * 228
  97. v_add_u32_e32 v1, vcc, s6, v1  ; v1 = wg*228 + wg*28 + tid = idx
  98. v_mov_b32_e32 v2, 0x28b30361  ; magic multiplier for division by 51528
  99. v_mov_b32_e32 v4, 0x8fb823ef  ; magic multiplier for division by 228
  100. v_mul_hi_i32 v4, v1, v4
  101. v_mul_hi_i32 v2, v1, v2
  102. s_movk_i32 s0, 0xe2  ; s0 = 226
  103. s_load_dwordx2 s[6:7], s[4:5], 0x0  ; s[6:7] = pointer at kernarg +0x0 (store base); wg in s6 is no longer needed
  104. v_add_u32_e32 v4, vcc, v4, v1  ; add-back step of the /228 sequence
  105. v_lshrrev_b32_e32 v3, 31, v2
  106. v_ashrrev_i32_e32 v2, 13, v2
  107. v_add_u32_e32 v2, vcc, v2, v3  ; v2 = idx / 51528
  108. v_mov_b32_e32 v3, 0x55555556  ; magic multiplier for division by 3
  109. v_lshrrev_b32_e32 v6, 31, v4
  110. v_ashrrev_i32_e32 v4, 7, v4
  111. v_mul_hi_i32 v3, v2, v3
  112. v_add_u32_e32 v4, vcc, v6, v4  ; v4 = idx / 228
  113. v_mov_b32_e32 v6, 0x487ede05  ; magic multiplier for division by 226
  114. v_mul_hi_i32 v6, v4, v6
  115. v_lshrrev_b32_e32 v5, 31, v3
  116. v_add_u32_e32 v3, vcc, v3, v5  ; v3 = (idx / 51528) / 3
  117. v_mul_lo_i32 v3, v3, 3
  118. v_lshrrev_b32_e32 v5, 31, v6
  119. v_ashrrev_i32_e32 v6, 6, v6
  120. v_add_u32_e32 v5, vcc, v6, v5  ; v5 = (idx / 228) / 226
  121. v_mov_b32_e32 v6, 0x6c880903  ; magic multiplier for division by 154584
  122. v_mul_lo_i32 v5, v5, s0  ; v5 *= 226
  123. v_mul_hi_i32 v6, v1, v6
  124. v_subrev_u32_e32 v3, vcc, v3, v2  ; v3 = chan = (idx / 51528) % 3
  125. v_cmp_lt_i32_e64 s[2:3], 0, v0  ; s[2:3] = (col > 0)
  126. v_subrev_u32_e32 v1, vcc, v5, v4  ; v1 = row = (idx / 228) % 226
  127. v_lshrrev_b32_e32 v2, 31, v6
  128. v_add_u32_sdwa v2, vcc, sext(v6), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD  ; v2 = img = sign-extended high half of v6 (i.e. v6 >> 16) + sign bit = idx / 154584
  129. v_add_u32_e32 v5, vcc, -1, v1  ; v5 = row - 1
  130. v_mov_b32_e32 v6, 0xe0  ; 224
  131. v_mov_b32_e32 v4, 0xe1  ; 225
  132. v_cmp_lt_u32_e32 vcc, v5, v6  ; unsigned (row - 1) < 224, i.e. 1 <= row <= 224
  133. v_cmp_ne_u32_e64 s[0:1], v0, v4  ; col != 225
  134. s_and_b64 s[2:3], s[2:3], vcc
  135. v_mov_b32_e32 v4, 0  ; default store value for lanes that skip the load
  136. s_and_b64 s[2:3], s[0:1], s[2:3]  ; col > 0 && col != 225 && 1 <= row <= 224
  137. s_and_saveexec_b64 s[0:1], s[2:3]  ; s[0:1] = exec, then exec &= load mask
  138. s_cbranch_execz BB0_4  ; nothing to load in this wavefront
  ; --- Load the source dword (only lanes inside the 224x224 interior) ---------
  139. BB0_3:
  140. s_load_dwordx2 s[2:3], s[4:5], 0x8  ; s[2:3] = pointer at kernarg +0x8 (load base)
  141. s_mov_b32 s4, 0xc400  ; 50176 = 224*224 (s4 reused as scratch; last kernarg load already issued)
  142. v_mul_lo_i32 v4, v3, s4  ; chan * 50176
  143. s_mov_b32 s4, 0x24c00  ; 150528 = 3*224*224
  144. v_mul_lo_i32 v5, v2, s4  ; img * 150528
  145. s_movk_i32 s4, 0xe0  ; 224
  146. v_mul_lo_i32 v6, v1, s4  ; row * 224
  147. v_add_u32_e32 v5, vcc, v5, v0
  148. v_add_u32_e32 v5, vcc, v5, v6
  149. v_add_u32_e32 v4, vcc, v4, v5  ; img*150528 + chan*50176 + row*224 + col
  150. v_add_u32_e32 v4, vcc, 0xffffff1f, v4  ; add -225 (one row of 224 plus one column back)
  151. v_ashrrev_i32_e32 v5, 31, v4  ; sign-extend the element index to 64 bits
  152. v_lshlrev_b64 v[4:5], 2, v[4:5]  ; element index -> byte offset (4-byte elements)
  153. s_waitcnt lgkmcnt(0)  ; wait for the outstanding scalar loads
  154. v_mov_b32_e32 v6, s3
  155. v_add_u32_e32 v4, vcc, s2, v4  ; low half of base + offset
  156. v_addc_u32_e32 v5, vcc, v6, v5, vcc  ; high half with carry
  157. flat_load_dword v4, v[4:5]  ; v4 = source dword
  ; --- Store v4 (loaded value or 0) at the recomposed destination index -------
  158. BB0_4:
  159. s_or_b64 exec, exec, s[0:1]  ; re-enable the lanes that skipped the load
  160. s_mov_b32 s0, 0xc948  ; 51528
  161. v_mul_lo_i32 v3, v3, s0  ; chan * 51528
  162. s_mov_b32 s0, 0x25bd8  ; 154584
  163. v_mul_lo_i32 v2, v2, s0  ; img * 154584
  164. s_movk_i32 s0, 0xe4  ; 228
  165. v_mul_lo_i32 v1, v1, s0  ; row * 228
  166. v_add_u32_e32 v0, vcc, v2, v0
  167. s_waitcnt lgkmcnt(0)  ; s[6:7] must be valid before s7 is read (also reached when BB0_3 was skipped)
  168. v_mov_b32_e32 v2, s7
  169. v_add_u32_e32 v0, vcc, v0, v1
  170. v_add_u32_e32 v0, vcc, v3, v0  ; img*154584 + chan*51528 + row*228 + col
  171. v_ashrrev_i32_e32 v1, 31, v0  ; sign-extend to 64 bits
  172. v_lshlrev_b64 v[0:1], 2, v[0:1]  ; element index -> byte offset
  173. v_add_u32_e32 v0, vcc, s6, v0
  174. v_addc_u32_e32 v1, vcc, v2, v1, vcc  ; v[0:1] = store address
  175. s_waitcnt vmcnt(0)  ; wait for the flat_load of v4, if one was issued
  176. flat_store_dword v[0:1], v4
  177. BB0_5:
  178. s_endpgm
  179. .Lfunc_end0:
  180. .size fuse_conv2d_relu_kernel0, .Lfunc_end0-fuse_conv2d_relu_kernel0
  181.  
  ; ---------------------------------------------------------------------------
  ; fuse_conv2d_relu_kernel1
  ;
  ; Notation used in the comments below:
  ;   wg   = workgroup id X  (arrives in s6)
  ;   tid  = work-item id X  (arrives in v0)
  ;   g    = wg*64 + (tid >> 2)
  ;   lane = tid & 3
  ;
  ; What the code does: a work-item proceeds only if
  ;   wg*64 < 0x96f6 - (tid >> 2)      (0x96f6 = 38646).
  ; It addresses one dword of the buffer whose 64-bit base is read from kernarg
  ; offset 0x0, at element index
  ;   (g/38646)*154584 + ((g/12882)%3)*51528 + ((g/57)%226)*228
  ;     + (4*((g - 57*wg) % 57) | lane)
  ; If 4*(g % 57) < 226 - lane the dword is loaded and stored back to the same
  ; address; otherwise 0 is stored there. Load and store use the same address,
  ; so the kernel works in place on that single buffer.
  ;
  ; NOTE(review): with 4*57 = 228 this appears to clear the last two dwords of
  ; every 228-dword row and leave the first 226 untouched -- inferred from the
  ; constants only; confirm against whatever generated this kernel.
  ;
  ; Initial registers, as implied by the enable_* flags in the descriptor:
  ;   s[0:3] private segment buffer (overwritten as scratch, never read)
  ;   s[4:5] kernarg segment pointer
  ;   s6     workgroup id X
  ;   v0     work-item id X
  ; ---------------------------------------------------------------------------
  182. .globl fuse_conv2d_relu_kernel1
  183. .p2align 8  ; align to 2^8 = 256 bytes
  184. .type fuse_conv2d_relu_kernel1,@function
  185. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel1
  186. fuse_conv2d_relu_kernel1:
  187. .amd_kernel_code_t
  188. amd_code_version_major = 1
  189. amd_code_version_minor = 1
  190. amd_machine_kind = 1
  191. amd_machine_version_major = 8
  192. amd_machine_version_minor = 0
  193. amd_machine_version_stepping = 3
  194. kernel_code_entry_byte_offset = 256  ; first instruction is 256 bytes past the start of this descriptor
  195. kernel_code_prefetch_byte_size = 0
  196. max_scratch_backing_memory_byte_size = 0
  197. granulated_workitem_vgpr_count = 1
  198. granulated_wavefront_sgpr_count = 1
  199. priority = 0
  200. float_mode = 192
  201. priv = 0
  202. enable_dx10_clamp = 1
  203. debug_mode = 0
  204. enable_ieee_mode = 1
  205. enable_sgpr_private_segment_wave_byte_offset = 0
  206. user_sgpr_count = 6  ; 4 (private segment buffer) + 2 (kernarg segment pointer), see flags below
  207. enable_trap_handler = 1
  208. enable_sgpr_workgroup_id_x = 1  ; workgroup id X follows the 6 user SGPRs, i.e. s6
  209. enable_sgpr_workgroup_id_y = 0
  210. enable_sgpr_workgroup_id_z = 0
  211. enable_sgpr_workgroup_info = 0
  212. enable_vgpr_workitem_id = 0  ; only work-item id X is supplied (v0)
  213. enable_exception_msb = 0
  214. granulated_lds_size = 0
  215. enable_exception = 0
  216. enable_sgpr_private_segment_buffer = 1
  217. enable_sgpr_dispatch_ptr = 0
  218. enable_sgpr_queue_ptr = 0
  219. enable_sgpr_kernarg_segment_ptr = 1
  220. enable_sgpr_dispatch_id = 0
  221. enable_sgpr_flat_scratch_init = 0
  222. enable_sgpr_private_segment_size = 0
  223. enable_sgpr_grid_workgroup_count_x = 0
  224. enable_sgpr_grid_workgroup_count_y = 0
  225. enable_sgpr_grid_workgroup_count_z = 0
  226. enable_ordered_append_gds = 0
  227. private_element_size = 1
  228. is_ptr64 = 1
  229. is_dynamic_callstack = 0
  230. is_debug_enabled = 0
  231. is_xnack_enabled = 0
  232. workitem_private_segment_byte_size = 0  ; no per-work-item private (scratch) memory requested
  233. workgroup_group_segment_byte_size = 0  ; no group-segment (LDS) memory requested
  234. gds_segment_byte_size = 0
  235. kernarg_segment_byte_size = 8  ; a single 8-byte pointer, loaded from offset 0x0 below
  236. workgroup_fbarrier_count = 0
  237. wavefront_sgpr_count = 9
  238. workitem_vgpr_count = 8  ; the code uses v0..v7
  239. reserved_vgpr_first = 0
  240. reserved_vgpr_count = 0
  241. reserved_sgpr_first = 0
  242. reserved_sgpr_count = 0
  243. debug_wavefront_private_segment_offset_sgpr = 0
  244. debug_private_segment_buffer_sgpr = 0
  245. kernarg_segment_alignment = 4
  246. group_segment_alignment = 4
  247. private_segment_alignment = 4
  248. wavefront_size = 6  ; log2 encoding: 2^6 = 64 work-items per wavefront
  249. call_convention = -1
  250. runtime_loader_kernel_symbol = 0
  251. .end_amd_kernel_code_t
  ; --- Range check: keep lanes with wg*64 < 38646 - (tid >> 2) ----------------
  252. v_lshrrev_b32_e32 v1, 2, v0  ; v1 = tid >> 2
  253. v_sub_u32_e32 v2, vcc, 0x96f6, v1  ; v2 = 38646 - (tid >> 2)
  254. s_lshl_b32 s0, s6, 6  ; s0 = wg * 64
  255. v_cmp_lt_i32_e32 vcc, s0, v2  ; signed compare: wg*64 < 38646 - (tid >> 2)
  256. s_and_saveexec_b64 s[2:3], vcc  ; s[2:3] = exec, then exec &= vcc
  257. s_cbranch_execz BB1_4  ; no lane left active -> end of program
  ; --- Decompose g with magic-number divisions and build the element index ----
  258. BB1_1:
  259. v_add_u32_e32 v1, vcc, s0, v1  ; v1 = g = wg*64 + (tid >> 2)
  260. v_mov_b32_e32 v2, 0x28b30361  ; magic multiplier; with the shift by 11 below it divides by 12882
  261. v_mul_hi_i32 v2, v1, v2
  262. v_mov_b32_e32 v4, 0x8fb823ef  ; magic multiplier; with the shift by 5 below it divides by 57
  263. v_mul_hi_i32 v6, v1, v4
  264. s_mulk_i32 s6, 0xffc7  ; s6 = wg * -57 (16-bit immediate, sign-extended)
  265. v_lshrrev_b32_e32 v3, 31, v2
  266. v_ashrrev_i32_e32 v2, 11, v2
  267. v_add_u32_e32 v2, vcc, v2, v3  ; v2 = g / 12882
  268. v_mov_b32_e32 v3, 0x55555556  ; magic multiplier for division by 3
  269. v_mul_hi_i32 v3, v2, v3
  270. v_and_b32_e32 v7, 3, v0  ; v7 = lane = tid & 3
  271. s_movk_i32 s2, 0xe2  ; s2 = 226 (clobbers the exec copy saved above; it is not used again)
  272. s_mov_b32 s3, 0xc948  ; s3 = 51528
  273. v_lshrrev_b32_e32 v5, 31, v3
  274. v_add_u32_e32 v3, vcc, v3, v5  ; v3 = (g / 12882) / 3
  275. v_add_u32_e32 v5, vcc, v6, v1  ; add-back step of the /57 sequence
  276. v_lshrrev_b32_e32 v6, 31, v5
  277. v_ashrrev_i32_e32 v5, 5, v5
  278. v_add_u32_e32 v5, vcc, v6, v5  ; v5 = g / 57
  279. v_mov_b32_e32 v6, 0x487ede05  ; magic multiplier for division by 226
  280. v_mul_lo_i32 v3, v3, 3
  281. v_mul_hi_i32 v6, v5, v6
  282. s_load_dwordx2 s[0:1], s[4:5], 0x0  ; s[0:1] = pointer at kernarg +0x0 (wg*64 in s0 already consumed)
  283. v_subrev_u32_e32 v2, vcc, v3, v2  ; v2 = (g / 12882) % 3
  284. v_lshrrev_b32_e32 v3, 31, v6
  285. v_ashrrev_i32_e32 v6, 6, v6
  286. v_add_u32_e32 v3, vcc, v6, v3  ; v3 = (g / 57) / 226
  287. v_add_u32_e32 v6, vcc, s6, v1  ; v6 = g - 57*wg
  288. v_mul_hi_i32 v4, v6, v4  ; v4 still holds the /57 magic here
  289. v_mul_lo_i32 v3, v3, s2  ; v3 *= 226
  290. v_mul_lo_i32 v2, v2, s3  ; v2 = ((g / 12882) % 3) * 51528
  291. s_movk_i32 s3, 0xe4  ; s3 = 228
  292. v_add_u32_e32 v0, vcc, v4, v6  ; add-back step (tid in v0 no longer needed; lane is in v7)
  293. v_lshrrev_b32_e32 v4, 31, v0
  294. v_ashrrev_i32_e32 v0, 5, v0
  295. v_add_u32_e32 v0, vcc, v4, v0  ; v0 = (g - 57*wg) / 57
  296. v_mul_lo_i32 v0, v0, 57
  297. v_mov_b32_e32 v4, 0x6c880903  ; magic multiplier; with the shift by 14 below it divides by 38646
  298. v_mul_hi_i32 v4, v1, v4
  299. v_subrev_u32_e32 v3, vcc, v3, v5  ; v3 = (g / 57) % 226
  300. v_subrev_u32_e32 v0, vcc, v0, v6  ; v0 = (g - 57*wg) % 57
  301. v_mul_lo_i32 v3, v3, s3  ; v3 = ((g / 57) % 226) * 228
  302. v_lshlrev_b32_e32 v0, 2, v0  ; v0 *= 4
  303. v_lshrrev_b32_e32 v6, 31, v4
  304. v_ashrrev_i32_e32 v4, 14, v4
  305. v_add_u32_e32 v4, vcc, v4, v6  ; v4 = g / 38646
  306. v_or_b32_e32 v0, v0, v7  ; v0 = 4*((g - 57*wg) % 57) | lane
  307. v_mov_b32_e32 v6, 0x25bd8  ; 154584
  308. v_mad_i32_i24 v0, v4, v6, v0  ; v0 += (g / 38646) * 154584
  309. v_add_u32_e32 v0, vcc, v0, v3
  310. v_add_u32_e32 v0, vcc, v2, v0  ; v0 = final element index
  ; --- Predicate: keep the existing value only where 4*(g % 57) < 226 - lane --
  311. v_mul_lo_i32 v2, v5, 57
  312. v_sub_u32_e32 v3, vcc, s2, v7  ; v3 = 226 - lane
  313. v_subrev_u32_e32 v1, vcc, v2, v1  ; v1 = g % 57
  314. v_lshlrev_b32_e32 v1, 2, v1  ; v1 = 4 * (g % 57)
  315. v_cmp_lt_i32_e32 vcc, v1, v3  ; vcc = lanes that keep their current value
  316. v_ashrrev_i32_e32 v1, 31, v0  ; sign-extend the element index to 64 bits
  317. v_lshlrev_b64 v[0:1], 2, v[0:1]  ; element index -> byte offset (4-byte elements)
  318. s_waitcnt lgkmcnt(0)  ; wait for the s[0:1] kernarg load
  319. v_mov_b32_e32 v2, s1  ; copy the high half before s[0:1] is reused for the carry
  320. v_add_u32_e64 v0, s[0:1], s0, v0  ; carry goes to s[0:1] so that vcc from the compare survives
  321. v_addc_u32_e64 v1, s[0:1], v2, v1, s[0:1]  ; v[0:1] = address
  322. v_mov_b32_e32 v2, 0  ; default store value for lanes that skip the load
  323. s_and_saveexec_b64 s[0:1], vcc  ; s[0:1] = exec, then exec &= vcc (no branch: the load is simply masked)
  324. BB1_2:
  325. flat_load_dword v2, v[0:1]  ; v2 = current contents at the address
  326. BB1_3:
  327. s_or_b64 exec, exec, s[0:1]  ; re-enable the lanes that skipped the load
  328. s_waitcnt vmcnt(0) lgkmcnt(0)  ; wait for the flat_load before storing v2
  329. flat_store_dword v[0:1], v2  ; write back the loaded value, or 0
  330. BB1_4:
  331. s_endpgm
  332. .Lfunc_end1:
  333. .size fuse_conv2d_relu_kernel1, .Lfunc_end1-fuse_conv2d_relu_kernel1
  334.  
  335. .globl fuse_conv2d_relu_kernel2
  336. .p2align 8
  337. .type fuse_conv2d_relu_kernel2,@function
  338. .amdgpu_hsa_kernel fuse_conv2d_relu_kernel2
  339. fuse_conv2d_relu_kernel2:
  340. .amd_kernel_code_t
  341. amd_code_version_major = 1
  342. amd_code_version_minor = 1
  343. amd_machine_kind = 1
  344. amd_machine_version_major = 8
  345. amd_machine_version_minor = 0
  346. amd_machine_version_stepping = 3
  347. kernel_code_entry_byte_offset = 256
  348. kernel_code_prefetch_byte_size = 0
  349. max_scratch_backing_memory_byte_size = 0
  350. granulated_workitem_vgpr_count = 63
  351. granulated_wavefront_sgpr_count = 2
  352. priority = 0
  353. float_mode = 192
  354. priv = 0
  355. enable_dx10_clamp = 1
  356. debug_mode = 0
  357. enable_ieee_mode = 1
  358. enable_sgpr_private_segment_wave_byte_offset = 1
  359. user_sgpr_count = 6
  360. enable_trap_handler = 1
  361. enable_sgpr_workgroup_id_x = 1
  362. enable_sgpr_workgroup_id_y = 0
  363. enable_sgpr_workgroup_id_z = 0
  364. enable_sgpr_workgroup_info = 0
  365. enable_vgpr_workitem_id = 0
  366. enable_exception_msb = 0
  367. granulated_lds_size = 0
  368. enable_exception = 0
  369. enable_sgpr_private_segment_buffer = 1
  370. enable_sgpr_dispatch_ptr = 0
  371. enable_sgpr_queue_ptr = 0
  372. enable_sgpr_kernarg_segment_ptr = 1
  373. enable_sgpr_dispatch_id = 0
  374. enable_sgpr_flat_scratch_init = 0
  375. enable_sgpr_private_segment_size = 0
  376. enable_sgpr_grid_workgroup_count_x = 0
  377. enable_sgpr_grid_workgroup_count_y = 0
  378. enable_sgpr_grid_workgroup_count_z = 0
  379. enable_ordered_append_gds = 0
  380. private_element_size = 1
  381. is_ptr64 = 1
  382. is_dynamic_callstack = 0
  383. is_debug_enabled = 0
  384. is_xnack_enabled = 0
  385. workitem_private_segment_byte_size = 268
  386. workgroup_group_segment_byte_size = 4464
  387. gds_segment_byte_size = 0
  388. kernarg_segment_byte_size = 32
  389. workgroup_fbarrier_count = 0
  390. wavefront_sgpr_count = 19
  391. workitem_vgpr_count = 256
  392. reserved_vgpr_first = 0
  393. reserved_vgpr_count = 0
  394. reserved_sgpr_first = 0
  395. reserved_sgpr_count = 0
  396. debug_wavefront_private_segment_offset_sgpr = 0
  397. debug_private_segment_buffer_sgpr = 0
  398. kernarg_segment_alignment = 4
  399. group_segment_alignment = 4
  400. private_segment_alignment = 4
  401. wavefront_size = 6
  402. call_convention = -1
  403. runtime_loader_kernel_symbol = 0
  404. .end_amd_kernel_code_t
  405. s_mov_b64 s[14:15], s[2:3]
  406. v_mov_b32_e32 v255, v0
  407. s_mov_b32 s16, s7
  408. s_mov_b64 s[12:13], s[0:1]
  409. v_cmp_lt_i32_e32 vcc, 26, v255
  410. v_mov_b32_e32 v1, 0
  411. s_and_saveexec_b64 s[0:1], vcc
  412. s_xor_b64 s[0:1], exec, s[0:1]
  413. BB2_1:
  414. v_mov_b32_e32 v1, 0xab
  415. v_cmp_lt_i32_e32 vcc, v255, v1
  416. v_mov_b32_e32 v4, 0
  417. v_cndmask_b32_e64 v1, 0, -1, vcc
  418. BB2_2:
  419. s_or_saveexec_b64 s[0:1], s[0:1]
  420. s_load_dwordx2 s[8:9], s[4:5], 0x8
  421. s_xor_b64 exec, exec, s[0:1]
  422. s_cbranch_execz BB2_4
  423. BB2_3:
  424. v_mov_b32_e32 v1, 0x92492493
  425. v_mul_hi_i32 v1, s6, v1
  426. v_mov_b32_e32 v2, 0x38e38e39
  427. v_mul_hi_u32 v2, v255, v2
  428. s_load_dwordx2 s[2:3], s[4:5], 0x0
  429. v_add_u32_e32 v1, vcc, s6, v1
  430. v_lshrrev_b32_e32 v3, 31, v1
  431. v_ashrrev_i32_e32 v1, 7, v1
  432. v_add_u32_e32 v1, vcc, v3, v1
  433. v_mul_lo_i32 v1, v1, 48
  434. v_lshrrev_b32_e32 v2, 1, v2
  435. v_mul_u32_u24_e32 v3, 9, v2
  436. s_waitcnt lgkmcnt(0)
  437. v_mov_b32_e32 v5, s3
  438. v_add_u32_e32 v1, vcc, v1, v2
  439. v_mul_lo_i32 v1, v1, 9
  440. v_subrev_u32_e32 v2, vcc, v3, v255
  441. v_mov_b32_e32 v7, s3
  442. s_movk_i32 s7, 0x5e8
  443. v_add_u32_e32 v1, vcc, v1, v2
  444. v_ashrrev_i32_e32 v2, 31, v1
  445. v_lshlrev_b64 v[2:3], 2, v[1:2]
  446. v_add_u32_e32 v2, vcc, s2, v2
  447. v_addc_u32_e32 v3, vcc, v5, v3, vcc
  448. v_add_u32_e32 v13, vcc, 27, v1
  449. v_ashrrev_i32_e32 v14, 31, v13
  450. v_add_u32_e32 v5, vcc, 0xd8, v1
  451. v_add_u32_e32 v6, vcc, 0xbd, v1
  452. v_add_u32_e32 v8, vcc, 0xa2, v1
  453. v_add_u32_e32 v9, vcc, 0x87, v1
  454. v_add_u32_e32 v10, vcc, 0x6c, v1
  455. v_add_u32_e32 v11, vcc, 0x51, v1
  456. v_add_u32_e32 v12, vcc, 54, v1
  457. v_lshlrev_b64 v[13:14], 2, v[13:14]
  458. v_add_u32_e32 v13, vcc, s2, v13
  459. v_addc_u32_e32 v14, vcc, v7, v14, vcc
  460. v_add_u32_e32 v18, vcc, 0xf3, v1
  461. v_ashrrev_i32_e32 v19, 31, v18
  462. v_add_u32_e32 v15, vcc, 0x144, v1
  463. v_add_u32_e32 v16, vcc, 0x129, v1
  464. v_add_u32_e32 v17, vcc, 0x10e, v1
  465. v_lshlrev_b64 v[18:19], 2, v[18:19]
  466. v_add_u32_e32 v20, vcc, s2, v18
  467. v_ashrrev_i32_e32 v18, 31, v17
  468. v_mov_b32_e32 v1, s3
  469. v_addc_u32_e32 v21, vcc, v1, v19, vcc
  470. v_lshlrev_b64 v[17:18], 2, v[17:18]
  471. v_add_u32_e32 v22, vcc, s2, v17
  472. v_ashrrev_i32_e32 v17, 31, v16
  473. v_addc_u32_e32 v23, vcc, v1, v18, vcc
  474. v_lshlrev_b64 v[16:17], 2, v[16:17]
  475. v_add_u32_e32 v18, vcc, s2, v16
  476. v_ashrrev_i32_e32 v16, 31, v15
  477. v_addc_u32_e32 v19, vcc, v1, v17, vcc
  478. v_lshlrev_b64 v[15:16], 2, v[15:16]
  479. v_ashrrev_i32_e32 v7, 31, v6
  480. v_add_u32_e32 v15, vcc, s2, v15
  481. v_addc_u32_e32 v16, vcc, v1, v16, vcc
  482. v_lshlrev_b64 v[6:7], 2, v[6:7]
  483. flat_load_dword v1, v[20:21]
  484. flat_load_dword v20, v[22:23]
  485. flat_load_dword v19, v[18:19]
  486. flat_load_dword v21, v[15:16]
  487. flat_load_dword v22, v[13:14]
  488. v_add_u32_e32 v13, vcc, s2, v6
  489. v_ashrrev_i32_e32 v6, 31, v5
  490. v_mov_b32_e32 v14, s3
  491. v_addc_u32_e32 v14, vcc, v14, v7, vcc
  492. v_lshlrev_b64 v[5:6], 2, v[5:6]
  493. v_mov_b32_e32 v7, s3
  494. v_add_u32_e32 v5, vcc, s2, v5
  495. v_addc_u32_e32 v6, vcc, v7, v6, vcc
  496. v_add_u32_e32 v15, vcc, s7, v2
  497. v_addc_u32_e32 v16, vcc, 0, v3, vcc
  498. s_movk_i32 s7, 0x654
  499. v_add_u32_e32 v17, vcc, s7, v2
  500. flat_load_dword v23, v[13:14]
  501. v_addc_u32_e32 v18, vcc, 0, v3, vcc
  502. flat_load_dword v25, v[5:6]
  503. flat_load_dword v5, v[2:3]
  504. flat_load_dword v17, v[17:18]
  505. flat_load_dword v18, v[15:16]
  506. v_lshlrev_b32_e32 v24, 2, v255
  507. s_mov_b32 m0, -1
  508. v_ashrrev_i32_e32 v13, 31, v12
  509. s_waitcnt vmcnt(2) lgkmcnt(2)
  510. ds_write2_b32 v24, v5, v22 offset1:27
  511. v_lshlrev_b64 v[5:6], 2, v[12:13]
  512. v_ashrrev_i32_e32 v12, 31, v11
  513. v_add_u32_e32 v5, vcc, s2, v5
  514. v_addc_u32_e32 v6, vcc, v7, v6, vcc
  515. v_lshlrev_b64 v[11:12], 2, v[11:12]
  516. v_add_u32_e32 v13, vcc, s2, v11
  517. v_ashrrev_i32_e32 v11, 31, v10
  518. v_addc_u32_e32 v14, vcc, v7, v12, vcc
  519. v_lshlrev_b64 v[10:11], 2, v[10:11]
  520. v_add_u32_e32 v15, vcc, s2, v10
  521. v_ashrrev_i32_e32 v10, 31, v9
  522. v_addc_u32_e32 v16, vcc, v7, v11, vcc
  523. v_lshlrev_b64 v[9:10], 2, v[9:10]
  524. v_add_u32_e32 v11, vcc, s2, v9
  525. v_ashrrev_i32_e32 v9, 31, v8
  526. v_addc_u32_e32 v12, vcc, v7, v10, vcc
  527. v_lshlrev_b64 v[7:8], 2, v[8:9]
  528. v_add_u32_e32 v7, vcc, s2, v7
  529. v_mov_b32_e32 v9, s3
  530. v_addc_u32_e32 v8, vcc, v9, v8, vcc
  531. flat_load_dword v5, v[5:6]
  532. flat_load_dword v6, v[13:14]
  533. flat_load_dword v9, v[15:16]
  534. flat_load_dword v10, v[11:12]
  535. flat_load_dword v7, v[7:8]
  536. s_movk_i32 s2, 0x57c
  537. s_waitcnt vmcnt(3) lgkmcnt(3)
  538. ds_write2_b32 v24, v5, v6 offset0:54 offset1:81
  539. s_waitcnt vmcnt(1) lgkmcnt(2)
  540. ds_write2_b32 v24, v9, v10 offset0:108 offset1:135
  541. s_waitcnt vmcnt(0) lgkmcnt(2)
  542. ds_write2_b32 v24, v7, v23 offset0:162 offset1:189
  543. ds_write2_b32 v24, v25, v1 offset0:216 offset1:243
  544. v_add_u32_e32 v1, vcc, 0x438, v24
  545. ds_write2_b32 v1, v20, v19 offset1:27
  546. v_add_u32_e32 v1, vcc, s2, v2
  547. v_addc_u32_e32 v2, vcc, 0, v3, vcc
  548. flat_load_dword v1, v[1:2]
  549. v_add_u32_e32 v2, vcc, 0x510, v24
  550. s_waitcnt vmcnt(0) lgkmcnt(0)
  551. ds_write2_b32 v2, v21, v1 offset1:27
  552. v_add_u32_e32 v1, vcc, 0x5e8, v24
  553. ds_write2_b32 v1, v18, v17 offset1:27
  554. v_mov_b32_e32 v1, -1
  555. BB2_4:
  556. s_or_b64 exec, exec, s[0:1]
  557. v_cmp_ne_u32_e32 vcc, 0, v1
  558. s_and_saveexec_b64 s[0:1], vcc
  559. s_cbranch_execz BB2_7
  560. BB2_5:
  561. v_mov_b32_e32 v1, 0x1f7047dd
  562. v_mov_b32_e32 v2, 0x92492493
  563. v_mul_hi_u32 v1, v255, v1
  564. v_mul_hi_i32 v2, s6, v2
  565. s_movk_i32 s2, 0xe0
  566. v_mov_b32_e32 v4, -1
  567. v_subrev_u32_e32 v3, vcc, v1, v255
  568. v_lshrrev_b32_e32 v3, 1, v3
  569. v_add_u32_e32 v2, vcc, s6, v2
  570. v_add_u32_e32 v1, vcc, v1, v3
  571. v_lshrrev_b32_e32 v3, 31, v2
  572. v_ashrrev_i32_e32 v2, 7, v2
  573. v_add_u32_e32 v2, vcc, v3, v2
  574. v_mul_lo_i32 v2, v2, s2
  575. v_lshrrev_b32_e32 v1, 5, v1
  576. v_sub_u32_e32 v3, vcc, 0xe2, v1
  577. v_sub_u32_e32 v2, vcc, s6, v2
  578. v_cmp_lt_i32_e32 vcc, v2, v3
  579. s_and_b64 exec, exec, vcc
  580. s_cbranch_execz BB2_7
  581. BB2_6:
  582. v_mov_b32_e32 v3, 0x1f7047dd
  583. v_mul_hi_u32 v3, v255, v3
  584. v_add_u32_e32 v1, vcc, v2, v1
  585. v_mul_lo_i32 v1, v1, 57
  586. v_lshlrev_b32_e32 v5, 4, v255
  587. v_subrev_u32_e32 v4, vcc, v3, v255
  588. v_lshrrev_b32_e32 v4, 1, v4
  589. v_add_u32_e32 v3, vcc, v3, v4
  590. v_lshrrev_b32_e32 v3, 5, v3
  591. v_mul_lo_i32 v3, v3, 57
  592. s_mov_b32 m0, -1
  593. v_subrev_u32_e32 v2, vcc, v3, v255
  594. v_add_u32_e32 v1, vcc, v1, v2
  595. v_lshlrev_b32_e32 v1, 2, v1
  596. v_ashrrev_i32_e32 v2, 31, v1
  597. v_lshlrev_b64 v[1:2], 2, v[1:2]
  598. s_waitcnt lgkmcnt(0)
  599. v_mov_b32_e32 v3, s9
  600. v_add_u32_e32 v1, vcc, s8, v1
  601. v_addc_u32_e32 v2, vcc, v3, v2, vcc
  602. flat_load_dwordx4 v[1:4], v[1:2]
  603. v_add_u32_e32 v6, vcc, 0x6c0, v5
  604. s_waitcnt vmcnt(0) lgkmcnt(0)
  605. ds_write_b64 v6, v[3:4] offset:8
  606. ds_write_b64 v5, v[1:2] offset:1728
  607. v_mov_b32_e32 v4, -1
  608. BB2_7:
  609. s_or_b64 exec, exec, s[0:1]
  610. s_waitcnt lgkmcnt(0)
  611. s_barrier
  612. s_mov_b32 m0, -1
  613. v_mov_b32_e32 v3, 0
  614. ds_read2_b32 v[6:7], v3 offset0:247 offset1:248
  615. v_mov_b32_e32 v5, 0x4a4
  616. v_mov_b32_e32 v10, 0x584
  617. v_mov_b32_e32 v11, 0x65c
  618. ds_read2_b32 v[13:14], v10 offset1:1
  619. s_waitcnt lgkmcnt(1)
  620. buffer_store_dword v6, off, s[12:15], s16 offset:148
  621. buffer_store_dword v7, off, s[12:15], s16 offset:152
  622. s_waitcnt expcnt(0)
  623. ds_read2_b32 v[6:7], v3 offset0:249 offset1:250
  624. ds_read2_b32 v[10:11], v11 offset1:1
  625. v_mov_b32_e32 v12, 0x4b4
  626. v_lshlrev_b32_e32 v2, 2, v255
  627. s_waitcnt lgkmcnt(2)
  628. buffer_store_dword v13, off, s[12:15], s16 offset:68
  629. s_waitcnt lgkmcnt(1)
  630. buffer_store_dword v6, off, s[12:15], s16 offset:140
  631. buffer_store_dword v7, off, s[12:15], s16 offset:144
  632. s_waitcnt expcnt(1)
  633. ds_read2_b32 v[5:6], v5 offset1:1
  634. s_waitcnt expcnt(0)
  635. v_mov_b32_e32 v7, 0x4ac
  636. s_waitcnt lgkmcnt(1)
  637. buffer_store_dword v10, off, s[12:15], s16 offset:44
  638. buffer_store_dword v11, off, s[12:15], s16 offset:48
  639. s_waitcnt expcnt(0)
  640. ds_read2_b32 v[10:11], v12 offset1:1
  641. s_waitcnt lgkmcnt(1)
  642. buffer_store_dword v5, off, s[12:15], s16 offset:100
  643. buffer_store_dword v6, off, s[12:15], s16 offset:104
  644. s_waitcnt expcnt(1)
  645. v_mov_b32_e32 v5, 0x57c
  646. s_waitcnt expcnt(0)
  647. v_mov_b32_e32 v6, 0x654
  648. ds_read2_b32 v[8:9], v5 offset1:1
  649. ds_read2_b32 v[5:6], v6 offset1:1
  650. s_waitcnt lgkmcnt(2)
  651. buffer_store_dword v10, off, s[12:15], s16 offset:116
  652. buffer_store_dword v11, off, s[12:15], s16 offset:120
  653. s_waitcnt expcnt(1)
  654. v_mov_b32_e32 v10, 0x58c
  655. s_waitcnt lgkmcnt(1)
  656. buffer_store_dword v8, off, s[12:15], s16 offset:60
  657. s_waitcnt lgkmcnt(0)
  658. buffer_store_dword v5, off, s[12:15], s16 offset:52
  659. buffer_store_dword v6, off, s[12:15], s16 offset:56
  660. s_waitcnt expcnt(0)
  661. ds_read2_b32 v[5:6], v7 offset1:1
  662. buffer_store_dword v9, off, s[12:15], s16 offset:64
  663. v_add_u32_e32 v7, vcc, 0xde0, v2
  664. v_mov_b32_e32 v8, 0x594
  665. ds_read_b32 v28, v2 offset:3560
  666. s_waitcnt lgkmcnt(1)
  667. buffer_store_dword v5, off, s[12:15], s16 offset:108
  668. buffer_store_dword v6, off, s[12:15], s16 offset:112
  669. s_waitcnt expcnt(0)
  670. v_add_u32_e32 v6, vcc, 0x6c8, v2
  671. buffer_store_dword v14, off, s[12:15], s16 offset:72
  672. ds_read2_b32 v[43:44], v6 offset1:228
  673. ds_read2_b32 v[12:13], v8 offset1:1
  674. ds_read2_b32 v[33:34], v7 offset1:1
  675. ds_read2_b32 v[6:7], v10 offset1:1
  676. v_mov_b32_e32 v11, 0x664
  677. v_add_u32_e32 v155, vcc, 0x6c0, v2
  678. v_add_u32_e32 v5, vcc, 0xa50, v2
  679. v_mov_b32_e32 v2, 0x4bc
  680. s_waitcnt lgkmcnt(0)
  681. buffer_store_dword v6, off, s[12:15], s16 offset:84
  682. buffer_store_dword v7, off, s[12:15], s16 offset:88
  683. s_waitcnt expcnt(0)
  684. ds_read2_b32 v[6:7], v11 offset1:1
  685. ds_read2_b32 v[47:48], v5 offset1:1
  686. ds_read2_b32 v[57:58], v155 offset1:1
  687. ds_read2_b32 v[0:1], v3 offset0:85 offset1:86
  688. ds_read2_b32 v[166:167], v3 offset0:87 offset1:88
  689. s_waitcnt lgkmcnt(4)
  690. buffer_store_dword v6, off, s[12:15], s16 offset:36
  691. buffer_store_dword v7, off, s[12:15], s16 offset:40
  692. s_waitcnt expcnt(0)
  693. ds_read2_b32 v[6:7], v2 offset1:1
  694. v_add_u32_e32 v2, vcc, 0x458, v3
  695. s_waitcnt lgkmcnt(2)
  696. buffer_store_dword v0, off, s[12:15], s16 offset:220
  697. ds_read2_b32 v[15:16], v3 offset0:135 offset1:136
  698. buffer_store_dword v1, off, s[12:15], s16 offset:224
  699. s_waitcnt lgkmcnt(1)
  700. buffer_store_dword v6, off, s[12:15], s16 offset:124
  701. buffer_store_dword v7, off, s[12:15], s16 offset:128
  702. s_waitcnt expcnt(0)
  703. ds_read2_b32 v[6:7], v2 offset1:27
  704. v_add_u32_e32 v2, vcc, 0x530, v3
  705. ds_read2_b32 v[168:169], v3 offset0:33 offset1:34
  706. ds_read2_b32 v[17:18], v3 offset0:81 offset1:82
  707. ds_read2_b32 v[0:1], v3 offset0:83 offset1:84
  708. s_waitcnt lgkmcnt(3)
  709. buffer_store_dword v6, off, s[12:15], s16 offset:132
  710. buffer_store_dword v7, off, s[12:15], s16 offset:136
  711. s_waitcnt expcnt(0)
  712. ds_read2_b32 v[6:7], v2 offset1:27
  713. v_add_u32_e32 v2, vcc, 0x608, v3
  714. buffer_store_dword v12, off, s[12:15], s16 offset:76
  715. s_waitcnt lgkmcnt(1)
  716. buffer_store_dword v0, off, s[12:15], s16 offset:228
  717. buffer_store_dword v13, off, s[12:15], s16 offset:80
  718. s_waitcnt lgkmcnt(0)
  719. buffer_store_dword v6, off, s[12:15], s16 offset:92
  720. buffer_store_dword v7, off, s[12:15], s16 offset:96
  721. s_waitcnt expcnt(0)
  722. ds_read2_b32 v[6:7], v2 offset1:27
  723. buffer_store_dword v1, off, s[12:15], s16 offset:232
  724. ds_read2_b32 v[12:13], v3 offset0:27 offset1:28
  725. s_waitcnt expcnt(0)
  726. ds_read2_b32 v[0:1], v3 offset0:29 offset1:30
  727. v_mov_b32_e32 v9, 0x66c
  728. s_waitcnt lgkmcnt(2)
  729. buffer_store_dword v6, off, s[12:15], s16 offset:20
  730. buffer_store_dword v7, off, s[12:15], s16 offset:24
  731. s_waitcnt expcnt(1)
  732. ds_read2_b32 v[5:6], v3 offset0:195 offset1:196
  733. s_waitcnt lgkmcnt(1)
  734. buffer_store_dword v0, off, s[12:15], s16 offset:236
  735. buffer_store_dword v1, off, s[12:15], s16 offset:240
  736. s_waitcnt expcnt(0)
  737. ds_read2_b32 v[0:1], v3 offset0:31 offset1:32
  738. ds_read2_b32 v[8:9], v9 offset1:1
  739. s_waitcnt lgkmcnt(2)
  740. buffer_store_dword v5, off, s[12:15], s16 offset:180
  741. buffer_store_dword v6, off, s[12:15], s16 offset:184
  742. s_waitcnt expcnt(0)
  743. ds_read2_b32 v[5:6], v3 offset0:243 offset1:244
  744. s_waitcnt lgkmcnt(2)
  745. buffer_store_dword v0, off, s[12:15], s16 offset:244
  746. buffer_store_dword v1, off, s[12:15], s16 offset:248
  747. ds_read2_b32 v[158:159], v3 offset0:137 offset1:138
  748. s_waitcnt expcnt(0)
  749. ds_read2_b32 v[0:1], v3 offset0:139 offset1:140
  750. s_waitcnt lgkmcnt(2)
  751. buffer_store_dword v5, off, s[12:15], s16 offset:164
  752. buffer_store_dword v6, off, s[12:15], s16 offset:168
  753. s_waitcnt expcnt(0)
  754. ds_read2_b32 v[5:6], v3 offset0:245 offset1:246
  755. ds_read2_b32 v[156:157], v3 offset0:141 offset1:142
  756. ds_read2_b32 v[164:165], v3 offset0:8 offset1:35
  757. ds_read2_b32 v[162:163], v3 offset0:62 offset1:89
  758. buffer_store_dword v8, off, s[12:15], s16 offset:28
  759. s_waitcnt lgkmcnt(3)
  760. buffer_store_dword v5, off, s[12:15], s16 offset:156
  761. buffer_store_dword v6, off, s[12:15], s16 offset:160
  762. s_waitcnt expcnt(0)
  763. ds_read2_b32 v[5:6], v3 offset0:189 offset1:190
  764. buffer_store_dword v9, off, s[12:15], s16 offset:32
  765. ds_read2_b64 v[171:174], v3 offset1:27
  766. ds_read2_b64 v[131:134], v3 offset0:54 offset1:81
  767. ds_read2_b64 v[105:108], v3 offset0:108 offset1:135
  768. s_waitcnt lgkmcnt(3)
  769. buffer_store_dword v5, off, s[12:15], s16 offset:204
  770. buffer_store_dword v6, off, s[12:15], s16 offset:208
  771. s_waitcnt expcnt(0)
  772. ds_read2_b32 v[5:6], v3 offset0:191 offset1:192
  773. ds_read2_b64 v[77:80], v3 offset0:162 offset1:189
  774. ds_read2_b64 v[175:178], v3 offset0:1 offset1:28
  775. ds_read2_b64 v[135:138], v3 offset0:55 offset1:82
  776. ds_read2_b64 v[109:112], v3 offset0:109 offset1:136
  777. s_waitcnt lgkmcnt(4)
  778. buffer_store_dword v5, off, s[12:15], s16 offset:196
  779. buffer_store_dword v6, off, s[12:15], s16 offset:200
  780. s_waitcnt expcnt(0)
  781. ds_read2_b32 v[5:6], v3 offset0:193 offset1:194
  782. ds_read2_b64 v[81:84], v3 offset0:163 offset1:190
  783. ds_read2_b64 v[179:182], v3 offset0:2 offset1:29
  784. ds_read2_b64 v[139:142], v3 offset0:56 offset1:83
  785. ds_read2_b64 v[113:116], v3 offset0:110 offset1:137
  786. s_waitcnt lgkmcnt(4)
  787. buffer_store_dword v5, off, s[12:15], s16 offset:188
  788. buffer_store_dword v6, off, s[12:15], s16 offset:192
  789. ds_read2_b32 v[160:161], v3 offset0:116 offset1:143
  790. s_waitcnt expcnt(0)
  791. ds_read2_b32 v[5:6], v3 offset0:170 offset1:197
  792. ds_read2_b64 v[87:90], v3 offset0:164 offset1:191
  793. ds_read2_b64 v[183:186], v3 offset0:3 offset1:30
  794. ds_read2_b64 v[147:150], v3 offset0:57 offset1:84
  795. ds_read2_b64 v[117:120], v3 offset0:111 offset1:138
  796. s_waitcnt lgkmcnt(4)
  797. buffer_store_dword v5, off, s[12:15], s16 offset:212
  798. buffer_store_dword v6, off, s[12:15], s16 offset:216
  799. s_waitcnt expcnt(0)
  800. ds_read2_b32 v[5:6], v3 offset0:224 offset1:251
  801. ds_read2_b64 v[91:94], v3 offset0:165 offset1:192
  802. v_cmp_ne_u32_e32 vcc, 0, v4
  803. s_waitcnt lgkmcnt(1)
  804. buffer_store_dword v5, off, s[12:15], s16 offset:172
  805. buffer_store_dword v6, off, s[12:15], s16 offset:176
  806. s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  807. s_barrier
  808. s_and_saveexec_b64 s[0:1], vcc
  809. s_cbranch_execz BB2_10
  810. BB2_8:
  811. v_mov_b32_e32 v2, 0x1f7047dd
  812. v_mov_b32_e32 v3, 0x92492493
  813. v_mul_hi_u32 v2, v255, v2
  814. v_mul_hi_i32 v3, s6, v3
  815. s_movk_i32 s2, 0xe0
  816. v_subrev_u32_e32 v5, vcc, v2, v255
  817. v_lshrrev_b32_e32 v5, 1, v5
  818. v_add_u32_e32 v3, vcc, s6, v3
  819. v_add_u32_e32 v2, vcc, v2, v5
  820. v_lshrrev_b32_e32 v5, 31, v3
  821. v_ashrrev_i32_e32 v3, 7, v3
  822. v_add_u32_e32 v3, vcc, v5, v3
  823. v_mul_lo_i32 v3, v3, s2
  824. v_lshrrev_b32_e32 v2, 5, v2
  825. v_sub_u32_e32 v5, vcc, 0xe2, v2
  826. v_sub_u32_e32 v3, vcc, s6, v3
  827. v_cmp_lt_i32_e32 vcc, v3, v5
  828. s_and_b64 exec, exec, vcc
  829. s_cbranch_execz BB2_10
  ; BB2_9: load four dwords from global memory and store them to LDS.
  ; Uses v2 (quotient of v255) and v3 (s6 - q*224) left by BB2_8.
  ; Element index built below:
  ;   idx = ((v3 + v2) * 57 + (v255 - (v255 quotient) * 57)) * 4 + 0xc948
  ; The quotient is recomputed here with the same 0x1f7047dd sequence as in
  ; BB2_8 (appears to be v255 / 57, so the second term would be v255 % 57).
  ; 0xc948 = 51528 decimal.
  ; Byte address = s[8:9] + sign_extend(idx) * 4.
  ;   NOTE(review): s[8:9] is used as a 64-bit base pointer; where it was
  ;   loaded is outside this view - confirm which kernel argument it is.
  830. BB2_9:
  831. v_mov_b32_e32 v5, 0x1f7047dd
  832. v_mul_hi_u32 v5, v255, v5
  833. v_add_u32_e32 v2, vcc, v3, v2
  834. v_mul_lo_i32 v2, v2, 57
  ; M0 is set before the LDS write below (required for LDS access on this
  ; ISA generation per the GCN3 manual).
  835. s_mov_b32 m0, -1
  836. v_subrev_u32_e32 v6, vcc, v5, v255
  837. v_lshrrev_b32_e32 v6, 1, v6
  838. v_add_u32_e32 v5, vcc, v5, v6
  839. v_lshrrev_b32_e32 v5, 5, v5
  840. v_mul_lo_i32 v5, v5, 57
  ; v3 = v255 - quotient*57
  841. v_subrev_u32_e32 v3, vcc, v5, v255
  842. v_add_u32_e32 v2, vcc, v2, v3
  843. v_lshlrev_b32_e32 v2, 2, v2
  844. v_add_u32_e32 v2, vcc, 0xc948, v2
  ; Sign-extend idx to 64 bits in v[2:3], then scale by 4 (shift left 2).
  845. v_ashrrev_i32_e32 v3, 31, v2
  846. v_lshlrev_b64 v[2:3], 2, v[2:3]
  ; 64-bit add of the base in s[8:9]; the carry travels through vcc.
  847. v_mov_b32_e32 v5, s9
  848. v_add_u32_e32 v2, vcc, s8, v2
  849. v_addc_u32_e32 v3, vcc, v5, v3, vcc
  850. flat_load_dwordx4 v[5:8], v[2:3]
  ; LDS destination address: v2 = v255 * 12 + v155 (24-bit multiply-add).
  851. v_mad_u32_u24 v2, v255, 12, v155
  ; Wait for the flat load to return before writing its data to LDS.
  852. s_waitcnt vmcnt(0) lgkmcnt(0)
  ; Writes v[5:6] at v2 and v[7:8] at v2 + 8 bytes (offset1 is in 8-byte units).
  853. ds_write2_b64 v2, v[5:6], v[7:8] offset1:1
  ; BB2_10: join point after the conditional LDS fill, then a bulk LDS read.
  ; 1. Re-enable the lanes saved in s[0:1] and wait for outstanding LDS
  ;    traffic before the workgroup barrier.
  ; 2. Issue a long run of LDS reads into VGPRs. Two kinds of address are used:
  ;    constant addresses (v7 = 0 plus offsets, or literal byte addresses in
  ;    v2/v3/v5) and addresses relative to v155.
  ;    ds_read2_b32 offsets are in 4-byte units, ds_read2_b64 offsets in
  ;    8-byte units (GCN3 ISA).
  ; 3. Load two 64-bit values from s[4:5]+0x10 and s[4:5]+0x18 into s[0:1]
  ;    and s[2:3].
  ;    NOTE(review): s[4:5] is presumably the kernarg segment pointer
  ;    (enable_sgpr_kernarg_segment_ptr = 1 in the descriptor) - confirm.
  ; 4. Store v151..v154 to the buffer at offsets 4..16; later code reloads
  ;    offsets 4..16 into v29..v32, so this looks like a register spill.
  ; 5. Barrier, then open an if/else on v4: the fall-through into BB2_11 runs
  ;    with only the lanes where v4 == 0 enabled.
  854. BB2_10:
  855. s_or_b64 exec, exec, s[0:1]
  856. s_waitcnt lgkmcnt(0)
  857. s_barrier
  858. s_mov_b32 m0, -1
  ; v7 = 0 is the base for the constant-address LDS reads below.
  859. v_mov_b32_e32 v7, 0
  860. v_mov_b32_e32 v2, 0x45c
  861. ds_read2_b32 v[217:218], v7 offset0:229 offset1:230
  862. ds_read2_b32 v[215:216], v7 offset0:231 offset1:232
  863. ds_read2_b32 v[205:206], v2 offset1:1
  864. v_mov_b32_e32 v2, 0x534
  865. v_mov_b32_e32 v3, 0x60c
  866. v_mov_b32_e32 v5, 0x464
  867. ds_read2_b32 v[195:196], v2 offset1:1
  868. ds_read2_b32 v[187:188], v3 offset1:1
  869. ds_read2_b32 v[209:210], v5 offset1:1
  870. v_mov_b32_e32 v2, 0x53c
  871. v_mov_b32_e32 v3, 0x614
  872. v_mov_b32_e32 v5, 0x46c
  873. ds_read2_b32 v[199:200], v2 offset1:1
  874. ds_read2_b32 v[189:190], v3 offset1:1
  875. ds_read2_b32 v[211:212], v5 offset1:1
  ; v155-relative reads: dwords 228, 229 and byte offset 1832, then v155 + 0x720.
  876. v_add_u32_e32 v5, vcc, 0x720, v155
  877. ds_read2_b32 v[41:42], v155 offset0:228 offset1:229
  878. ds_read_b32 v19, v155 offset:1832
  879. ds_read2_b32 v[22:23], v5 offset1:1
  880. v_mov_b32_e32 v2, 0x544
  881. v_mov_b32_e32 v3, 0x61c
  882. v_mov_b32_e32 v5, 0x474
  883. ds_read2_b32 v[201:202], v2 offset1:1
  884. ds_read2_b32 v[191:192], v3 offset1:1
  885. ds_read2_b32 v[213:214], v5 offset1:1
  886. v_add_u32_e32 v5, vcc, 0x3a4, v7
  887. ds_read2_b32 v[219:220], v5 offset1:27
  888. v_add_u32_e32 v5, vcc, 0x47c, v7
  889. ds_read2_b32 v[207:208], v5 offset1:27
  890. v_add_u32_e32 v5, vcc, 0x554, v7
  891. ds_read2_b32 v[197:198], v5 offset1:27
  892. v_add_u32_e32 v5, vcc, 0x62c, v7
  893. v_mov_b32_e32 v2, 0x54c
  894. v_mov_b32_e32 v3, 0x624
  895. ds_read2_b32 v[26:27], v5 offset1:27
  896. ds_read2_b32 v[203:204], v2 offset1:1
  897. ds_read2_b32 v[193:194], v3 offset1:1
  898. ds_read2_b32 v[59:60], v155 offset1:1
  899. ds_read2_b32 v[75:76], v7 offset0:67 offset1:68
  900. ds_read2_b32 v[69:70], v7 offset0:69 offset1:70
  901. ds_read2_b32 v[49:50], v7 offset0:117 offset1:118
  902. ds_read2_b32 v[5:6], v7 offset0:15 offset1:16
  903. ds_read2_b32 v[95:96], v7 offset0:63 offset1:64
  904. ds_read2_b32 v[85:86], v7 offset0:65 offset1:66
  905. ds_read2_b32 v[243:244], v7 offset0:177 offset1:178
  906. ds_read2_b32 v[239:240], v7 offset0:225 offset1:226
  907. ds_read2_b32 v[237:238], v7 offset0:227 offset1:228
  908. ds_read2_b32 v[24:25], v7 offset0:9 offset1:10
  909. ds_read2_b32 v[2:3], v7 offset0:11 offset1:12
  910. ds_read2_b32 v[99:100], v7 offset0:13 offset1:14
  911. ds_read2_b32 v[45:46], v7 offset0:171 offset1:172
  912. ds_read2_b32 v[20:21], v7 offset0:173 offset1:174
  913. ds_read2_b32 v[253:254], v7 offset0:175 offset1:176
  914. ds_read2_b32 v[35:36], v155 offset0:2 offset1:230
  915. ds_read2_b32 v[67:68], v7 offset0:119 offset1:120
  916. ds_read2_b32 v[65:66], v7 offset0:121 offset1:122
  917. ds_read2_b32 v[55:56], v7 offset0:123 offset1:124
  918. ds_read2_b32 v[121:122], v7 offset0:17 offset1:44
  919. ds_read2_b32 v[97:98], v7 offset0:71 offset1:98
  920. ds_read2_b32 v[8:9], v7 offset0:125 offset1:152
  921. ds_read2_b32 v[241:242], v7 offset0:179 offset1:206
  ; 64-bit pair reads: each instruction fills four VGPRs from two 8-byte slots.
  922. ds_read2_b64 v[127:130], v7 offset0:18 offset1:45
  923. ds_read2_b64 v[245:248], v7 offset0:72 offset1:99
  924. ds_read2_b64 v[221:224], v7 offset0:126 offset1:153
  925. ds_read2_b64 v[71:74], v7 offset0:180 offset1:207
  926. ds_read2_b64 v[143:146], v7 offset0:19 offset1:46
  927. ds_read2_b64 v[249:252], v7 offset0:73 offset1:100
  928. ds_read2_b64 v[225:228], v7 offset0:127 offset1:154
  929. ds_read2_b64 v[37:40], v7 offset0:181 offset1:208
  930. ds_read2_b64 v[101:104], v7 offset0:20 offset1:47
  931. ds_read2_b64 v[29:32], v7 offset0:74 offset1:101
  932. ds_read2_b64 v[229:232], v7 offset0:128 offset1:155
  933. ds_read2_b64 v[51:54], v7 offset0:182 offset1:209
  934. ds_read2_b64 v[123:126], v7 offset0:21 offset1:48
  935. ds_read2_b64 v[61:64], v7 offset0:75 offset1:102
  936. ds_read2_b64 v[233:236], v7 offset0:129 offset1:156
  937. ds_read2_b64 v[151:154], v7 offset0:183 offset1:210
  ; Scalar loads relative to s[4:5]; s[4:5] is overwritten right after they issue.
  938. s_load_dwordx2 s[0:1], s[4:5], 0x10
  939. s_load_dwordx2 s[2:3], s[4:5], 0x18
  ; Recompute the v4 != 0 predicate (vcc was clobbered by carry-outs since the
  ; last compare) and store its complement in s[4:5].
  940. v_cmp_ne_u32_e32 vcc, 0, v4
  941. s_xor_b64 s[4:5], vcc, -1
  ; One wait covers every LDS read and both scalar loads issued above.
  942. s_waitcnt lgkmcnt(0)
  943. buffer_store_dword v151, off, s[12:15], s16 offset:4
  944. buffer_store_dword v152, off, s[12:15], s16 offset:8
  945. buffer_store_dword v153, off, s[12:15], s16 offset:12
  946. buffer_store_dword v154, off, s[12:15], s16 offset:16
  947. s_waitcnt vmcnt(0) expcnt(0)
  948. s_barrier
  ; s[10:11] = exec on entry; exec &= s[4:5] (lanes with v4 == 0).
  ; s[4:5] = exec ^ s[10:11] = previously active lanes with v4 != 0,
  ; kept for the other side of the branch.
  949. s_and_saveexec_b64 s[10:11], s[4:5]
  950. s_xor_b64 s[4:5], exec, s[10:11]
  ; BB2_11: first side of the v4 branch; runs with the lanes enabled by the
  ; preceding s_and_saveexec_b64 (those where v4 == 0).
  ; Computes v151 = s6 - q * 224, where q is the result of the signed
  ; mul_hi / add / shift-by-7 / add-sign-bit sequence with magic 0x92492493
  ; (appears to be s6 / 224, making v151 = s6 % 224 - confirm).
  ; v4 and v7 are used as scratch here; the branch predicate already lives
  ; in s[4:5], so overwriting v4 does not affect the control flow.
  951. BB2_11:
  952. v_mov_b32_e32 v4, 0x92492493
  953. v_mul_hi_i32 v4, s6, v4
  ; s7 = 0xe0 = 224
  954. s_movk_i32 s7, 0xe0
  955. v_add_u32_e32 v4, vcc, s6, v4
  956. v_lshrrev_b32_e32 v7, 31, v4
  957. v_ashrrev_i32_e32 v4, 7, v4
  958. v_add_u32_e32 v4, vcc, v7, v4
  959. v_mul_lo_i32 v4, v4, s7
  960. v_sub_u32_e32 v151, vcc, s6, v4
  ; BB2_12: switch from the first side of the branch to the second.
  ; s_or_saveexec_b64 sets s[4:5] = current exec and exec = old s[4:5] | exec
  ; (GCN3 ISA). The following xor removes the lanes now recorded in s[4:5],
  ; leaving only the lanes that did not run BB2_11.
  ; If no such lanes exist, jump straight to the join at BB2_16.
  961. BB2_12:
  962. s_or_saveexec_b64 s[4:5], s[4:5]
  963. s_xor_b64 exec, exec, s[4:5]
  964. s_cbranch_execz BB2_16
  ; BB2_13: second side of the v4 branch; index arithmetic and bounds test.
  ; Same computation as BB2_8, using different registers:
  ;   v4   = quotient of v255 from the 0x1f7047dd mul_hi/sub/shift/add/shift
  ;          sequence (appears to be v255 / 57 - confirm)
  ;   v151 = s6 - q*224, q from the signed 0x92492493 sequence
  ;          (appears to be s6 % 224)
  ;   v10  = 0xe2 - v4 (0xe2 = 226)
  ; Lanes with v151 < v10 (signed) continue into BB2_14; the previous exec is
  ; saved in s[10:11]. If no lane passes, skip to BB2_15.
  965. BB2_13:
  966. v_mov_b32_e32 v4, 0x1f7047dd
  967. v_mov_b32_e32 v7, 0x92492493
  968. v_mul_hi_u32 v4, v255, v4
  969. v_mul_hi_i32 v7, s6, v7
  ; s7 = 0xe0 = 224
  970. s_movk_i32 s7, 0xe0
  971. v_subrev_u32_e32 v10, vcc, v4, v255
  972. v_lshrrev_b32_e32 v10, 1, v10
  973. v_add_u32_e32 v7, vcc, s6, v7
  974. v_add_u32_e32 v4, vcc, v4, v10
  975. v_lshrrev_b32_e32 v10, 31, v7
  976. v_ashrrev_i32_e32 v7, 7, v7
  977. v_add_u32_e32 v7, vcc, v10, v7
  978. v_mul_lo_i32 v7, v7, s7
  979. v_lshrrev_b32_e32 v4, 5, v4
  980. v_sub_u32_e32 v10, vcc, 0xe2, v4
  981. v_sub_u32_e32 v151, vcc, s6, v7
  982. v_cmp_lt_i32_e32 vcc, v151, v10
  983. s_and_saveexec_b64 s[10:11], vcc
  984. s_cbranch_execz BB2_15
  ; BB2_14: load four dwords from global memory and store them to LDS
  ; (second fill; same shape as BB2_9 but with constant 0x19290 = 103056).
  ; Uses v4 (quotient of v255) and v151 (s6 - q*224) left by BB2_13.
  ; Element index:
  ;   idx = ((v151 + v4) * 57 + (v255 - v4 * 57)) * 4 + 0x19290
  ; Byte address = s[8:9] + sign_extend(idx) * 4.
  ;   NOTE(review): s[8:9] is used as a 64-bit base pointer; its origin is
  ;   outside this view - confirm.
  ; The load lands in v[151:154], so v151 is parked in v12 (and v12/v13 in
  ; v39/v40) around it. v37..v40 are stored to the buffer at offsets 252..264
  ; on entry and reloaded from there at the end.
  ; Net register effect of the block: v0, v1, v12, v13, v17, v18, v151 and
  ; v158..v169 end with the values they had on entry, and v37..v40 are
  ; reloaded; v4, v7, v10, v11, v14, v170 and v152..v154 are left modified.
  985. BB2_14:
  986. v_mul_lo_i32 v7, v4, 57
  987. v_add_u32_e32 v4, vcc, v151, v4
  988. v_mul_lo_i32 v4, v4, 57
  989. buffer_store_dword v37, off, s[12:15], s16 offset:252
  ; v7 = v255 - v4_old * 57
  990. v_subrev_u32_e32 v7, vcc, v7, v255
  991. buffer_store_dword v38, off, s[12:15], s16 offset:256
  992. v_add_u32_e32 v4, vcc, v4, v7
  993. v_lshlrev_b32_e32 v4, 2, v4
  994. v_add_u32_e32 v10, vcc, 0x19290, v4
  ; Sign-extend idx into v[10:11] and scale by 4.
  995. v_ashrrev_i32_e32 v11, 31, v10
  996. v_lshlrev_b64 v[10:11], 2, v[10:11]
  997. v_mov_b32_e32 v4, s9
  ; Low half of the 64-bit base add; the carry stays in vcc until the
  ; v_addc below (nothing in between writes vcc).
  998. v_add_u32_e32 v10, vcc, s8, v10
  999. buffer_store_dword v39, off, s[12:15], s16 offset:260
  1000. buffer_store_dword v40, off, s[12:15], s16 offset:264
  ; The stores must have read v39/v40 before those registers are overwritten.
  1001. s_waitcnt expcnt(0)
  1002. v_mov_b32_e32 v40, v13
  1003. v_addc_u32_e32 v11, vcc, v4, v11, vcc
  1004. v_mov_b32_e32 v39, v12
  ; Park v151 in v12 because the load below overwrites v[151:154].
  1005. v_mov_b32_e32 v12, v151
  1006. flat_load_dwordx4 v[151:154], v[10:11]
  ; Register shuffle: values are moved into neighbouring registers and then
  ; moved back again further down (see the net effect in the block header).
  1007. v_mov_b32_e32 v13, v168
  1008. v_mov_b32_e32 v14, v169
  1009. v_mov_b32_e32 v170, v167
  1010. v_mov_b32_e32 v169, v166
  1011. v_mov_b32_e32 v168, v165
  1012. v_mov_b32_e32 v167, v164
  1013. v_mov_b32_e32 v166, v18
  1014. v_mov_b32_e32 v165, v17
  1015. v_mov_b32_e32 v17, v162
  1016. v_mov_b32_e32 v18, v163
  1017. v_mov_b32_e32 v164, v161
  1018. v_mov_b32_e32 v163, v160
  1019. v_mov_b32_e32 v162, v159
  1020. v_mov_b32_e32 v161, v158
  1021. v_mov_b32_e32 v160, v1
  1022. v_mov_b32_e32 v159, v0
  ; The next two writes to v0/v1 are immediately overwritten by the two after.
  1023. v_mov_b32_e32 v0, v156
  1024. v_mov_b32_e32 v1, v157
  1025. v_mov_b32_e32 v0, v159
  1026. v_mov_b32_e32 v1, v160
  1027. v_mov_b32_e32 v158, v161
  1028. v_mov_b32_e32 v159, v162
  1029. v_mov_b32_e32 v160, v163
  1030. v_mov_b32_e32 v161, v164
  1031. v_mov_b32_e32 v163, v18
  1032. v_mov_b32_e32 v162, v17
  1033. v_mov_b32_e32 v17, v165
  1034. v_mov_b32_e32 v18, v166
  1035. v_mov_b32_e32 v164, v167
  1036. v_mov_b32_e32 v7, v155
  1037. v_mov_b32_e32 v165, v168
  1038. v_mov_b32_e32 v166, v169
  1039. v_mov_b32_e32 v167, v170
  1040. v_mov_b32_e32 v38, v16
  1041. v_mov_b32_e32 v169, v14
  ; LDS destination address: v4 = v255 * 12 + v7, with v7 = copy of v155.
  1042. v_mad_u32_u24 v4, v255, 12, v7
  1043. s_mov_b32 m0, -1
  1044. v_mov_b32_e32 v37, v15
  1045. v_mov_b32_e32 v168, v13
  ; Wait for the flat load before writing its data to LDS.
  1046. s_waitcnt vmcnt(0) lgkmcnt(0)
  ; Writes v[151:152] at v4 and v[153:154] at v4 + 8 bytes.
  1047. ds_write2_b64 v4, v[151:152], v[153:154] offset1:1
  ; Restore v151, v12 and v13 from their temporary homes.
  1048. v_mov_b32_e32 v151, v12
  1049. v_mov_b32_e32 v12, v39
  1050. v_mov_b32_e32 v13, v40
  ; Reload v37..v40 from the offsets they were stored to at the top of the block.
  1051. buffer_load_dword v37, off, s[12:15], s16 offset:252
  1052. s_waitcnt vmcnt(0)
  1053. buffer_load_dword v38, off, s[12:15], s16 offset:256
  1054. s_waitcnt vmcnt(0)
  1055. buffer_load_dword v39, off, s[12:15], s16 offset:260
  1056. s_waitcnt vmcnt(0)
  1057. buffer_load_dword v40, off, s[12:15], s16 offset:264
  ; BB2_15: join point for the bounds test at the end of BB2_13.
  ; Re-enables the lanes whose exec bits were saved in s[10:11] by
  ; "s_and_saveexec_b64 s[10:11], vcc".
  1058. BB2_15:
  1059. s_or_b64 exec, exec, s[10:11]
  1060. BB2_16:
  1061. s_or_b64 exec, exec, s[4:5]
  1062. v_mul_f32_e32 v7, v57, v171
  1063. v_mac_f32_e32 v7, v58, v172
  1064. v_mac_f32_e32 v7, v43, v175
  1065. v_mac_f32_e32 v7, v47, v176
  1066. v_mac_f32_e32 v7, v48, v179
  1067. v_mac_f32_e32 v7, v44, v180
  1068. v_mac_f32_e32 v7, v33, v183
  1069. v_mac_f32_e32 v7, v34, v184
  1070. v_mac_f32_e32 v7, v28, v164
  1071. v_mac_f32_e32 v7, v59, v24
  1072. v_mac_f32_e32 v7, v60, v25
  1073. s_waitcnt vmcnt(0) lgkmcnt(0)
  1074. s_barrier
  1075. v_mac_f32_e32 v7, v35, v2
  1076. v_mac_f32_e32 v7, v41, v3
  1077. buffer_load_dword v3, off, s[12:15], s16 offset:212
  1078. v_mul_f32_e32 v15, v57, v15
  1079. v_mac_f32_e32 v15, v58, v16
  1080. v_mul_f32_e32 v16, v57, v133
  1081. v_mac_f32_e32 v16, v58, v134
  1082. v_mac_f32_e32 v16, v43, v137
  1083. v_mac_f32_e32 v16, v47, v138
  1084. v_mac_f32_e32 v16, v48, v141
  1085. s_waitcnt vmcnt(0)
  1086. buffer_load_dword v4, off, s[12:15], s16 offset:216
  1087. v_mac_f32_e32 v16, v44, v142
  1088. v_mac_f32_e32 v16, v33, v149
  1089. v_mac_f32_e32 v16, v34, v150
  1090. v_mul_f32_e32 v17, v57, v17
  1091. v_mac_f32_e32 v17, v58, v18
  1092. v_mac_f32_e32 v7, v42, v99
  1093. v_mac_f32_e32 v7, v36, v100
  1094. v_mac_f32_e32 v7, v22, v5
  1095. v_mac_f32_e32 v7, v23, v6
  1096. v_mov_b32_e32 v14, v13
  1097. v_mov_b32_e32 v13, v12
  1098. v_mul_f32_e32 v13, v57, v13
  1099. v_mac_f32_e32 v13, v58, v14
  1100. v_mul_f32_e32 v14, v57, v131
  1101. v_mac_f32_e32 v14, v58, v132
  1102. v_mac_f32_e32 v14, v43, v135
  1103. v_mac_f32_e32 v15, v43, v158
  1104. v_mac_f32_e32 v14, v47, v136
  1105. v_mac_f32_e32 v15, v47, v159
  1106. v_mac_f32_e32 v15, v48, v0
  1107. v_mac_f32_e32 v14, v48, v139
  1108. v_mac_f32_e32 v14, v44, v140
  1109. v_mac_f32_e32 v15, v44, v1
  1110. v_mac_f32_e32 v14, v33, v147
  1111. v_mac_f32_e32 v15, v33, v156
  1112. v_mac_f32_e32 v14, v34, v148
  1113. v_mac_f32_e32 v15, v34, v157
  1114. v_mac_f32_e32 v14, v28, v160
  1115. v_mac_f32_e32 v15, v28, v161
  1116. v_mac_f32_e32 v14, v59, v49
  1117. v_mac_f32_e32 v15, v59, v245
  1118. v_mac_f32_e32 v14, v60, v50
  1119. v_mac_f32_e32 v15, v60, v246
  1120. v_mac_f32_e32 v14, v35, v67
  1121. v_mac_f32_e32 v15, v35, v249
  1122. v_mac_f32_e32 v14, v41, v68
  1123. v_mac_f32_e32 v15, v41, v250
  1124. v_mac_f32_e32 v14, v42, v65
  1125. v_mac_f32_e32 v15, v42, v29
  1126. v_mac_f32_e32 v14, v36, v66
  1127. v_mac_f32_e32 v15, v36, v30
  1128. v_mac_f32_e32 v14, v22, v55
  1129. v_mac_f32_e32 v15, v22, v61
  1130. v_mac_f32_e32 v14, v23, v56
  1131. v_mac_f32_e32 v15, v23, v62
  1132. v_mac_f32_e32 v14, v19, v8
  1133. v_mac_f32_e32 v15, v19, v9
  1134. buffer_load_dword v8, off, s[12:15], s16 offset:172
  1135. s_waitcnt vmcnt(0)
  1136. buffer_load_dword v9, off, s[12:15], s16 offset:176
  1137. buffer_load_dword v24, off, s[12:15], s16 offset:92
  1138. s_waitcnt vmcnt(0)
  1139. buffer_load_dword v25, off, s[12:15], s16 offset:96
  1140. buffer_load_dword v152, off, s[12:15], s16 offset:236
  1141. s_waitcnt vmcnt(0)
  1142. buffer_load_dword v153, off, s[12:15], s16 offset:240
  1143. v_mul_f32_e32 v11, v57, v173
  1144. v_mac_f32_e32 v11, v58, v174
  1145. v_mac_f32_e32 v11, v43, v177
  1146. v_mac_f32_e32 v11, v47, v178
  1147. v_mac_f32_e32 v11, v48, v181
  1148. v_mac_f32_e32 v11, v44, v182
  1149. v_mac_f32_e32 v11, v33, v185
  1150. v_mac_f32_e32 v11, v34, v186
  1151. v_mac_f32_e32 v11, v28, v162
  1152. v_mac_f32_e32 v11, v59, v95
  1153. v_mac_f32_e32 v11, v60, v96
  1154. v_mac_f32_e32 v11, v35, v85
  1155. v_mac_f32_e32 v11, v41, v86
  1156. v_mac_f32_e32 v11, v42, v75
  1157. v_mac_f32_e32 v11, v36, v76
  1158. v_mac_f32_e32 v11, v22, v69
  1159. v_mac_f32_e32 v11, v23, v70
  1160. s_mov_b32 m0, -1
  1161. v_mac_f32_e32 v16, v28, v3
  1162. buffer_load_dword v2, off, s[12:15], s16 offset:204
  1163. s_waitcnt vmcnt(0)
  1164. buffer_load_dword v3, off, s[12:15], s16 offset:208
  1165. v_mac_f32_e32 v16, v59, v45
  1166. v_mac_f32_e32 v16, v60, v46
  1167. v_mac_f32_e32 v16, v35, v20
  1168. v_mac_f32_e32 v16, v41, v21
  1169. buffer_load_dword v20, off, s[12:15], s16 offset:68
  1170. s_waitcnt vmcnt(0)
  1171. buffer_load_dword v21, off, s[12:15], s16 offset:72
  1172. v_mac_f32_e32 v16, v42, v253
  1173. v_mac_f32_e32 v16, v36, v254
  1174. v_mac_f32_e32 v16, v22, v243
  1175. v_mac_f32_e32 v16, v23, v244
  1176. v_mac_f32_e32 v7, v19, v121
  1177. v_mac_f32_e32 v11, v19, v97
  1178. v_mac_f32_e32 v16, v19, v241
  1179. s_movk_i32 s9, 0xe00
  1180. s_movk_i32 s8, 0xe0
  1181. v_mov_b32_e32 v0, s1
  1182. s_mov_b32 s7, 0x27d000
  1183. s_mov_b32 s5, 0x2ae000
  1184. s_mov_b32 s4, 0x2df000
  1185. v_mac_f32_e32 v13, v43, v152
  1186. v_mac_f32_e32 v13, v47, v153
  1187. buffer_load_dword v152, off, s[12:15], s16 offset:244
  1188. s_waitcnt vmcnt(0)
  1189. buffer_load_dword v153, off, s[12:15], s16 offset:248
  1190. v_mul_f32_e32 v18, v57, v2
  1191. v_mac_f32_e32 v18, v58, v3
  1192. buffer_load_dword v2, off, s[12:15], s16 offset:196
  1193. s_waitcnt vmcnt(0)
  1194. buffer_load_dword v3, off, s[12:15], s16 offset:200
  1195. v_mac_f32_e32 v13, v48, v152
  1196. v_mac_f32_e32 v13, v44, v153
  1197. buffer_load_dword v152, off, s[12:15], s16 offset:228
  1198. s_waitcnt vmcnt(0)
  1199. buffer_load_dword v153, off, s[12:15], s16 offset:232
  1200. v_mac_f32_e32 v13, v33, v168
  1201. v_mac_f32_e32 v13, v34, v169
  1202. v_mac_f32_e32 v13, v28, v165
  1203. v_mac_f32_e32 v13, v59, v127
  1204. v_mac_f32_e32 v13, v60, v128
  1205. v_mac_f32_e32 v13, v35, v143
  1206. v_mac_f32_e32 v13, v41, v144
  1207. v_mac_f32_e32 v13, v42, v101
  1208. v_mac_f32_e32 v13, v36, v102
  1209. v_mac_f32_e32 v13, v22, v123
  1210. v_mac_f32_e32 v13, v23, v124
  1211. v_mac_f32_e32 v13, v19, v122
  1212. v_mac_f32_e32 v18, v43, v2
  1213. v_mac_f32_e32 v18, v47, v3
  1214. buffer_load_dword v2, off, s[12:15], s16 offset:188
  1215. s_waitcnt vmcnt(0)
  1216. buffer_load_dword v3, off, s[12:15], s16 offset:192
  1217. v_mac_f32_e32 v17, v43, v152
  1218. v_mac_f32_e32 v17, v47, v153
  1219. buffer_load_dword v152, off, s[12:15], s16 offset:220
  1220. s_waitcnt vmcnt(0)
  1221. buffer_load_dword v153, off, s[12:15], s16 offset:224
  1222. v_mac_f32_e32 v18, v48, v2
  1223. v_mac_f32_e32 v18, v44, v3
  1224. buffer_load_dword v2, off, s[12:15], s16 offset:180
  1225. s_waitcnt vmcnt(0)
  1226. buffer_load_dword v3, off, s[12:15], s16 offset:184
  1227. v_mac_f32_e32 v17, v48, v152
  1228. v_mac_f32_e32 v17, v44, v153
  1229. v_mac_f32_e32 v17, v33, v166
  1230. v_mac_f32_e32 v17, v34, v167
  1231. v_mac_f32_e32 v17, v28, v163
  1232. v_mac_f32_e32 v17, v59, v129
  1233. v_mac_f32_e32 v17, v60, v130
  1234. v_mac_f32_e32 v17, v35, v145
  1235. v_mac_f32_e32 v17, v41, v146
  1236. v_mac_f32_e32 v17, v42, v103
  1237. v_mac_f32_e32 v17, v36, v104
  1238. v_mac_f32_e32 v17, v22, v125
  1239. v_mac_f32_e32 v17, v23, v126
  1240. v_mac_f32_e32 v17, v19, v98
  1241. s_waitcnt vmcnt(0)
  1242. v_mac_f32_e32 v18, v33, v2
  1243. v_mac_f32_e32 v18, v34, v3
  1244. v_mac_f32_e32 v18, v28, v4
  1245. buffer_load_dword v4, off, s[12:15], s16 offset:164
  1246. s_waitcnt vmcnt(0)
  1247. buffer_load_dword v5, off, s[12:15], s16 offset:168
  1248. v_mul_f32_e32 v3, v57, v105
  1249. v_mac_f32_e32 v3, v58, v106
  1250. v_mac_f32_e32 v3, v43, v109
  1251. v_mac_f32_e32 v3, v47, v110
  1252. v_mac_f32_e32 v3, v48, v113
  1253. v_mac_f32_e32 v3, v44, v114
  1254. v_mac_f32_e32 v3, v33, v117
  1255. v_mac_f32_e32 v3, v34, v118
  1256. v_mac_f32_e32 v3, v28, v8
  1257. v_mac_f32_e32 v18, v59, v247
  1258. v_mac_f32_e32 v18, v60, v248
  1259. v_mac_f32_e32 v18, v35, v251
  1260. v_mac_f32_e32 v18, v41, v252
  1261. v_mac_f32_e32 v18, v42, v31
  1262. v_mac_f32_e32 v18, v36, v32
  1263. buffer_load_dword v29, off, s[12:15], s16 offset:4
  1264. s_waitcnt vmcnt(0)
  1265. buffer_load_dword v30, off, s[12:15], s16 offset:8
  1266. s_waitcnt vmcnt(0)
  1267. buffer_load_dword v31, off, s[12:15], s16 offset:12
  1268. s_waitcnt vmcnt(0)
  1269. buffer_load_dword v32, off, s[12:15], s16 offset:16
  1270. v_mac_f32_e32 v3, v59, v239
  1271. v_mac_f32_e32 v3, v60, v240
  1272. v_mac_f32_e32 v3, v35, v237
  1273. v_mac_f32_e32 v3, v41, v238
  1274. v_mac_f32_e32 v3, v42, v217
  1275. v_mac_f32_e32 v3, v36, v218
  1276. v_mac_f32_e32 v18, v22, v63
  1277. v_mac_f32_e32 v3, v22, v215
  1278. v_mac_f32_e32 v18, v23, v64
  1279. v_mac_f32_e32 v3, v23, v216
  1280. v_mac_f32_e32 v18, v19, v242
  1281. v_mac_f32_e32 v3, v19, v219
  1282. v_mov_b32_e32 v2, 0x414
  1283. v_mul_f32_e32 v4, v57, v4
  1284. v_mac_f32_e32 v4, v58, v5
  1285. buffer_load_dword v5, off, s[12:15], s16 offset:156
  1286. s_waitcnt vmcnt(0)
  1287. buffer_load_dword v6, off, s[12:15], s16 offset:160
  1288. s_waitcnt vmcnt(0)
  1289. v_mac_f32_e32 v4, v43, v5
  1290. v_mac_f32_e32 v4, v47, v6
  1291. buffer_load_dword v5, off, s[12:15], s16 offset:148
  1292. s_waitcnt vmcnt(0)
  1293. buffer_load_dword v6, off, s[12:15], s16 offset:152
  1294. s_waitcnt vmcnt(0)
  1295. v_mac_f32_e32 v4, v48, v5
  1296. v_mac_f32_e32 v4, v44, v6
  1297. buffer_load_dword v5, off, s[12:15], s16 offset:140
  1298. s_waitcnt vmcnt(0)
  1299. buffer_load_dword v6, off, s[12:15], s16 offset:144
  1300. s_waitcnt vmcnt(0)
  1301. v_mac_f32_e32 v4, v33, v5
  1302. v_mac_f32_e32 v4, v34, v6
  1303. v_mac_f32_e32 v4, v28, v9
  1304. buffer_load_dword v9, off, s[12:15], s16 offset:132
  1305. v_mul_f32_e32 v5, v57, v107
  1306. v_mac_f32_e32 v5, v58, v108
  1307. v_mac_f32_e32 v5, v43, v111
  1308. v_mac_f32_e32 v5, v47, v112
  1309. v_mac_f32_e32 v5, v48, v115
  1310. s_waitcnt vmcnt(0)
  1311. buffer_load_dword v10, off, s[12:15], s16 offset:136
  1312. v_mac_f32_e32 v5, v44, v116
  1313. v_mac_f32_e32 v5, v33, v119
  1314. v_mac_f32_e32 v5, v34, v120
  1315. v_mac_f32_e32 v4, v59, v221
  1316. v_mac_f32_e32 v4, v60, v222
  1317. v_mac_f32_e32 v4, v35, v225
  1318. v_mac_f32_e32 v4, v41, v226
  1319. v_mac_f32_e32 v4, v42, v229
  1320. v_mac_f32_e32 v4, v36, v230
  1321. v_mac_f32_e32 v4, v22, v233
  1322. v_mac_f32_e32 v4, v23, v234
  1323. v_mac_f32_e32 v4, v19, v220
  1324. s_waitcnt vmcnt(0)
  1325. v_mac_f32_e32 v5, v28, v9
  1326. buffer_load_dword v8, off, s[12:15], s16 offset:100
  1327. s_waitcnt vmcnt(0)
  1328. buffer_load_dword v9, off, s[12:15], s16 offset:104
  1329. v_mac_f32_e32 v5, v59, v205
  1330. v_mac_f32_e32 v5, v60, v206
  1331. v_mac_f32_e32 v5, v35, v209
  1332. v_mac_f32_e32 v5, v41, v210
  1333. v_mac_f32_e32 v5, v42, v211
  1334. v_mac_f32_e32 v5, v36, v212
  1335. v_mac_f32_e32 v5, v22, v213
  1336. v_mac_f32_e32 v5, v23, v214
  1337. v_mac_f32_e32 v5, v19, v207
  1338. s_waitcnt vmcnt(0)
  1339. v_mul_f32_e32 v6, v57, v8
  1340. v_mac_f32_e32 v6, v58, v9
  1341. buffer_load_dword v8, off, s[12:15], s16 offset:108
  1342. s_waitcnt vmcnt(0)
  1343. buffer_load_dword v9, off, s[12:15], s16 offset:112
  1344. s_waitcnt vmcnt(0)
  1345. v_mac_f32_e32 v6, v43, v8
  1346. v_mac_f32_e32 v6, v47, v9
  1347. buffer_load_dword v8, off, s[12:15], s16 offset:116
  1348. s_waitcnt vmcnt(0)
  1349. buffer_load_dword v9, off, s[12:15], s16 offset:120
  1350. s_waitcnt vmcnt(0)
  1351. v_mac_f32_e32 v6, v48, v8
  1352. v_mac_f32_e32 v6, v44, v9
  1353. buffer_load_dword v8, off, s[12:15], s16 offset:124
  1354. s_waitcnt vmcnt(0)
  1355. buffer_load_dword v9, off, s[12:15], s16 offset:128
  1356. s_waitcnt vmcnt(0)
  1357. v_mac_f32_e32 v6, v33, v8
  1358. v_mac_f32_e32 v6, v34, v9
  1359. v_mac_f32_e32 v6, v28, v10
  1360. buffer_load_dword v9, off, s[12:15], s16 offset:60
  1361. s_waitcnt vmcnt(0)
  1362. buffer_load_dword v10, off, s[12:15], s16 offset:64
  1363. v_mul_f32_e32 v8, v57, v77
  1364. v_mac_f32_e32 v8, v58, v78
  1365. v_mac_f32_e32 v8, v43, v81
  1366. v_mac_f32_e32 v8, v47, v82
  1367. v_mac_f32_e32 v8, v48, v87
  1368. v_mac_f32_e32 v8, v44, v88
  1369. v_mac_f32_e32 v8, v33, v91
  1370. v_mac_f32_e32 v8, v34, v92
  1371. v_mac_f32_e32 v8, v28, v24
  1372. v_mac_f32_e32 v6, v59, v223
  1373. v_mac_f32_e32 v8, v59, v195
  1374. v_mac_f32_e32 v6, v60, v224
  1375. v_mac_f32_e32 v8, v60, v196
  1376. v_mac_f32_e32 v6, v35, v227
  1377. v_mac_f32_e32 v8, v35, v199
  1378. v_mac_f32_e32 v6, v41, v228
  1379. v_mac_f32_e32 v8, v41, v200
  1380. v_mac_f32_e32 v6, v42, v231
  1381. v_mac_f32_e32 v8, v42, v201
  1382. v_mac_f32_e32 v6, v36, v232
  1383. v_mac_f32_e32 v8, v36, v202
  1384. v_mac_f32_e32 v6, v22, v235
  1385. v_mac_f32_e32 v8, v22, v203
  1386. v_mac_f32_e32 v6, v23, v236
  1387. v_mac_f32_e32 v8, v23, v204
  1388. v_mac_f32_e32 v6, v19, v208
  1389. v_mac_f32_e32 v8, v19, v197
  1390. s_waitcnt vmcnt(0)
  1391. v_mul_f32_e32 v9, v57, v9
  1392. v_mac_f32_e32 v9, v58, v10
  1393. v_mac_f32_e32 v9, v43, v20
  1394. v_mac_f32_e32 v9, v47, v21
  1395. buffer_load_dword v20, off, s[12:15], s16 offset:84
  1396. s_waitcnt vmcnt(0)
  1397. buffer_load_dword v21, off, s[12:15], s16 offset:88
  1398. v_mul_f32_e32 v10, v57, v79
  1399. v_mac_f32_e32 v10, v58, v80
  1400. v_mac_f32_e32 v10, v43, v83
  1401. v_mac_f32_e32 v10, v47, v84
  1402. v_mac_f32_e32 v10, v48, v89
  1403. v_mac_f32_e32 v10, v44, v90
  1404. v_mac_f32_e32 v10, v33, v93
  1405. v_mac_f32_e32 v10, v34, v94
  1406. s_waitcnt vmcnt(0)
  1407. v_mac_f32_e32 v9, v48, v20
  1408. v_mac_f32_e32 v9, v44, v21
  1409. buffer_load_dword v20, off, s[12:15], s16 offset:76
  1410. s_waitcnt vmcnt(0)
  1411. buffer_load_dword v21, off, s[12:15], s16 offset:80
  1412. s_waitcnt vmcnt(0)
  1413. v_mac_f32_e32 v9, v33, v20
  1414. v_mac_f32_e32 v9, v34, v21
  1415. v_mac_f32_e32 v9, v28, v25
  1416. buffer_load_dword v24, off, s[12:15], s16 offset:52
  1417. s_waitcnt vmcnt(0)
  1418. buffer_load_dword v25, off, s[12:15], s16 offset:56
  1419. buffer_load_dword v20, off, s[12:15], s16 offset:20
  1420. s_waitcnt vmcnt(0)
  1421. buffer_load_dword v21, off, s[12:15], s16 offset:24
  1422. v_mac_f32_e32 v9, v59, v71
  1423. v_mac_f32_e32 v9, v60, v72
  1424. v_mac_f32_e32 v9, v35, v37
  1425. v_mac_f32_e32 v9, v41, v38
  1426. v_mac_f32_e32 v9, v42, v51
  1427. v_mac_f32_e32 v9, v36, v52
  1428. v_mac_f32_e32 v9, v22, v29
  1429. v_mac_f32_e32 v9, v23, v30
  1430. v_mac_f32_e32 v9, v19, v198
  1431. v_mov_b32_e32 v52, 0x5c4
  1432. v_mul_f32_e32 v12, v57, v24
  1433. v_mac_f32_e32 v12, v58, v25
  1434. buffer_load_dword v24, off, s[12:15], s16 offset:44
  1435. s_waitcnt vmcnt(0)
  1436. buffer_load_dword v25, off, s[12:15], s16 offset:48
  1437. v_mac_f32_e32 v10, v28, v20
  1438. v_mac_f32_e32 v10, v59, v187
  1439. v_mac_f32_e32 v10, v60, v188
  1440. v_mac_f32_e32 v10, v35, v189
  1441. v_mac_f32_e32 v10, v41, v190
  1442. v_mac_f32_e32 v10, v42, v191
  1443. v_mac_f32_e32 v10, v36, v192
  1444. v_mac_f32_e32 v10, v22, v193
  1445. v_mac_f32_e32 v10, v23, v194
  1446. v_mac_f32_e32 v10, v19, v26
  1447. v_mov_b32_e32 v20, 0x4ec
  1448. s_waitcnt vmcnt(0)
  1449. v_mac_f32_e32 v12, v43, v24
  1450. v_mac_f32_e32 v12, v47, v25
  1451. buffer_load_dword v24, off, s[12:15], s16 offset:36
  1452. s_waitcnt vmcnt(0)
  1453. buffer_load_dword v25, off, s[12:15], s16 offset:40
  1454. s_waitcnt vmcnt(0)
  1455. v_mac_f32_e32 v12, v48, v24
  1456. v_mac_f32_e32 v12, v44, v25
  1457. buffer_load_dword v24, off, s[12:15], s16 offset:28
  1458. s_waitcnt vmcnt(0)
  1459. buffer_load_dword v25, off, s[12:15], s16 offset:32
  1460. s_waitcnt vmcnt(0)
  1461. v_mac_f32_e32 v12, v33, v24
  1462. v_mac_f32_e32 v12, v34, v25
  1463. v_mac_f32_e32 v12, v28, v21
  1464. v_mac_f32_e32 v12, v59, v73
  1465. v_mac_f32_e32 v12, v60, v74
  1466. v_mac_f32_e32 v12, v35, v39
  1467. v_mac_f32_e32 v12, v41, v40
  1468. v_mac_f32_e32 v12, v42, v53
  1469. v_mac_f32_e32 v12, v36, v54
  1470. v_mac_f32_e32 v12, v22, v31
  1471. v_mac_f32_e32 v12, v23, v32
  1472. v_mac_f32_e32 v12, v19, v27
  1473. v_mov_b32_e32 v19, 0
  1474. ds_read2_b32 v[24:25], v19 offset0:213 offset1:214
  1475. ds_read2_b32 v[26:27], v2 offset1:1
  1476. ds_read2_b32 v[28:29], v20 offset1:1
  1477. v_mov_b32_e32 v2, 0x5dc
  1478. v_mov_b32_e32 v20, 0x6b4
  1479. ds_read2_b32 v[30:31], v2 offset1:1
  1480. ds_read2_b32 v[32:33], v20 offset1:1
  1481. ds_read2_b32 v[34:35], v155 offset1:1
  1482. ds_read2_b64 v[20:23], v19 offset0:9 offset1:36
  1483. ds_read2_b32 v[44:45], v19 offset0:103 offset1:104
  1484. v_mov_b32_e32 v2, 0x69c
  1485. s_waitcnt lgkmcnt(2)
  1486. v_mac_f32_e32 v4, v34, v26
  1487. s_waitcnt lgkmcnt(1)
  1488. v_mac_f32_e32 v7, v34, v20
  1489. v_mac_f32_e32 v11, v34, v22
  1490. v_mac_f32_e32 v7, v35, v21
  1491. v_mac_f32_e32 v11, v35, v23
  1492. ds_read2_b64 v[20:23], v19 offset0:63 offset1:90
  1493. v_mac_f32_e32 v4, v35, v27
  1494. v_mov_b32_e32 v26, 0x41c
  1495. v_mac_f32_e32 v6, v34, v28
  1496. v_mac_f32_e32 v6, v35, v29
  1497. s_waitcnt lgkmcnt(0)
  1498. v_mac_f32_e32 v14, v34, v20
  1499. v_mac_f32_e32 v16, v34, v22
  1500. v_mac_f32_e32 v14, v35, v21
  1501. v_mac_f32_e32 v16, v35, v23
  1502. ds_read2_b64 v[20:23], v19 offset0:117 offset1:144
  1503. s_waitcnt lgkmcnt(0)
  1504. v_mac_f32_e32 v3, v34, v20
  1505. v_mac_f32_e32 v5, v34, v22
  1506. v_mac_f32_e32 v3, v35, v21
  1507. v_mac_f32_e32 v5, v35, v23
  1508. ds_read2_b64 v[20:23], v19 offset0:171 offset1:198
  1509. s_waitcnt lgkmcnt(0)
  1510. v_mac_f32_e32 v8, v34, v20
  1511. v_mac_f32_e32 v8, v35, v21
  1512. ds_read2_b32 v[20:21], v19 offset0:45 offset1:46
  1513. ds_read2_b32 v[36:37], v19 offset0:47 offset1:48
  1514. ds_read2_b32 v[38:39], v19 offset0:49 offset1:50
  1515. v_mac_f32_e32 v10, v34, v22
  1516. v_mac_f32_e32 v10, v35, v23
  1517. s_waitcnt lgkmcnt(2)
  1518. v_mac_f32_e32 v13, v34, v20
  1519. v_mac_f32_e32 v13, v35, v21
  1520. ds_read2_b32 v[40:41], v19 offset0:51 offset1:52
  1521. ds_read2_b32 v[20:21], v19 offset0:99 offset1:100
  1522. ds_read2_b32 v[42:43], v19 offset0:101 offset1:102
  1523. s_waitcnt lgkmcnt(1)
  1524. v_mac_f32_e32 v17, v34, v20
  1525. v_mac_f32_e32 v17, v35, v21
  1526. ds_read2_b32 v[46:47], v19 offset0:105 offset1:106
  1527. ds_read2_b32 v[20:21], v19 offset0:153 offset1:154
  1528. s_waitcnt lgkmcnt(0)
  1529. v_mac_f32_e32 v15, v34, v20
  1530. v_mac_f32_e32 v15, v35, v21
  1531. ds_read2_b32 v[20:21], v19 offset0:207 offset1:208
  1532. ds_read2_b32 v[48:49], v19 offset0:209 offset1:210
  1533. ds_read2_b32 v[50:51], v19 offset0:211 offset1:212
  1534. s_waitcnt lgkmcnt(2)
  1535. v_mac_f32_e32 v18, v34, v20
  1536. v_mac_f32_e32 v18, v35, v21
  1537. ds_read2_b32 v[20:21], v52 offset1:1
  1538. ds_read2_b32 v[22:23], v2 offset1:1
  1539. ds_read2_b32 v[26:27], v26 offset1:1
  1540. v_add_u32_e32 v2, vcc, 0x720, v155
  1541. s_waitcnt lgkmcnt(2)
  1542. v_mac_f32_e32 v9, v34, v20
  1543. s_waitcnt lgkmcnt(1)
  1544. v_mac_f32_e32 v12, v34, v22
  1545. v_mac_f32_e32 v9, v35, v21
  1546. v_mac_f32_e32 v12, v35, v23
  1547. ds_read2_b32 v[28:29], v155 offset0:2 offset1:230
  1548. ds_read2_b64 v[20:23], v19 offset0:10 offset1:37
  1549. ds_read2_b32 v[34:35], v155 offset0:228 offset1:229
  1550. ds_read_b32 v56, v155 offset:1832
  1551. ds_read2_b32 v[1:2], v2 offset1:1
  1552. s_waitcnt lgkmcnt(4)
  1553. v_mac_f32_e32 v13, v28, v36
  1554. s_waitcnt lgkmcnt(3)
  1555. v_mac_f32_e32 v7, v28, v20
  1556. v_mac_f32_e32 v11, v28, v22
  1557. s_waitcnt lgkmcnt(2)
  1558. v_mac_f32_e32 v7, v34, v21
  1559. v_mac_f32_e32 v11, v34, v23
  1560. ds_read2_b64 v[20:23], v19 offset0:64 offset1:91
  1561. v_mac_f32_e32 v17, v28, v42
  1562. v_mac_f32_e32 v4, v28, v26
  1563. v_mac_f32_e32 v13, v34, v37
  1564. v_mov_b32_e32 v26, 0x5cc
  1565. s_waitcnt lgkmcnt(0)
  1566. v_mac_f32_e32 v14, v28, v20
  1567. v_mac_f32_e32 v16, v28, v22
  1568. v_mac_f32_e32 v14, v34, v21
  1569. v_mac_f32_e32 v16, v34, v23
  1570. ds_read2_b64 v[20:23], v19 offset0:118 offset1:145
  1571. v_mov_b32_e32 v42, 0x6a4
  1572. v_mac_f32_e32 v17, v34, v43
  1573. v_mac_f32_e32 v4, v34, v27
  1574. v_mac_f32_e32 v18, v28, v48
  1575. s_waitcnt lgkmcnt(0)
  1576. v_mac_f32_e32 v3, v28, v20
  1577. v_mac_f32_e32 v5, v28, v22
  1578. v_mac_f32_e32 v3, v34, v21
  1579. v_mac_f32_e32 v5, v34, v23
  1580. ds_read2_b64 v[20:23], v19 offset0:172 offset1:199
  1581. v_mac_f32_e32 v18, v34, v49
  1582. v_mac_f32_e32 v13, v35, v38
  1583. v_mac_f32_e32 v17, v35, v44
  1584. v_mac_f32_e32 v18, v35, v50
  1585. s_waitcnt lgkmcnt(0)
  1586. v_mac_f32_e32 v8, v28, v20
  1587. v_mac_f32_e32 v10, v28, v22
  1588. v_mac_f32_e32 v8, v34, v21
  1589. v_mac_f32_e32 v10, v34, v23
  1590. ds_read2_b64 v[20:23], v19 offset0:11 offset1:38
  1591. v_mac_f32_e32 v13, v29, v39
  1592. v_mac_f32_e32 v17, v29, v45
  1593. v_mac_f32_e32 v18, v29, v51
  1594. v_mac_f32_e32 v13, v1, v40
  1595. s_waitcnt lgkmcnt(0)
  1596. v_mac_f32_e32 v7, v35, v20
  1597. v_mac_f32_e32 v11, v35, v22
  1598. v_mac_f32_e32 v7, v29, v21
  1599. v_mac_f32_e32 v11, v29, v23
  1600. ds_read2_b64 v[20:23], v19 offset0:65 offset1:92
  1601. v_mac_f32_e32 v17, v1, v46
  1602. v_mac_f32_e32 v18, v1, v24
  1603. v_mac_f32_e32 v13, v2, v41
  1604. v_mac_f32_e32 v17, v2, v47
  1605. s_waitcnt lgkmcnt(0)
  1606. v_mac_f32_e32 v14, v35, v20
  1607. v_mac_f32_e32 v16, v35, v22
  1608. v_mac_f32_e32 v14, v29, v21
  1609. v_mac_f32_e32 v16, v29, v23
  1610. ds_read2_b64 v[20:23], v19 offset0:119 offset1:146
  1611. v_mac_f32_e32 v18, v2, v25
  1612. s_waitcnt lgkmcnt(0)
  1613. v_mac_f32_e32 v3, v35, v20
  1614. v_mac_f32_e32 v5, v35, v22
  1615. v_mac_f32_e32 v3, v29, v21
  1616. v_mac_f32_e32 v5, v29, v23
  1617. ds_read2_b64 v[20:23], v19 offset0:173 offset1:200
  1618. s_waitcnt lgkmcnt(0)
  1619. v_mac_f32_e32 v8, v35, v20
  1620. v_mac_f32_e32 v10, v35, v22
  1621. v_mac_f32_e32 v8, v29, v21
  1622. v_mac_f32_e32 v10, v29, v23
  1623. ds_read2_b32 v[20:21], v19 offset0:155 offset1:156
  1624. ds_read2_b32 v[22:23], v19 offset0:157 offset1:158
  1625. ds_read2_b32 v[36:37], v19 offset0:159 offset1:160
  1626. s_waitcnt lgkmcnt(2)
  1627. v_mac_f32_e32 v15, v28, v20
  1628. v_mov_b32_e32 v20, 0x4f4
  1629. v_mac_f32_e32 v15, v34, v21
  1630. ds_read2_b32 v[20:21], v20 offset1:1
  1631. ds_read2_b32 v[26:27], v26 offset1:1
  1632. ds_read2_b32 v[42:43], v42 offset1:1
  1633. s_waitcnt lgkmcnt(4)
  1634. v_mac_f32_e32 v15, v35, v22
  1635. v_mov_b32_e32 v22, 0x4fc
  1636. s_waitcnt lgkmcnt(2)
  1637. v_mac_f32_e32 v6, v28, v20
  1638. s_waitcnt lgkmcnt(1)
  1639. v_mac_f32_e32 v9, v28, v26
  1640. s_waitcnt lgkmcnt(0)
  1641. v_mac_f32_e32 v12, v28, v42
  1642. v_mov_b32_e32 v20, 0x424
  1643. v_mac_f32_e32 v6, v34, v21
  1644. v_mac_f32_e32 v9, v34, v27
  1645. v_mac_f32_e32 v12, v34, v43
  1646. v_mov_b32_e32 v28, 0x5d4
  1647. ds_read2_b32 v[20:21], v20 offset1:1
  1648. ds_read2_b32 v[26:27], v22 offset1:1
  1649. ds_read2_b32 v[42:43], v28 offset1:1
  1650. v_mov_b32_e32 v22, 0x42c
  1651. v_mac_f32_e32 v15, v29, v23
  1652. s_waitcnt lgkmcnt(2)
  1653. v_mac_f32_e32 v4, v35, v20
  1654. s_waitcnt lgkmcnt(1)
  1655. v_mac_f32_e32 v6, v35, v26
  1656. v_mov_b32_e32 v20, 0x6ac
  1657. v_mov_b32_e32 v26, 0x504
  1658. ds_read2_b32 v[48:49], v20 offset1:1
  1659. ds_read2_b32 v[52:53], v22 offset1:1
  1660. ds_read2_b32 v[54:55], v26 offset1:1
  1661. v_mac_f32_e32 v4, v29, v21
  1662. ds_read2_b64 v[20:23], v19 offset0:12 offset1:39
  1663. s_waitcnt lgkmcnt(4)
  1664. v_mac_f32_e32 v9, v35, v42
  1665. s_waitcnt lgkmcnt(3)
  1666. v_mac_f32_e32 v12, v35, v48
  1667. v_mac_f32_e32 v6, v29, v27
  1668. v_mac_f32_e32 v9, v29, v43
  1669. s_waitcnt lgkmcnt(0)
  1670. v_mac_f32_e32 v7, v1, v20
  1671. v_mac_f32_e32 v11, v1, v22
  1672. v_mac_f32_e32 v7, v2, v21
  1673. v_mac_f32_e32 v11, v2, v23
  1674. ds_read2_b64 v[20:23], v19 offset0:66 offset1:93
  1675. v_mac_f32_e32 v12, v29, v49
  1676. v_mac_f32_e32 v15, v1, v36
  1677. v_mac_f32_e32 v4, v1, v52
  1678. v_mac_f32_e32 v6, v1, v54
  1679. s_waitcnt lgkmcnt(0)
  1680. v_mac_f32_e32 v14, v1, v20
  1681. v_mac_f32_e32 v16, v1, v22
  1682. v_mac_f32_e32 v14, v2, v21
  1683. v_mac_f32_e32 v16, v2, v23
  1684. ds_read2_b64 v[20:23], v19 offset0:120 offset1:147
  1685. v_mac_f32_e32 v9, v1, v30
  1686. v_mac_f32_e32 v12, v1, v32
  1687. v_mac_f32_e32 v15, v2, v37
  1688. v_mac_f32_e32 v4, v2, v53
  1689. s_waitcnt lgkmcnt(0)
  1690. v_mac_f32_e32 v3, v1, v20
  1691. v_mac_f32_e32 v5, v1, v22
  1692. v_mac_f32_e32 v3, v2, v21
  1693. v_mac_f32_e32 v5, v2, v23
  1694. ds_read2_b64 v[20:23], v19 offset0:174 offset1:201
  1695. v_mac_f32_e32 v6, v2, v55
  1696. v_mac_f32_e32 v9, v2, v31
  1697. v_mac_f32_e32 v12, v2, v33
  1698. s_waitcnt lgkmcnt(0)
  1699. v_mac_f32_e32 v8, v1, v20
  1700. v_mov_b32_e32 v20, 0x92492493
  1701. v_mul_hi_i32 v20, s6, v20
  1702. v_mac_f32_e32 v10, v1, v22
  1703. v_mac_f32_e32 v8, v2, v21
  1704. v_mac_f32_e32 v10, v2, v23
  1705. v_add_u32_e32 v1, vcc, s6, v20
  1706. v_lshrrev_b32_e32 v2, 31, v1
  1707. v_ashrrev_i32_e32 v1, 7, v1
  1708. v_add_u32_e32 v28, vcc, v2, v1
  1709. v_lshlrev_b32_e32 v1, 4, v28
  1710. v_ashrrev_i32_e32 v2, 31, v1
  1711. v_lshlrev_b64 v[20:21], 2, v[1:2]
  1712. v_mov_b32_e32 v22, s3
  1713. v_add_u32_e32 v24, vcc, s2, v20
  1714. v_addc_u32_e32 v25, vcc, v22, v21, vcc
  1715. ds_read2_b32 v[20:21], v19 offset0:26 offset1:53
  1716. v_mov_b32_e32 v2, s3
  1717. s_waitcnt lgkmcnt(0)
  1718. v_mac_f32_e32 v7, v56, v20
  1719. v_mac_f32_e32 v13, v56, v21
  1720. ds_read2_b32 v[20:21], v19 offset0:80 offset1:107
  1721. s_waitcnt lgkmcnt(0)
  1722. v_mac_f32_e32 v11, v56, v20
  1723. v_or_b32_e32 v20, 4, v1
  1724. v_mac_f32_e32 v17, v56, v21
  1725. v_ashrrev_i32_e32 v21, 31, v20
  1726. v_lshlrev_b64 v[20:21], 2, v[20:21]
  1727. v_add_u32_e32 v20, vcc, s2, v20
  1728. v_addc_u32_e32 v21, vcc, v2, v21, vcc
  1729. flat_load_dwordx4 v[20:23], v[20:21]
  1730. flat_load_dwordx4 v[24:27], v[24:25]
  1731. s_waitcnt vmcnt(0) lgkmcnt(0)
  1732. v_add_f32_e32 v2, v7, v24
  1733. v_add_f32_e32 v7, v13, v25
  1734. ds_read2_b32 v[24:25], v19 offset0:134 offset1:161
  1735. v_add_f32_e32 v13, v17, v27
  1736. v_add_u32_e32 v17, vcc, 0x3c8, v19
  1737. v_add_f32_e32 v11, v11, v26
  1738. s_waitcnt lgkmcnt(0)
  1739. v_mac_f32_e32 v14, v56, v24
  1740. v_mac_f32_e32 v15, v56, v25
  1741. ds_read2_b32 v[24:25], v19 offset0:188 offset1:215
  1742. s_waitcnt lgkmcnt(0)
  1743. v_mac_f32_e32 v16, v56, v24
  1744. v_mac_f32_e32 v18, v56, v25
  1745. ds_read2_b32 v[24:25], v17 offset1:27
  1746. v_add_u32_e32 v17, vcc, 0x4a0, v19
  1747. s_waitcnt lgkmcnt(0)
  1748. v_mac_f32_e32 v3, v56, v24
  1749. v_mac_f32_e32 v4, v56, v25
  1750. ds_read2_b32 v[24:25], v17 offset1:27
  1751. v_add_u32_e32 v17, vcc, 0x578, v19
  1752. s_waitcnt lgkmcnt(0)
  1753. v_mac_f32_e32 v5, v56, v24
  1754. v_mac_f32_e32 v6, v56, v25
  1755. ds_read2_b32 v[24:25], v17 offset1:27
  1756. v_add_u32_e32 v17, vcc, 0x650, v19
  1757. v_mul_lo_i32 v19, v28, s9
  1758. s_waitcnt lgkmcnt(0)
  1759. v_mac_f32_e32 v8, v56, v24
  1760. v_mac_f32_e32 v9, v56, v25
  1761. ds_read2_b32 v[24:25], v17 offset1:27
  1762. v_add_u32_e32 v17, vcc, v19, v151
  1763. v_mul_lo_i32 v17, v17, s8
  1764. s_waitcnt lgkmcnt(0)
  1765. v_mac_f32_e32 v10, v56, v24
  1766. v_add_u32_e32 v24, vcc, v17, v255
  1767. v_mac_f32_e32 v12, v56, v25
  1768. v_ashrrev_i32_e32 v25, 31, v24
  1769. v_lshlrev_b64 v[25:26], 2, v[24:25]
  1770. v_add_u32_e32 v25, vcc, s0, v25
  1771. v_addc_u32_e32 v26, vcc, v0, v26, vcc
  1772. v_add_u32_e32 v27, vcc, 0xc400, v24
  1773. v_ashrrev_i32_e32 v28, 31, v27
  1774. v_lshlrev_b64 v[27:28], 2, v[27:28]
  1775. v_add_u32_e32 v27, vcc, s0, v27
  1776. v_addc_u32_e32 v28, vcc, v0, v28, vcc
  1777. v_max_f32_e32 v0, 0, v2
  1778. flat_store_dword v[25:26], v0
  1779. v_max_f32_e32 v0, 0, v7
  1780. flat_store_dword v[27:28], v0
  1781. v_add_u32_e32 v27, vcc, 0x18800, v24
  1782. v_ashrrev_i32_e32 v28, 31, v27
  1783. v_lshlrev_b64 v[27:28], 2, v[27:28]
  1784. v_mov_b32_e32 v0, s1
  1785. v_add_u32_e32 v27, vcc, s0, v27
  1786. v_addc_u32_e32 v28, vcc, v0, v28, vcc
  1787. v_max_f32_e32 v0, 0, v11
  1788. flat_store_dword v[27:28], v0
  1789. v_add_u32_e32 v27, vcc, 0x24c00, v24
  1790. v_ashrrev_i32_e32 v28, 31, v27
  1791. v_lshlrev_b64 v[27:28], 2, v[27:28]
  1792. v_mov_b32_e32 v0, s1
  1793. v_add_u32_e32 v27, vcc, s0, v27
  1794. v_addc_u32_e32 v28, vcc, v0, v28, vcc
  1795. v_max_f32_e32 v0, 0, v13
  1796. flat_store_dword v[27:28], v0
  1797. v_add_u32_e32 v27, vcc, 0x31000, v24
  1798. v_ashrrev_i32_e32 v28, 31, v27
  1799. v_lshlrev_b64 v[27:28], 2, v[27:28]
  1800. v_mov_b32_e32 v0, s1
  1801. v_add_u32_e32 v27, vcc, s0, v27
  1802. v_addc_u32_e32 v28, vcc, v0, v28, vcc
  1803. v_add_u32_e32 v13, vcc, 0x3d400, v24
  1804. v_add_f32_e32 v0, v14, v20
  1805. v_ashrrev_i32_e32 v14, 31, v13
  1806. v_lshlrev_b64 v[13:14], 2, v[13:14]
  1807. v_max_f32_e32 v0, 0, v0
  1808. flat_store_dword v[27:28], v0
  1809. v_mov_b32_e32 v0, s1
  1810. v_add_u32_e32 v13, vcc, s0, v13
  1811. v_addc_u32_e32 v14, vcc, v0, v14, vcc
  1812. v_add_f32_e32 v0, v15, v21
  1813. v_max_f32_e32 v0, 0, v0
  1814. flat_store_dword v[13:14], v0
  1815. v_add_u32_e32 v13, vcc, 0x49800, v24
  1816. v_ashrrev_i32_e32 v14, 31, v13
  1817. v_lshlrev_b64 v[13:14], 2, v[13:14]
  1818. v_mov_b32_e32 v7, s1
  1819. v_add_u32_e32 v13, vcc, s0, v13
  1820. v_add_f32_e32 v0, v16, v22
  1821. v_addc_u32_e32 v14, vcc, v7, v14, vcc
  1822. v_add_u32_e32 v15, vcc, 0x55c00, v24
  1823. v_max_f32_e32 v0, 0, v0
  1824. v_ashrrev_i32_e32 v16, 31, v15
  1825. flat_store_dword v[13:14], v0
  1826. v_lshlrev_b64 v[13:14], 2, v[15:16]
  1827. v_add_f32_e32 v2, v18, v23
  1828. v_max_f32_e32 v0, 0, v2
  1829. v_mov_b32_e32 v2, s1
  1830. v_add_u32_e32 v13, vcc, s0, v13
  1831. v_addc_u32_e32 v14, vcc, v2, v14, vcc
  1832. flat_store_dword v[13:14], v0
  1833. v_or_b32_e32 v13, 8, v1
  1834. v_ashrrev_i32_e32 v14, 31, v13
  1835. v_lshlrev_b64 v[13:14], 2, v[13:14]
  1836. v_mov_b32_e32 v0, s3
  1837. v_add_u32_e32 v17, vcc, s2, v13
  1838. v_addc_u32_e32 v18, vcc, v0, v14, vcc
  1839. v_add_u32_e32 v13, vcc, 0x62000, v24
  1840. v_ashrrev_i32_e32 v14, 31, v13
  1841. v_lshlrev_b64 v[13:14], 2, v[13:14]
  1842. v_mov_b32_e32 v0, s1
  1843. v_add_u32_e32 v21, vcc, s0, v13
  1844. v_addc_u32_e32 v22, vcc, v0, v14, vcc
  1845. v_or_b32_e32 v0, 12, v1
  1846. v_ashrrev_i32_e32 v1, 31, v0
  1847. v_lshlrev_b64 v[0:1], 2, v[0:1]
  1848. v_mov_b32_e32 v2, s3
  1849. v_add_u32_e32 v0, vcc, s2, v0
  1850. v_addc_u32_e32 v1, vcc, v2, v1, vcc
  1851. flat_load_dwordx4 v[13:16], v[0:1]
  1852. flat_load_dwordx4 v[17:20], v[17:18]
  1853. v_mov_b32_e32 v2, s1
  1854. s_waitcnt vmcnt(0) lgkmcnt(0)
  1855. v_add_f32_e32 v0, v3, v17
  1856. v_max_f32_e32 v0, 0, v0
  1857. flat_store_dword v[21:22], v0
  1858. v_add_u32_e32 v0, vcc, 0x6e400, v24
  1859. v_ashrrev_i32_e32 v1, 31, v0
  1860. v_lshlrev_b64 v[0:1], 2, v[0:1]
  1861. v_add_u32_e32 v0, vcc, s0, v0
  1862. v_addc_u32_e32 v1, vcc, v2, v1, vcc
  1863. v_add_f32_e32 v2, v4, v18
  1864. v_max_f32_e32 v2, 0, v2
  1865. flat_store_dword v[0:1], v2
  1866. v_add_u32_e32 v0, vcc, 0x7a800, v24
  1867. v_ashrrev_i32_e32 v1, 31, v0
  1868. v_lshlrev_b64 v[0:1], 2, v[0:1]
  1869. v_mov_b32_e32 v2, s1
  1870. v_add_u32_e32 v0, vcc, s0, v0
  1871. v_addc_u32_e32 v1, vcc, v2, v1, vcc
  1872. v_add_f32_e32 v2, v5, v19
  1873. v_max_f32_e32 v2, 0, v2
  1874. flat_store_dword v[0:1], v2
  1875. v_add_f32_e32 v2, v6, v20
  1876. v_add_u32_e32 v1, vcc, 0x86c00, v24
  1877. v_max_f32_e32 v3, 0, v2
  1878. v_ashrrev_i32_e32 v2, 31, v1
  1879. v_add_u32_e32 v0, vcc, 0x93000, v24
  1880. v_lshlrev_b64 v[1:2], 2, v[1:2]
  1881. v_mov_b32_e32 v4, s1
  1882. v_add_u32_e32 v1, vcc, s0, v1
  1883. v_addc_u32_e32 v2, vcc, v4, v2, vcc
  1884. flat_store_dword v[1:2], v3
  1885. v_add_f32_e32 v1, v8, v13
  1886. v_max_f32_e32 v2, 0, v1
  1887. v_ashrrev_i32_e32 v1, 31, v0
  1888. v_lshlrev_b64 v[0:1], 2, v[0:1]
  1889. v_mov_b32_e32 v3, s1
  1890. v_add_u32_e32 v0, vcc, s0, v0
  1891. v_addc_u32_e32 v1, vcc, v3, v1, vcc
  1892. flat_store_dword v[0:1], v2
  1893. v_add_f32_e32 v0, v9, v14
  1894. v_max_f32_e32 v2, 0, v0
  1895. v_add_u32_e32 v0, vcc, s7, v25
  1896. v_addc_u32_e32 v1, vcc, 0, v26, vcc
  1897. flat_store_dword v[0:1], v2
  1898. v_add_f32_e32 v0, v10, v15
  1899. v_max_f32_e32 v2, 0, v0
  1900. v_add_u32_e32 v0, vcc, s5, v25
  1901. v_addc_u32_e32 v1, vcc, 0, v26, vcc
  1902. flat_store_dword v[0:1], v2
  1903. v_add_f32_e32 v0, v12, v16
  1904. v_max_f32_e32 v2, 0, v0
  1905. v_add_u32_e32 v0, vcc, s4, v25
  1906. v_addc_u32_e32 v1, vcc, 0, v26, vcc
  1907. flat_store_dword v[0:1], v2
  1908. s_endpgm
  1909. .Lfunc_end2:
  1910. .size fuse_conv2d_relu_kernel2, .Lfunc_end2-fuse_conv2d_relu_kernel2
Add Comment
Please sign in to add a comment.