Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- .text
- .hsa_code_object_version 2,1
- .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
; ---------------------------------------------------------------------------
; fuse_conv2d_relu_kernel0
;
; Purpose (as established by the instruction stream below): one work-item per
; destination dword. The flat destination index g is decomposed as
;     g = n*154584 + c*51528 + y*228 + x      (51528 = 226*228, 154584 = 3*51528)
; Work-items with x >= 226 or g >= 154584 exit without storing. The others
; store either 0 (border) or the source element at
;     n*150528 + c*50176 + (y-1)*224 + (x-1)  (50176 = 224*224, 150528 = 3*50176)
; when 1 <= x <= 224 and 1 <= y <= 224. In effect this is a zero-padding copy
; of a 224x224 plane into a 226-wide window of a 228-pitch row.
; NOTE(review): tensor layout (batch/channel/row/col naming) is inferred from
; the divisors only - confirm against the host-side launch code.
;
; Register use on entry (per the amd_kernel_code_t flags below; the code is
; consistent with this):
;   s[4:5] = kernarg segment base (loaded from at offsets 0x0 and 0x8)
;   s6     = workgroup id X
;   v0     = work-item id X
; Kernel arguments: +0x0 = destination pointer (used by the final store),
;                   +0x8 = source pointer (used by the guarded load).
; The `s_lshl_b32 s0, s6, 8` below implies 256 work-items per workgroup -
; presumably fixed by the launcher; verify.
; ---------------------------------------------------------------------------
- .globl fuse_conv2d_relu_kernel0
- .p2align 8
- .type fuse_conv2d_relu_kernel0,@function
- .amdgpu_hsa_kernel fuse_conv2d_relu_kernel0
- fuse_conv2d_relu_kernel0:
- .amd_kernel_code_t
- amd_code_version_major = 1
- amd_code_version_minor = 1
- amd_machine_kind = 1
- amd_machine_version_major = 8
- amd_machine_version_minor = 0
- amd_machine_version_stepping = 3
- kernel_code_entry_byte_offset = 256
- kernel_code_prefetch_byte_size = 0
- max_scratch_backing_memory_byte_size = 0
- granulated_workitem_vgpr_count = 1
- granulated_wavefront_sgpr_count = 1
- priority = 0
- float_mode = 192
- priv = 0
- enable_dx10_clamp = 1
- debug_mode = 0
- enable_ieee_mode = 1
- enable_sgpr_private_segment_wave_byte_offset = 0
- user_sgpr_count = 6
- enable_trap_handler = 1
- enable_sgpr_workgroup_id_x = 1
- enable_sgpr_workgroup_id_y = 0
- enable_sgpr_workgroup_id_z = 0
- enable_sgpr_workgroup_info = 0
- enable_vgpr_workitem_id = 0
- enable_exception_msb = 0
- granulated_lds_size = 0
- enable_exception = 0
- enable_sgpr_private_segment_buffer = 1
- enable_sgpr_dispatch_ptr = 0
- enable_sgpr_queue_ptr = 0
- enable_sgpr_kernarg_segment_ptr = 1
- enable_sgpr_dispatch_id = 0
- enable_sgpr_flat_scratch_init = 0
- enable_sgpr_private_segment_size = 0
- enable_sgpr_grid_workgroup_count_x = 0
- enable_sgpr_grid_workgroup_count_y = 0
- enable_sgpr_grid_workgroup_count_z = 0
- enable_ordered_append_gds = 0
- private_element_size = 1
- is_ptr64 = 1
- is_dynamic_callstack = 0
- is_debug_enabled = 0
- is_xnack_enabled = 0
- workitem_private_segment_byte_size = 0
- workgroup_group_segment_byte_size = 0
- gds_segment_byte_size = 0
; 16 bytes of kernel arguments: the two 8-byte pointers read at 0x0 and 0x8.
- kernarg_segment_byte_size = 16
- workgroup_fbarrier_count = 0
- wavefront_sgpr_count = 10
- workitem_vgpr_count = 7
- reserved_vgpr_first = 0
- reserved_vgpr_count = 0
- reserved_sgpr_first = 0
- reserved_sgpr_count = 0
- debug_wavefront_private_segment_offset_sgpr = 0
- debug_private_segment_buffer_sgpr = 0
- kernarg_segment_alignment = 4
- group_segment_alignment = 4
- private_segment_alignment = 4
- wavefront_size = 6
- call_convention = -1
- runtime_loader_kernel_symbol = 0
- .end_amd_kernel_code_t
; --- Bounds guard: keep only lanes with wg*256 + tid < 0x25bd8 (154584). ---
; v1 = 154584 - tid; s0 = wg*256; lane stays active when s0 < v1.
; The old exec saved in s[0:1] is never restored on this path: every exit
; goes straight to s_endpgm, so s0 is free to be reused immediately.
- v_sub_u32_e32 v1, vcc, 0x25bd8, v0
- s_lshl_b32 s0, s6, 8
- v_cmp_lt_i32_e32 vcc, s0, v1
- s_and_saveexec_b64 s[0:1], vcc
- s_cbranch_execz BB0_5
- BB0_1:
; --- x = g mod 228, computed from wg*28 + tid (256 == 28 mod 228, so this is
; congruent to wg*256 + tid modulo 228). ---
; Division by 228 is the multiply-high sequence: mulhi by 0x8fb823ef
; (= ceil(2^39/228), negative as i32, hence the "+ v1" correction), arithmetic
; shift right 7, then add the sign bit to round toward zero.
; v0 (tid) is dead after being folded into v1 and is reused as scratch.
- s_mul_i32 s0, s6, 28
- v_add_u32_e32 v1, vcc, s0, v0
- v_mov_b32_e32 v0, 0x8fb823ef
- v_mul_hi_i32 v0, v1, v0
- s_movk_i32 s0, 0xe4
- v_add_u32_e32 v0, vcc, v0, v1
- v_lshrrev_b32_e32 v2, 31, v0
- v_ashrrev_i32_e32 v0, 7, v0
- v_add_u32_e32 v0, vcc, v2, v0
; v0 = v1 - (v1/228)*228  ->  x. Lanes with x >= 226 (0xe2) have nothing to
; write and are masked off; if no lane remains the wave ends.
- v_mul_lo_i32 v0, v0, s0
- v_mov_b32_e32 v2, 0xe2
- v_subrev_u32_e32 v0, vcc, v0, v1
- v_cmp_lt_i32_e32 vcc, v0, v2
- s_and_saveexec_b64 s[2:3], vcc
- s_cbranch_execz BB0_5
- BB0_2:
; --- Rebuild the full index g = wg*256 + tid (v1 held wg*28 + tid; add
; wg*228) and split it into n, c, y. s6 (workgroup id) is consumed here and
; then overwritten by the destination-pointer load into s[6:7]. ---
; Interleaved divide-by-constant chains:
;   0x28b30361, shift 13  -> g / 51528            (v2)
;   0x55555556, no shift  -> (g/51528) / 3        (v3), then c = v2 - 3*v3
;   0x8fb823ef, shift 7   -> g / 228              (v4)
;   0x487ede05, shift 6   -> (g/228) / 226        (v5), then y = v4 - 226*v5
;   0x6c880903, shift 16  -> g / 154584 = n       (v2, via SDWA sext of the
;                                                  high word plus sign bit)
; NOTE(review): given the entry guard g < 154584, n evaluates to 0 for all
; non-negative g; the term is kept because the code computes it.
- s_mul_i32 s6, s6, s0
- v_add_u32_e32 v1, vcc, s6, v1
- v_mov_b32_e32 v2, 0x28b30361
- v_mov_b32_e32 v4, 0x8fb823ef
- v_mul_hi_i32 v4, v1, v4
- v_mul_hi_i32 v2, v1, v2
- s_movk_i32 s0, 0xe2
; Start fetching the destination base pointer (kernarg +0x0) early; it is
; only consumed after the s_waitcnt lgkmcnt(0) in BB0_4.
- s_load_dwordx2 s[6:7], s[4:5], 0x0
- v_add_u32_e32 v4, vcc, v4, v1
- v_lshrrev_b32_e32 v3, 31, v2
- v_ashrrev_i32_e32 v2, 13, v2
- v_add_u32_e32 v2, vcc, v2, v3
- v_mov_b32_e32 v3, 0x55555556
- v_lshrrev_b32_e32 v6, 31, v4
- v_ashrrev_i32_e32 v4, 7, v4
- v_mul_hi_i32 v3, v2, v3
- v_add_u32_e32 v4, vcc, v6, v4
- v_mov_b32_e32 v6, 0x487ede05
- v_mul_hi_i32 v6, v4, v6
- v_lshrrev_b32_e32 v5, 31, v3
- v_add_u32_e32 v3, vcc, v3, v5
- v_mul_lo_i32 v3, v3, 3
- v_lshrrev_b32_e32 v5, 31, v6
- v_ashrrev_i32_e32 v6, 6, v6
- v_add_u32_e32 v5, vcc, v6, v5
- v_mov_b32_e32 v6, 0x6c880903
- v_mul_lo_i32 v5, v5, s0
- v_mul_hi_i32 v6, v1, v6
; v3 = c = (g/51528) mod 3
- v_subrev_u32_e32 v3, vcc, v3, v2
; s[2:3] = (x > 0)
- v_cmp_lt_i32_e64 s[2:3], 0, v0
; v1 = y = (g/228) mod 226
- v_subrev_u32_e32 v1, vcc, v5, v4
- v_lshrrev_b32_e32 v2, 31, v6
; v2 = n = (mulhi >> 16, sign-extended from the upper word) + sign bit
- v_add_u32_sdwa v2, vcc, sext(v6), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; --- Interior test. (y-1) <u 224 is a single unsigned compare for
; 1 <= y <= 224; combined with x > 0 and x != 225 (x < 226 already holds),
; this selects 1 <= x <= 224. ---
- v_add_u32_e32 v5, vcc, -1, v1
- v_mov_b32_e32 v6, 0xe0
- v_mov_b32_e32 v4, 0xe1
- v_cmp_lt_u32_e32 vcc, v5, v6
- v_cmp_ne_u32_e64 s[0:1], v0, v4
- s_and_b64 s[2:3], s[2:3], vcc
; v4 is the value that will be stored; default 0 for border lanes.
- v_mov_b32_e32 v4, 0
- s_and_b64 s[2:3], s[0:1], s[2:3]
- s_and_saveexec_b64 s[0:1], s[2:3]
- s_cbranch_execz BB0_4
- BB0_3:
; --- Interior lanes only: load the source element. ---
; Source base pointer comes from kernarg +0x8.
; Element index = n*0x24c00 + c*0xc400 + y*0xe0 + x - 225, i.e.
; n*150528 + c*50176 + (y-1)*224 + (x-1)   (0xffffff1f == -225).
- s_load_dwordx2 s[2:3], s[4:5], 0x8
- s_mov_b32 s4, 0xc400
- v_mul_lo_i32 v4, v3, s4
- s_mov_b32 s4, 0x24c00
- v_mul_lo_i32 v5, v2, s4
- s_movk_i32 s4, 0xe0
- v_mul_lo_i32 v6, v1, s4
- v_add_u32_e32 v5, vcc, v5, v0
- v_add_u32_e32 v5, vcc, v5, v6
- v_add_u32_e32 v4, vcc, v4, v5
- v_add_u32_e32 v4, vcc, 0xffffff1f, v4
; Sign-extend the index to 64 bits and scale by 4 (dword elements), then add
; the 64-bit base with carry.
- v_ashrrev_i32_e32 v5, 31, v4
- v_lshlrev_b64 v[4:5], 2, v[4:5]
- s_waitcnt lgkmcnt(0)
- v_mov_b32_e32 v6, s3
- v_add_u32_e32 v4, vcc, s2, v4
- v_addc_u32_e32 v5, vcc, v6, v5, vcc
- flat_load_dword v4, v[4:5]
- BB0_4:
; --- Rejoin: re-enable the border lanes (v4 == 0 there) and store. ---
; Destination index = n*0x25bd8 + c*0xc948 + y*0xe4 + x
;                   = n*154584 + c*51528 + y*228 + x.
- s_or_b64 exec, exec, s[0:1]
- s_mov_b32 s0, 0xc948
- v_mul_lo_i32 v3, v3, s0
- s_mov_b32 s0, 0x25bd8
- v_mul_lo_i32 v2, v2, s0
- s_movk_i32 s0, 0xe4
- v_mul_lo_i32 v1, v1, s0
- v_add_u32_e32 v0, vcc, v2, v0
; Wait for the scalar load of the destination pointer (s[6:7]) issued in
; BB0_2 before reading s7/s6.
- s_waitcnt lgkmcnt(0)
- v_mov_b32_e32 v2, s7
- v_add_u32_e32 v0, vcc, v0, v1
- v_add_u32_e32 v0, vcc, v3, v0
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- v_add_u32_e32 v0, vcc, s6, v0
- v_addc_u32_e32 v1, vcc, v2, v1, vcc
; Wait for the flat_load from BB0_3 (if any lane took it) before storing v4.
- s_waitcnt vmcnt(0)
- flat_store_dword v[0:1], v4
- BB0_5:
- s_endpgm
- .Lfunc_end0:
- .size fuse_conv2d_relu_kernel0, .Lfunc_end0-fuse_conv2d_relu_kernel0
; ---------------------------------------------------------------------------
; fuse_conv2d_relu_kernel1
;
; Purpose (as established by the instruction stream below): in-place pass over
; a single buffer (one pointer, kernarg +0x0). Each work-item addresses the
; dword at index
;     n*154584 + c*51528 + y*228 + x
; where x = 4*(q mod 57) + (tid & 3) and q = wg*64 + (tid >> 2).
; If x < 226 the dword is loaded and stored back unchanged; otherwise 0 is
; stored. Net effect: positions 226 and 227 of every 228-element row are
; zeroed.
; NOTE(review): layout naming (n/c/y/x) is inferred from the divisors only -
; confirm against the host-side launch code.
;
; Register use on entry (per the amd_kernel_code_t flags below; the code is
; consistent with this):
;   s[4:5] = kernarg segment base
;   s6     = workgroup id X
;   v0     = work-item id X
; The `s_lshl_b32 s0, s6, 6` applied to tid>>2 implies 256 work-items per
; workgroup - presumably fixed by the launcher; verify.
; ---------------------------------------------------------------------------
- .globl fuse_conv2d_relu_kernel1
- .p2align 8
- .type fuse_conv2d_relu_kernel1,@function
- .amdgpu_hsa_kernel fuse_conv2d_relu_kernel1
- fuse_conv2d_relu_kernel1:
- .amd_kernel_code_t
- amd_code_version_major = 1
- amd_code_version_minor = 1
- amd_machine_kind = 1
- amd_machine_version_major = 8
- amd_machine_version_minor = 0
- amd_machine_version_stepping = 3
- kernel_code_entry_byte_offset = 256
- kernel_code_prefetch_byte_size = 0
- max_scratch_backing_memory_byte_size = 0
- granulated_workitem_vgpr_count = 1
- granulated_wavefront_sgpr_count = 1
- priority = 0
- float_mode = 192
- priv = 0
- enable_dx10_clamp = 1
- debug_mode = 0
- enable_ieee_mode = 1
- enable_sgpr_private_segment_wave_byte_offset = 0
- user_sgpr_count = 6
- enable_trap_handler = 1
- enable_sgpr_workgroup_id_x = 1
- enable_sgpr_workgroup_id_y = 0
- enable_sgpr_workgroup_id_z = 0
- enable_sgpr_workgroup_info = 0
- enable_vgpr_workitem_id = 0
- enable_exception_msb = 0
- granulated_lds_size = 0
- enable_exception = 0
- enable_sgpr_private_segment_buffer = 1
- enable_sgpr_dispatch_ptr = 0
- enable_sgpr_queue_ptr = 0
- enable_sgpr_kernarg_segment_ptr = 1
- enable_sgpr_dispatch_id = 0
- enable_sgpr_flat_scratch_init = 0
- enable_sgpr_private_segment_size = 0
- enable_sgpr_grid_workgroup_count_x = 0
- enable_sgpr_grid_workgroup_count_y = 0
- enable_sgpr_grid_workgroup_count_z = 0
- enable_ordered_append_gds = 0
- private_element_size = 1
- is_ptr64 = 1
- is_dynamic_callstack = 0
- is_debug_enabled = 0
- is_xnack_enabled = 0
- workitem_private_segment_byte_size = 0
- workgroup_group_segment_byte_size = 0
- gds_segment_byte_size = 0
; 8 bytes of kernel arguments: the single pointer read at offset 0x0.
- kernarg_segment_byte_size = 8
- workgroup_fbarrier_count = 0
- wavefront_sgpr_count = 9
- workitem_vgpr_count = 8
- reserved_vgpr_first = 0
- reserved_vgpr_count = 0
- reserved_sgpr_first = 0
- reserved_sgpr_count = 0
- debug_wavefront_private_segment_offset_sgpr = 0
- debug_private_segment_buffer_sgpr = 0
- kernarg_segment_alignment = 4
- group_segment_alignment = 4
- private_segment_alignment = 4
- wavefront_size = 6
- call_convention = -1
- runtime_loader_kernel_symbol = 0
- .end_amd_kernel_code_t
; --- Bounds guard on q = wg*64 + (tid >> 2): keep lanes with q < 0x96f6
; (38646 = 154584 / 4). The saved exec in s[2:3] is never restored; the only
; exit is s_endpgm, so s2/s3 are reused as scratch below. ---
- v_lshrrev_b32_e32 v1, 2, v0
- v_sub_u32_e32 v2, vcc, 0x96f6, v1
- s_lshl_b32 s0, s6, 6
- v_cmp_lt_i32_e32 vcc, s0, v2
- s_and_saveexec_b64 s[2:3], vcc
- s_cbranch_execz BB1_4
- BB1_1:
; v1 = q. All divisions below are multiply-high divide-by-constant sequences
; (mulhi, optional "+ dividend" when the magic is negative as i32, arithmetic
; shift, add sign bit). Because q is the element index divided by 4, the
; shifts are 2 smaller than for a full index:
;   0x28b30361, shift 11 -> q / 12882   (12882 = 51528/4)
;   0x55555556, no shift -> (..) / 3
;   0x8fb823ef, shift 5  -> q / 57      (57 = 228/4)
;   0x487ede05, shift 6  -> (q/57) / 226
;   0x6c880903, shift 14 -> q / 38646   (38646 = 154584/4)
- v_add_u32_e32 v1, vcc, s0, v1
- v_mov_b32_e32 v2, 0x28b30361
- v_mul_hi_i32 v2, v1, v2
- v_mov_b32_e32 v4, 0x8fb823ef
- v_mul_hi_i32 v6, v1, v4
; s6 = wg * (-57)  (0xffc7 is sign-extended by s_mulk_i32). Added to q below
; it gives wg*7 + (tid>>2), which is congruent to q modulo 57.
- s_mulk_i32 s6, 0xffc7
- v_lshrrev_b32_e32 v3, 31, v2
- v_ashrrev_i32_e32 v2, 11, v2
- v_add_u32_e32 v2, vcc, v2, v3
- v_mov_b32_e32 v3, 0x55555556
- v_mul_hi_i32 v3, v2, v3
; v7 = tid & 3: this lane's position inside its group of four dwords.
- v_and_b32_e32 v7, 3, v0
- s_movk_i32 s2, 0xe2
- s_mov_b32 s3, 0xc948
- v_lshrrev_b32_e32 v5, 31, v3
- v_add_u32_e32 v3, vcc, v3, v5
- v_add_u32_e32 v5, vcc, v6, v1
- v_lshrrev_b32_e32 v6, 31, v5
- v_ashrrev_i32_e32 v5, 5, v5
; v5 = q / 57
- v_add_u32_e32 v5, vcc, v6, v5
- v_mov_b32_e32 v6, 0x487ede05
- v_mul_lo_i32 v3, v3, 3
- v_mul_hi_i32 v6, v5, v6
; Fetch the buffer base pointer (kernarg +0x0); consumed after the
; s_waitcnt lgkmcnt(0) further down. Overwrites s0 (wg*64), already folded
; into v1.
- s_load_dwordx2 s[0:1], s[4:5], 0x0
; v2 = c = (q/12882) mod 3
- v_subrev_u32_e32 v2, vcc, v3, v2
- v_lshrrev_b32_e32 v3, 31, v6
- v_ashrrev_i32_e32 v6, 6, v6
- v_add_u32_e32 v3, vcc, v6, v3
; v6 = q - 57*wg; reduced modulo 57 below to get q mod 57.
- v_add_u32_e32 v6, vcc, s6, v1
- v_mul_hi_i32 v4, v6, v4
- v_mul_lo_i32 v3, v3, s2
; v2 = c * 51528
- v_mul_lo_i32 v2, v2, s3
- s_movk_i32 s3, 0xe4
- v_add_u32_e32 v0, vcc, v4, v6
- v_lshrrev_b32_e32 v4, 31, v0
- v_ashrrev_i32_e32 v0, 5, v0
- v_add_u32_e32 v0, vcc, v4, v0
- v_mul_lo_i32 v0, v0, 57
- v_mov_b32_e32 v4, 0x6c880903
- v_mul_hi_i32 v4, v1, v4
; v3 = y = (q/57) mod 226
- v_subrev_u32_e32 v3, vcc, v3, v5
; v0 = v6 mod 57
- v_subrev_u32_e32 v0, vcc, v0, v6
; v3 = y * 228
- v_mul_lo_i32 v3, v3, s3
- v_lshlrev_b32_e32 v0, 2, v0
- v_lshrrev_b32_e32 v6, 31, v4
- v_ashrrev_i32_e32 v4, 14, v4
; v4 = n = q / 38646
- v_add_u32_e32 v4, vcc, v4, v6
; v0 = x = 4*(q mod 57) | (tid & 3)
- v_or_b32_e32 v0, v0, v7
- v_mov_b32_e32 v6, 0x25bd8
; v0 = n*154584 + x, then + y*228, then + c*51528 -> flat element index.
- v_mad_i32_i24 v0, v4, v6, v0
- v_add_u32_e32 v0, vcc, v0, v3
- v_add_u32_e32 v0, vcc, v2, v0
; --- Keep/zero predicate: vcc = 4*(q mod 57) < 226 - (tid & 3), i.e. x < 226.
; (q mod 57 is recomputed here directly from q and q/57.) ---
- v_mul_lo_i32 v2, v5, 57
- v_sub_u32_e32 v3, vcc, s2, v7
- v_subrev_u32_e32 v1, vcc, v2, v1
- v_lshlrev_b32_e32 v1, 2, v1
- v_cmp_lt_i32_e32 vcc, v1, v3
; Sign-extend the index to 64 bits, scale by 4 (dword elements) and add the
; base pointer. The carry goes through s[0:1] (VOP3 form) so that vcc, which
; holds the predicate, is preserved; s1 is copied to v2 first because the
; carry-out overwrites s[0:1].
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- s_waitcnt lgkmcnt(0)
- v_mov_b32_e32 v2, s1
- v_add_u32_e64 v0, s[0:1], s0, v0
- v_addc_u32_e64 v1, s[0:1], v2, v1, s[0:1]
; v2 is the value to store: 0 unless the guarded load below overwrites it.
- v_mov_b32_e32 v2, 0
- s_and_saveexec_b64 s[0:1], vcc
- BB1_2:
; Only lanes with x < 226 load their current value. There is no execz skip
; here; with all lanes masked the load simply has no active lanes.
- flat_load_dword v2, v[0:1]
- BB1_3:
; Re-enable the x >= 226 lanes (v2 == 0 there), wait for the load, and store
; back to the same address.
- s_or_b64 exec, exec, s[0:1]
- s_waitcnt vmcnt(0) lgkmcnt(0)
- flat_store_dword v[0:1], v2
- BB1_4:
- s_endpgm
- .Lfunc_end1:
- .size fuse_conv2d_relu_kernel1, .Lfunc_end1-fuse_conv2d_relu_kernel1
- .globl fuse_conv2d_relu_kernel2
- .p2align 8
- .type fuse_conv2d_relu_kernel2,@function
- .amdgpu_hsa_kernel fuse_conv2d_relu_kernel2
- fuse_conv2d_relu_kernel2:
- .amd_kernel_code_t
- amd_code_version_major = 1
- amd_code_version_minor = 1
- amd_machine_kind = 1
- amd_machine_version_major = 8
- amd_machine_version_minor = 0
- amd_machine_version_stepping = 3
- kernel_code_entry_byte_offset = 256
- kernel_code_prefetch_byte_size = 0
- max_scratch_backing_memory_byte_size = 0
- granulated_workitem_vgpr_count = 63
- granulated_wavefront_sgpr_count = 2
- priority = 0
- float_mode = 192
- priv = 0
- enable_dx10_clamp = 1
- debug_mode = 0
- enable_ieee_mode = 1
- enable_sgpr_private_segment_wave_byte_offset = 1
- user_sgpr_count = 6
- enable_trap_handler = 1
- enable_sgpr_workgroup_id_x = 1
- enable_sgpr_workgroup_id_y = 0
- enable_sgpr_workgroup_id_z = 0
- enable_sgpr_workgroup_info = 0
- enable_vgpr_workitem_id = 0
- enable_exception_msb = 0
- granulated_lds_size = 0
- enable_exception = 0
- enable_sgpr_private_segment_buffer = 1
- enable_sgpr_dispatch_ptr = 0
- enable_sgpr_queue_ptr = 0
- enable_sgpr_kernarg_segment_ptr = 1
- enable_sgpr_dispatch_id = 0
- enable_sgpr_flat_scratch_init = 0
- enable_sgpr_private_segment_size = 0
- enable_sgpr_grid_workgroup_count_x = 0
- enable_sgpr_grid_workgroup_count_y = 0
- enable_sgpr_grid_workgroup_count_z = 0
- enable_ordered_append_gds = 0
- private_element_size = 1
- is_ptr64 = 1
- is_dynamic_callstack = 0
- is_debug_enabled = 0
- is_xnack_enabled = 0
- workitem_private_segment_byte_size = 268
- workgroup_group_segment_byte_size = 4464
- gds_segment_byte_size = 0
- kernarg_segment_byte_size = 32
- workgroup_fbarrier_count = 0
- wavefront_sgpr_count = 19
- workitem_vgpr_count = 256
- reserved_vgpr_first = 0
- reserved_vgpr_count = 0
- reserved_sgpr_first = 0
- reserved_sgpr_count = 0
- debug_wavefront_private_segment_offset_sgpr = 0
- debug_private_segment_buffer_sgpr = 0
- kernarg_segment_alignment = 4
- group_segment_alignment = 4
- private_segment_alignment = 4
- wavefront_size = 6
- call_convention = -1
- runtime_loader_kernel_symbol = 0
- .end_amd_kernel_code_t
- s_mov_b64 s[14:15], s[2:3]
- v_mov_b32_e32 v255, v0
- s_mov_b32 s16, s7
- s_mov_b64 s[12:13], s[0:1]
- v_cmp_lt_i32_e32 vcc, 26, v255
- v_mov_b32_e32 v1, 0
- s_and_saveexec_b64 s[0:1], vcc
- s_xor_b64 s[0:1], exec, s[0:1]
- BB2_1:
- v_mov_b32_e32 v1, 0xab
- v_cmp_lt_i32_e32 vcc, v255, v1
- v_mov_b32_e32 v4, 0
- v_cndmask_b32_e64 v1, 0, -1, vcc
- BB2_2:
- s_or_saveexec_b64 s[0:1], s[0:1]
- s_load_dwordx2 s[8:9], s[4:5], 0x8
- s_xor_b64 exec, exec, s[0:1]
- s_cbranch_execz BB2_4
- BB2_3:
- v_mov_b32_e32 v1, 0x92492493
- v_mul_hi_i32 v1, s6, v1
- v_mov_b32_e32 v2, 0x38e38e39
- v_mul_hi_u32 v2, v255, v2
- s_load_dwordx2 s[2:3], s[4:5], 0x0
- v_add_u32_e32 v1, vcc, s6, v1
- v_lshrrev_b32_e32 v3, 31, v1
- v_ashrrev_i32_e32 v1, 7, v1
- v_add_u32_e32 v1, vcc, v3, v1
- v_mul_lo_i32 v1, v1, 48
- v_lshrrev_b32_e32 v2, 1, v2
- v_mul_u32_u24_e32 v3, 9, v2
- s_waitcnt lgkmcnt(0)
- v_mov_b32_e32 v5, s3
- v_add_u32_e32 v1, vcc, v1, v2
- v_mul_lo_i32 v1, v1, 9
- v_subrev_u32_e32 v2, vcc, v3, v255
- v_mov_b32_e32 v7, s3
- s_movk_i32 s7, 0x5e8
- v_add_u32_e32 v1, vcc, v1, v2
- v_ashrrev_i32_e32 v2, 31, v1
- v_lshlrev_b64 v[2:3], 2, v[1:2]
- v_add_u32_e32 v2, vcc, s2, v2
- v_addc_u32_e32 v3, vcc, v5, v3, vcc
- v_add_u32_e32 v13, vcc, 27, v1
- v_ashrrev_i32_e32 v14, 31, v13
- v_add_u32_e32 v5, vcc, 0xd8, v1
- v_add_u32_e32 v6, vcc, 0xbd, v1
- v_add_u32_e32 v8, vcc, 0xa2, v1
- v_add_u32_e32 v9, vcc, 0x87, v1
- v_add_u32_e32 v10, vcc, 0x6c, v1
- v_add_u32_e32 v11, vcc, 0x51, v1
- v_add_u32_e32 v12, vcc, 54, v1
- v_lshlrev_b64 v[13:14], 2, v[13:14]
- v_add_u32_e32 v13, vcc, s2, v13
- v_addc_u32_e32 v14, vcc, v7, v14, vcc
- v_add_u32_e32 v18, vcc, 0xf3, v1
- v_ashrrev_i32_e32 v19, 31, v18
- v_add_u32_e32 v15, vcc, 0x144, v1
- v_add_u32_e32 v16, vcc, 0x129, v1
- v_add_u32_e32 v17, vcc, 0x10e, v1
- v_lshlrev_b64 v[18:19], 2, v[18:19]
- v_add_u32_e32 v20, vcc, s2, v18
- v_ashrrev_i32_e32 v18, 31, v17
- v_mov_b32_e32 v1, s3
- v_addc_u32_e32 v21, vcc, v1, v19, vcc
- v_lshlrev_b64 v[17:18], 2, v[17:18]
- v_add_u32_e32 v22, vcc, s2, v17
- v_ashrrev_i32_e32 v17, 31, v16
- v_addc_u32_e32 v23, vcc, v1, v18, vcc
- v_lshlrev_b64 v[16:17], 2, v[16:17]
- v_add_u32_e32 v18, vcc, s2, v16
- v_ashrrev_i32_e32 v16, 31, v15
- v_addc_u32_e32 v19, vcc, v1, v17, vcc
- v_lshlrev_b64 v[15:16], 2, v[15:16]
- v_ashrrev_i32_e32 v7, 31, v6
- v_add_u32_e32 v15, vcc, s2, v15
- v_addc_u32_e32 v16, vcc, v1, v16, vcc
- v_lshlrev_b64 v[6:7], 2, v[6:7]
- flat_load_dword v1, v[20:21]
- flat_load_dword v20, v[22:23]
- flat_load_dword v19, v[18:19]
- flat_load_dword v21, v[15:16]
- flat_load_dword v22, v[13:14]
- v_add_u32_e32 v13, vcc, s2, v6
- v_ashrrev_i32_e32 v6, 31, v5
- v_mov_b32_e32 v14, s3
- v_addc_u32_e32 v14, vcc, v14, v7, vcc
- v_lshlrev_b64 v[5:6], 2, v[5:6]
- v_mov_b32_e32 v7, s3
- v_add_u32_e32 v5, vcc, s2, v5
- v_addc_u32_e32 v6, vcc, v7, v6, vcc
- v_add_u32_e32 v15, vcc, s7, v2
- v_addc_u32_e32 v16, vcc, 0, v3, vcc
- s_movk_i32 s7, 0x654
- v_add_u32_e32 v17, vcc, s7, v2
- flat_load_dword v23, v[13:14]
- v_addc_u32_e32 v18, vcc, 0, v3, vcc
- flat_load_dword v25, v[5:6]
- flat_load_dword v5, v[2:3]
- flat_load_dword v17, v[17:18]
- flat_load_dword v18, v[15:16]
- v_lshlrev_b32_e32 v24, 2, v255
- s_mov_b32 m0, -1
- v_ashrrev_i32_e32 v13, 31, v12
- s_waitcnt vmcnt(2) lgkmcnt(2)
- ds_write2_b32 v24, v5, v22 offset1:27
- v_lshlrev_b64 v[5:6], 2, v[12:13]
- v_ashrrev_i32_e32 v12, 31, v11
- v_add_u32_e32 v5, vcc, s2, v5
- v_addc_u32_e32 v6, vcc, v7, v6, vcc
- v_lshlrev_b64 v[11:12], 2, v[11:12]
- v_add_u32_e32 v13, vcc, s2, v11
- v_ashrrev_i32_e32 v11, 31, v10
- v_addc_u32_e32 v14, vcc, v7, v12, vcc
- v_lshlrev_b64 v[10:11], 2, v[10:11]
- v_add_u32_e32 v15, vcc, s2, v10
- v_ashrrev_i32_e32 v10, 31, v9
- v_addc_u32_e32 v16, vcc, v7, v11, vcc
- v_lshlrev_b64 v[9:10], 2, v[9:10]
- v_add_u32_e32 v11, vcc, s2, v9
- v_ashrrev_i32_e32 v9, 31, v8
- v_addc_u32_e32 v12, vcc, v7, v10, vcc
- v_lshlrev_b64 v[7:8], 2, v[8:9]
- v_add_u32_e32 v7, vcc, s2, v7
- v_mov_b32_e32 v9, s3
- v_addc_u32_e32 v8, vcc, v9, v8, vcc
- flat_load_dword v5, v[5:6]
- flat_load_dword v6, v[13:14]
- flat_load_dword v9, v[15:16]
- flat_load_dword v10, v[11:12]
- flat_load_dword v7, v[7:8]
- s_movk_i32 s2, 0x57c
- s_waitcnt vmcnt(3) lgkmcnt(3)
- ds_write2_b32 v24, v5, v6 offset0:54 offset1:81
- s_waitcnt vmcnt(1) lgkmcnt(2)
- ds_write2_b32 v24, v9, v10 offset0:108 offset1:135
- s_waitcnt vmcnt(0) lgkmcnt(2)
- ds_write2_b32 v24, v7, v23 offset0:162 offset1:189
- ds_write2_b32 v24, v25, v1 offset0:216 offset1:243
- v_add_u32_e32 v1, vcc, 0x438, v24
- ds_write2_b32 v1, v20, v19 offset1:27
- v_add_u32_e32 v1, vcc, s2, v2
- v_addc_u32_e32 v2, vcc, 0, v3, vcc
- flat_load_dword v1, v[1:2]
- v_add_u32_e32 v2, vcc, 0x510, v24
- s_waitcnt vmcnt(0) lgkmcnt(0)
- ds_write2_b32 v2, v21, v1 offset1:27
- v_add_u32_e32 v1, vcc, 0x5e8, v24
- ds_write2_b32 v1, v18, v17 offset1:27
- v_mov_b32_e32 v1, -1
- BB2_4:
- s_or_b64 exec, exec, s[0:1]
- v_cmp_ne_u32_e32 vcc, 0, v1
- s_and_saveexec_b64 s[0:1], vcc
- s_cbranch_execz BB2_7
- BB2_5:
- v_mov_b32_e32 v1, 0x1f7047dd
- v_mov_b32_e32 v2, 0x92492493
- v_mul_hi_u32 v1, v255, v1
- v_mul_hi_i32 v2, s6, v2
- s_movk_i32 s2, 0xe0
- v_mov_b32_e32 v4, -1
- v_subrev_u32_e32 v3, vcc, v1, v255
- v_lshrrev_b32_e32 v3, 1, v3
- v_add_u32_e32 v2, vcc, s6, v2
- v_add_u32_e32 v1, vcc, v1, v3
- v_lshrrev_b32_e32 v3, 31, v2
- v_ashrrev_i32_e32 v2, 7, v2
- v_add_u32_e32 v2, vcc, v3, v2
- v_mul_lo_i32 v2, v2, s2
- v_lshrrev_b32_e32 v1, 5, v1
- v_sub_u32_e32 v3, vcc, 0xe2, v1
- v_sub_u32_e32 v2, vcc, s6, v2
- v_cmp_lt_i32_e32 vcc, v2, v3
- s_and_b64 exec, exec, vcc
- s_cbranch_execz BB2_7
- BB2_6:
- v_mov_b32_e32 v3, 0x1f7047dd
- v_mul_hi_u32 v3, v255, v3
- v_add_u32_e32 v1, vcc, v2, v1
- v_mul_lo_i32 v1, v1, 57
- v_lshlrev_b32_e32 v5, 4, v255
- v_subrev_u32_e32 v4, vcc, v3, v255
- v_lshrrev_b32_e32 v4, 1, v4
- v_add_u32_e32 v3, vcc, v3, v4
- v_lshrrev_b32_e32 v3, 5, v3
- v_mul_lo_i32 v3, v3, 57
- s_mov_b32 m0, -1
- v_subrev_u32_e32 v2, vcc, v3, v255
- v_add_u32_e32 v1, vcc, v1, v2
- v_lshlrev_b32_e32 v1, 2, v1
- v_ashrrev_i32_e32 v2, 31, v1
- v_lshlrev_b64 v[1:2], 2, v[1:2]
- s_waitcnt lgkmcnt(0)
- v_mov_b32_e32 v3, s9
- v_add_u32_e32 v1, vcc, s8, v1
- v_addc_u32_e32 v2, vcc, v3, v2, vcc
- flat_load_dwordx4 v[1:4], v[1:2]
- v_add_u32_e32 v6, vcc, 0x6c0, v5
- s_waitcnt vmcnt(0) lgkmcnt(0)
- ds_write_b64 v6, v[3:4] offset:8
- ds_write_b64 v5, v[1:2] offset:1728
- v_mov_b32_e32 v4, -1
- BB2_7:
- s_or_b64 exec, exec, s[0:1]
- s_waitcnt lgkmcnt(0)
- s_barrier
- s_mov_b32 m0, -1
- v_mov_b32_e32 v3, 0
- ds_read2_b32 v[6:7], v3 offset0:247 offset1:248
- v_mov_b32_e32 v5, 0x4a4
- v_mov_b32_e32 v10, 0x584
- v_mov_b32_e32 v11, 0x65c
- ds_read2_b32 v[13:14], v10 offset1:1
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v6, off, s[12:15], s16 offset:148
- buffer_store_dword v7, off, s[12:15], s16 offset:152
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v3 offset0:249 offset1:250
- ds_read2_b32 v[10:11], v11 offset1:1
- v_mov_b32_e32 v12, 0x4b4
- v_lshlrev_b32_e32 v2, 2, v255
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v13, off, s[12:15], s16 offset:68
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v6, off, s[12:15], s16 offset:140
- buffer_store_dword v7, off, s[12:15], s16 offset:144
- s_waitcnt expcnt(1)
- ds_read2_b32 v[5:6], v5 offset1:1
- s_waitcnt expcnt(0)
- v_mov_b32_e32 v7, 0x4ac
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v10, off, s[12:15], s16 offset:44
- buffer_store_dword v11, off, s[12:15], s16 offset:48
- s_waitcnt expcnt(0)
- ds_read2_b32 v[10:11], v12 offset1:1
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v5, off, s[12:15], s16 offset:100
- buffer_store_dword v6, off, s[12:15], s16 offset:104
- s_waitcnt expcnt(1)
- v_mov_b32_e32 v5, 0x57c
- s_waitcnt expcnt(0)
- v_mov_b32_e32 v6, 0x654
- ds_read2_b32 v[8:9], v5 offset1:1
- ds_read2_b32 v[5:6], v6 offset1:1
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v10, off, s[12:15], s16 offset:116
- buffer_store_dword v11, off, s[12:15], s16 offset:120
- s_waitcnt expcnt(1)
- v_mov_b32_e32 v10, 0x58c
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v8, off, s[12:15], s16 offset:60
- s_waitcnt lgkmcnt(0)
- buffer_store_dword v5, off, s[12:15], s16 offset:52
- buffer_store_dword v6, off, s[12:15], s16 offset:56
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v7 offset1:1
- buffer_store_dword v9, off, s[12:15], s16 offset:64
- v_add_u32_e32 v7, vcc, 0xde0, v2
- v_mov_b32_e32 v8, 0x594
- ds_read_b32 v28, v2 offset:3560
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v5, off, s[12:15], s16 offset:108
- buffer_store_dword v6, off, s[12:15], s16 offset:112
- s_waitcnt expcnt(0)
- v_add_u32_e32 v6, vcc, 0x6c8, v2
- buffer_store_dword v14, off, s[12:15], s16 offset:72
- ds_read2_b32 v[43:44], v6 offset1:228
- ds_read2_b32 v[12:13], v8 offset1:1
- ds_read2_b32 v[33:34], v7 offset1:1
- ds_read2_b32 v[6:7], v10 offset1:1
- v_mov_b32_e32 v11, 0x664
- v_add_u32_e32 v155, vcc, 0x6c0, v2
- v_add_u32_e32 v5, vcc, 0xa50, v2
- v_mov_b32_e32 v2, 0x4bc
- s_waitcnt lgkmcnt(0)
- buffer_store_dword v6, off, s[12:15], s16 offset:84
- buffer_store_dword v7, off, s[12:15], s16 offset:88
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v11 offset1:1
- ds_read2_b32 v[47:48], v5 offset1:1
- ds_read2_b32 v[57:58], v155 offset1:1
- ds_read2_b32 v[0:1], v3 offset0:85 offset1:86
- ds_read2_b32 v[166:167], v3 offset0:87 offset1:88
- s_waitcnt lgkmcnt(4)
- buffer_store_dword v6, off, s[12:15], s16 offset:36
- buffer_store_dword v7, off, s[12:15], s16 offset:40
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v2 offset1:1
- v_add_u32_e32 v2, vcc, 0x458, v3
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v0, off, s[12:15], s16 offset:220
- ds_read2_b32 v[15:16], v3 offset0:135 offset1:136
- buffer_store_dword v1, off, s[12:15], s16 offset:224
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v6, off, s[12:15], s16 offset:124
- buffer_store_dword v7, off, s[12:15], s16 offset:128
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v2 offset1:27
- v_add_u32_e32 v2, vcc, 0x530, v3
- ds_read2_b32 v[168:169], v3 offset0:33 offset1:34
- ds_read2_b32 v[17:18], v3 offset0:81 offset1:82
- ds_read2_b32 v[0:1], v3 offset0:83 offset1:84
- s_waitcnt lgkmcnt(3)
- buffer_store_dword v6, off, s[12:15], s16 offset:132
- buffer_store_dword v7, off, s[12:15], s16 offset:136
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v2 offset1:27
- v_add_u32_e32 v2, vcc, 0x608, v3
- buffer_store_dword v12, off, s[12:15], s16 offset:76
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v0, off, s[12:15], s16 offset:228
- buffer_store_dword v13, off, s[12:15], s16 offset:80
- s_waitcnt lgkmcnt(0)
- buffer_store_dword v6, off, s[12:15], s16 offset:92
- buffer_store_dword v7, off, s[12:15], s16 offset:96
- s_waitcnt expcnt(0)
- ds_read2_b32 v[6:7], v2 offset1:27
- buffer_store_dword v1, off, s[12:15], s16 offset:232
- ds_read2_b32 v[12:13], v3 offset0:27 offset1:28
- s_waitcnt expcnt(0)
- ds_read2_b32 v[0:1], v3 offset0:29 offset1:30
- v_mov_b32_e32 v9, 0x66c
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v6, off, s[12:15], s16 offset:20
- buffer_store_dword v7, off, s[12:15], s16 offset:24
- s_waitcnt expcnt(1)
- ds_read2_b32 v[5:6], v3 offset0:195 offset1:196
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v0, off, s[12:15], s16 offset:236
- buffer_store_dword v1, off, s[12:15], s16 offset:240
- s_waitcnt expcnt(0)
- ds_read2_b32 v[0:1], v3 offset0:31 offset1:32
- ds_read2_b32 v[8:9], v9 offset1:1
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v5, off, s[12:15], s16 offset:180
- buffer_store_dword v6, off, s[12:15], s16 offset:184
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:243 offset1:244
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v0, off, s[12:15], s16 offset:244
- buffer_store_dword v1, off, s[12:15], s16 offset:248
- ds_read2_b32 v[158:159], v3 offset0:137 offset1:138
- s_waitcnt expcnt(0)
- ds_read2_b32 v[0:1], v3 offset0:139 offset1:140
- s_waitcnt lgkmcnt(2)
- buffer_store_dword v5, off, s[12:15], s16 offset:164
- buffer_store_dword v6, off, s[12:15], s16 offset:168
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:245 offset1:246
- ds_read2_b32 v[156:157], v3 offset0:141 offset1:142
- ds_read2_b32 v[164:165], v3 offset0:8 offset1:35
- ds_read2_b32 v[162:163], v3 offset0:62 offset1:89
- buffer_store_dword v8, off, s[12:15], s16 offset:28
- s_waitcnt lgkmcnt(3)
- buffer_store_dword v5, off, s[12:15], s16 offset:156
- buffer_store_dword v6, off, s[12:15], s16 offset:160
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:189 offset1:190
- buffer_store_dword v9, off, s[12:15], s16 offset:32
- ds_read2_b64 v[171:174], v3 offset1:27
- ds_read2_b64 v[131:134], v3 offset0:54 offset1:81
- ds_read2_b64 v[105:108], v3 offset0:108 offset1:135
- s_waitcnt lgkmcnt(3)
- buffer_store_dword v5, off, s[12:15], s16 offset:204
- buffer_store_dword v6, off, s[12:15], s16 offset:208
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:191 offset1:192
- ds_read2_b64 v[77:80], v3 offset0:162 offset1:189
- ds_read2_b64 v[175:178], v3 offset0:1 offset1:28
- ds_read2_b64 v[135:138], v3 offset0:55 offset1:82
- ds_read2_b64 v[109:112], v3 offset0:109 offset1:136
- s_waitcnt lgkmcnt(4)
- buffer_store_dword v5, off, s[12:15], s16 offset:196
- buffer_store_dword v6, off, s[12:15], s16 offset:200
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:193 offset1:194
- ds_read2_b64 v[81:84], v3 offset0:163 offset1:190
- ds_read2_b64 v[179:182], v3 offset0:2 offset1:29
- ds_read2_b64 v[139:142], v3 offset0:56 offset1:83
- ds_read2_b64 v[113:116], v3 offset0:110 offset1:137
- s_waitcnt lgkmcnt(4)
- buffer_store_dword v5, off, s[12:15], s16 offset:188
- buffer_store_dword v6, off, s[12:15], s16 offset:192
- ds_read2_b32 v[160:161], v3 offset0:116 offset1:143
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:170 offset1:197
- ds_read2_b64 v[87:90], v3 offset0:164 offset1:191
- ds_read2_b64 v[183:186], v3 offset0:3 offset1:30
- ds_read2_b64 v[147:150], v3 offset0:57 offset1:84
- ds_read2_b64 v[117:120], v3 offset0:111 offset1:138
- s_waitcnt lgkmcnt(4)
- buffer_store_dword v5, off, s[12:15], s16 offset:212
- buffer_store_dword v6, off, s[12:15], s16 offset:216
- s_waitcnt expcnt(0)
- ds_read2_b32 v[5:6], v3 offset0:224 offset1:251
- ds_read2_b64 v[91:94], v3 offset0:165 offset1:192
- v_cmp_ne_u32_e32 vcc, 0, v4
- s_waitcnt lgkmcnt(1)
- buffer_store_dword v5, off, s[12:15], s16 offset:172
- buffer_store_dword v6, off, s[12:15], s16 offset:176
- s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- s_barrier
- s_and_saveexec_b64 s[0:1], vcc
- s_cbranch_execz BB2_10
- BB2_8:
- v_mov_b32_e32 v2, 0x1f7047dd
- v_mov_b32_e32 v3, 0x92492493
- v_mul_hi_u32 v2, v255, v2
- v_mul_hi_i32 v3, s6, v3
- s_movk_i32 s2, 0xe0
- v_subrev_u32_e32 v5, vcc, v2, v255
- v_lshrrev_b32_e32 v5, 1, v5
- v_add_u32_e32 v3, vcc, s6, v3
- v_add_u32_e32 v2, vcc, v2, v5
- v_lshrrev_b32_e32 v5, 31, v3
- v_ashrrev_i32_e32 v3, 7, v3
- v_add_u32_e32 v3, vcc, v5, v3
- v_mul_lo_i32 v3, v3, s2
- v_lshrrev_b32_e32 v2, 5, v2
- v_sub_u32_e32 v5, vcc, 0xe2, v2
- v_sub_u32_e32 v3, vcc, s6, v3
- v_cmp_lt_i32_e32 vcc, v3, v5
- s_and_b64 exec, exec, vcc
- s_cbranch_execz BB2_10
- BB2_9:
- v_mov_b32_e32 v5, 0x1f7047dd
- v_mul_hi_u32 v5, v255, v5
- v_add_u32_e32 v2, vcc, v3, v2
- v_mul_lo_i32 v2, v2, 57
- s_mov_b32 m0, -1
- v_subrev_u32_e32 v6, vcc, v5, v255
- v_lshrrev_b32_e32 v6, 1, v6
- v_add_u32_e32 v5, vcc, v5, v6
- v_lshrrev_b32_e32 v5, 5, v5
- v_mul_lo_i32 v5, v5, 57
- v_subrev_u32_e32 v3, vcc, v5, v255
- v_add_u32_e32 v2, vcc, v2, v3
- v_lshlrev_b32_e32 v2, 2, v2
- v_add_u32_e32 v2, vcc, 0xc948, v2
- v_ashrrev_i32_e32 v3, 31, v2
- v_lshlrev_b64 v[2:3], 2, v[2:3]
- v_mov_b32_e32 v5, s9
- v_add_u32_e32 v2, vcc, s8, v2
- v_addc_u32_e32 v3, vcc, v5, v3, vcc
- flat_load_dwordx4 v[5:8], v[2:3]
- v_mad_u32_u24 v2, v255, 12, v155
- s_waitcnt vmcnt(0) lgkmcnt(0)
- ds_write2_b64 v2, v[5:6], v[7:8] offset1:1
- BB2_10:
- s_or_b64 exec, exec, s[0:1]
- s_waitcnt lgkmcnt(0)
- s_barrier
- s_mov_b32 m0, -1
- v_mov_b32_e32 v7, 0
- v_mov_b32_e32 v2, 0x45c
- ds_read2_b32 v[217:218], v7 offset0:229 offset1:230
- ds_read2_b32 v[215:216], v7 offset0:231 offset1:232
- ds_read2_b32 v[205:206], v2 offset1:1
- v_mov_b32_e32 v2, 0x534
- v_mov_b32_e32 v3, 0x60c
- v_mov_b32_e32 v5, 0x464
- ds_read2_b32 v[195:196], v2 offset1:1
- ds_read2_b32 v[187:188], v3 offset1:1
- ds_read2_b32 v[209:210], v5 offset1:1
- v_mov_b32_e32 v2, 0x53c
- v_mov_b32_e32 v3, 0x614
- v_mov_b32_e32 v5, 0x46c
- ds_read2_b32 v[199:200], v2 offset1:1
- ds_read2_b32 v[189:190], v3 offset1:1
- ds_read2_b32 v[211:212], v5 offset1:1
- v_add_u32_e32 v5, vcc, 0x720, v155
- ds_read2_b32 v[41:42], v155 offset0:228 offset1:229
- ds_read_b32 v19, v155 offset:1832
- ds_read2_b32 v[22:23], v5 offset1:1
- v_mov_b32_e32 v2, 0x544
- v_mov_b32_e32 v3, 0x61c
- v_mov_b32_e32 v5, 0x474
- ds_read2_b32 v[201:202], v2 offset1:1
- ds_read2_b32 v[191:192], v3 offset1:1
- ds_read2_b32 v[213:214], v5 offset1:1
- v_add_u32_e32 v5, vcc, 0x3a4, v7
- ds_read2_b32 v[219:220], v5 offset1:27
- v_add_u32_e32 v5, vcc, 0x47c, v7
- ds_read2_b32 v[207:208], v5 offset1:27
- v_add_u32_e32 v5, vcc, 0x554, v7
- ds_read2_b32 v[197:198], v5 offset1:27
- v_add_u32_e32 v5, vcc, 0x62c, v7
- v_mov_b32_e32 v2, 0x54c
- v_mov_b32_e32 v3, 0x624
- ds_read2_b32 v[26:27], v5 offset1:27
- ds_read2_b32 v[203:204], v2 offset1:1
- ds_read2_b32 v[193:194], v3 offset1:1
- ds_read2_b32 v[59:60], v155 offset1:1
- ds_read2_b32 v[75:76], v7 offset0:67 offset1:68
- ds_read2_b32 v[69:70], v7 offset0:69 offset1:70
- ds_read2_b32 v[49:50], v7 offset0:117 offset1:118
- ds_read2_b32 v[5:6], v7 offset0:15 offset1:16
- ds_read2_b32 v[95:96], v7 offset0:63 offset1:64
- ds_read2_b32 v[85:86], v7 offset0:65 offset1:66
- ds_read2_b32 v[243:244], v7 offset0:177 offset1:178
- ds_read2_b32 v[239:240], v7 offset0:225 offset1:226
- ds_read2_b32 v[237:238], v7 offset0:227 offset1:228
- ds_read2_b32 v[24:25], v7 offset0:9 offset1:10
- ds_read2_b32 v[2:3], v7 offset0:11 offset1:12
- ds_read2_b32 v[99:100], v7 offset0:13 offset1:14
- ds_read2_b32 v[45:46], v7 offset0:171 offset1:172
- ds_read2_b32 v[20:21], v7 offset0:173 offset1:174
- ds_read2_b32 v[253:254], v7 offset0:175 offset1:176
- ds_read2_b32 v[35:36], v155 offset0:2 offset1:230
- ds_read2_b32 v[67:68], v7 offset0:119 offset1:120
- ds_read2_b32 v[65:66], v7 offset0:121 offset1:122
- ds_read2_b32 v[55:56], v7 offset0:123 offset1:124
- ds_read2_b32 v[121:122], v7 offset0:17 offset1:44
- ds_read2_b32 v[97:98], v7 offset0:71 offset1:98
- ds_read2_b32 v[8:9], v7 offset0:125 offset1:152
- ds_read2_b32 v[241:242], v7 offset0:179 offset1:206
- ds_read2_b64 v[127:130], v7 offset0:18 offset1:45
- ds_read2_b64 v[245:248], v7 offset0:72 offset1:99
- ds_read2_b64 v[221:224], v7 offset0:126 offset1:153
- ds_read2_b64 v[71:74], v7 offset0:180 offset1:207
- ds_read2_b64 v[143:146], v7 offset0:19 offset1:46
- ds_read2_b64 v[249:252], v7 offset0:73 offset1:100
- ds_read2_b64 v[225:228], v7 offset0:127 offset1:154
- ds_read2_b64 v[37:40], v7 offset0:181 offset1:208
- ds_read2_b64 v[101:104], v7 offset0:20 offset1:47
- ds_read2_b64 v[29:32], v7 offset0:74 offset1:101
- ds_read2_b64 v[229:232], v7 offset0:128 offset1:155
- ds_read2_b64 v[51:54], v7 offset0:182 offset1:209
- ds_read2_b64 v[123:126], v7 offset0:21 offset1:48
- ds_read2_b64 v[61:64], v7 offset0:75 offset1:102
- ds_read2_b64 v[233:236], v7 offset0:129 offset1:156
- ds_read2_b64 v[151:154], v7 offset0:183 offset1:210
- s_load_dwordx2 s[0:1], s[4:5], 0x10
- s_load_dwordx2 s[2:3], s[4:5], 0x18
- v_cmp_ne_u32_e32 vcc, 0, v4
- s_xor_b64 s[4:5], vcc, -1
- s_waitcnt lgkmcnt(0)
- buffer_store_dword v151, off, s[12:15], s16 offset:4
- buffer_store_dword v152, off, s[12:15], s16 offset:8
- buffer_store_dword v153, off, s[12:15], s16 offset:12
- buffer_store_dword v154, off, s[12:15], s16 offset:16
- s_waitcnt vmcnt(0) expcnt(0)
- s_barrier
- s_and_saveexec_b64 s[10:11], s[4:5]
- s_xor_b64 s[4:5], exec, s[10:11]
- BB2_11:
- v_mov_b32_e32 v4, 0x92492493
- v_mul_hi_i32 v4, s6, v4
- s_movk_i32 s7, 0xe0
- v_add_u32_e32 v4, vcc, s6, v4
- v_lshrrev_b32_e32 v7, 31, v4
- v_ashrrev_i32_e32 v4, 7, v4
- v_add_u32_e32 v4, vcc, v7, v4
- v_mul_lo_i32 v4, v4, s7
- v_sub_u32_e32 v151, vcc, s6, v4
- BB2_12:
- s_or_saveexec_b64 s[4:5], s[4:5]
- s_xor_b64 exec, exec, s[4:5]
- s_cbranch_execz BB2_16
- BB2_13:
- v_mov_b32_e32 v4, 0x1f7047dd
- v_mov_b32_e32 v7, 0x92492493
- v_mul_hi_u32 v4, v255, v4
- v_mul_hi_i32 v7, s6, v7
- s_movk_i32 s7, 0xe0
- v_subrev_u32_e32 v10, vcc, v4, v255
- v_lshrrev_b32_e32 v10, 1, v10
- v_add_u32_e32 v7, vcc, s6, v7
- v_add_u32_e32 v4, vcc, v4, v10
- v_lshrrev_b32_e32 v10, 31, v7
- v_ashrrev_i32_e32 v7, 7, v7
- v_add_u32_e32 v7, vcc, v10, v7
- v_mul_lo_i32 v7, v7, s7
- v_lshrrev_b32_e32 v4, 5, v4
- v_sub_u32_e32 v10, vcc, 0xe2, v4
- v_sub_u32_e32 v151, vcc, s6, v7
- v_cmp_lt_i32_e32 vcc, v151, v10
- s_and_saveexec_b64 s[10:11], vcc
- s_cbranch_execz BB2_15
- BB2_14:
- v_mul_lo_i32 v7, v4, 57
- v_add_u32_e32 v4, vcc, v151, v4
- v_mul_lo_i32 v4, v4, 57
- buffer_store_dword v37, off, s[12:15], s16 offset:252
- v_subrev_u32_e32 v7, vcc, v7, v255
- buffer_store_dword v38, off, s[12:15], s16 offset:256
- v_add_u32_e32 v4, vcc, v4, v7
- v_lshlrev_b32_e32 v4, 2, v4
- v_add_u32_e32 v10, vcc, 0x19290, v4
- v_ashrrev_i32_e32 v11, 31, v10
- v_lshlrev_b64 v[10:11], 2, v[10:11]
- v_mov_b32_e32 v4, s9
- v_add_u32_e32 v10, vcc, s8, v10
- buffer_store_dword v39, off, s[12:15], s16 offset:260
- buffer_store_dword v40, off, s[12:15], s16 offset:264
- s_waitcnt expcnt(0)
- v_mov_b32_e32 v40, v13
- v_addc_u32_e32 v11, vcc, v4, v11, vcc
- v_mov_b32_e32 v39, v12
- v_mov_b32_e32 v12, v151
- flat_load_dwordx4 v[151:154], v[10:11]
- v_mov_b32_e32 v13, v168
- v_mov_b32_e32 v14, v169
- v_mov_b32_e32 v170, v167
- v_mov_b32_e32 v169, v166
- v_mov_b32_e32 v168, v165
- v_mov_b32_e32 v167, v164
- v_mov_b32_e32 v166, v18
- v_mov_b32_e32 v165, v17
- v_mov_b32_e32 v17, v162
- v_mov_b32_e32 v18, v163
- v_mov_b32_e32 v164, v161
- v_mov_b32_e32 v163, v160
- v_mov_b32_e32 v162, v159
- v_mov_b32_e32 v161, v158
- v_mov_b32_e32 v160, v1
- v_mov_b32_e32 v159, v0
- v_mov_b32_e32 v0, v156
- v_mov_b32_e32 v1, v157
- v_mov_b32_e32 v0, v159
- v_mov_b32_e32 v1, v160
- v_mov_b32_e32 v158, v161
- v_mov_b32_e32 v159, v162
- v_mov_b32_e32 v160, v163
- v_mov_b32_e32 v161, v164
- v_mov_b32_e32 v163, v18
- v_mov_b32_e32 v162, v17
- v_mov_b32_e32 v17, v165
- v_mov_b32_e32 v18, v166
- v_mov_b32_e32 v164, v167
- v_mov_b32_e32 v7, v155
- v_mov_b32_e32 v165, v168
- v_mov_b32_e32 v166, v169
- v_mov_b32_e32 v167, v170
- v_mov_b32_e32 v38, v16
- v_mov_b32_e32 v169, v14
- v_mad_u32_u24 v4, v255, 12, v7
- s_mov_b32 m0, -1
- v_mov_b32_e32 v37, v15
- v_mov_b32_e32 v168, v13
- s_waitcnt vmcnt(0) lgkmcnt(0)
- ds_write2_b64 v4, v[151:152], v[153:154] offset1:1
- v_mov_b32_e32 v151, v12
- v_mov_b32_e32 v12, v39
- v_mov_b32_e32 v13, v40
- buffer_load_dword v37, off, s[12:15], s16 offset:252
- s_waitcnt vmcnt(0)
- buffer_load_dword v38, off, s[12:15], s16 offset:256
- s_waitcnt vmcnt(0)
- buffer_load_dword v39, off, s[12:15], s16 offset:260
- s_waitcnt vmcnt(0)
- buffer_load_dword v40, off, s[12:15], s16 offset:264
- BB2_15:
- s_or_b64 exec, exec, s[10:11]
- BB2_16:
- s_or_b64 exec, exec, s[4:5]
- v_mul_f32_e32 v7, v57, v171
- v_mac_f32_e32 v7, v58, v172
- v_mac_f32_e32 v7, v43, v175
- v_mac_f32_e32 v7, v47, v176
- v_mac_f32_e32 v7, v48, v179
- v_mac_f32_e32 v7, v44, v180
- v_mac_f32_e32 v7, v33, v183
- v_mac_f32_e32 v7, v34, v184
- v_mac_f32_e32 v7, v28, v164
- v_mac_f32_e32 v7, v59, v24
- v_mac_f32_e32 v7, v60, v25
- s_waitcnt vmcnt(0) lgkmcnt(0)
- s_barrier
- v_mac_f32_e32 v7, v35, v2
- v_mac_f32_e32 v7, v41, v3
- buffer_load_dword v3, off, s[12:15], s16 offset:212
- v_mul_f32_e32 v15, v57, v15
- v_mac_f32_e32 v15, v58, v16
- v_mul_f32_e32 v16, v57, v133
- v_mac_f32_e32 v16, v58, v134
- v_mac_f32_e32 v16, v43, v137
- v_mac_f32_e32 v16, v47, v138
- v_mac_f32_e32 v16, v48, v141
- s_waitcnt vmcnt(0)
- buffer_load_dword v4, off, s[12:15], s16 offset:216
- v_mac_f32_e32 v16, v44, v142
- v_mac_f32_e32 v16, v33, v149
- v_mac_f32_e32 v16, v34, v150
- v_mul_f32_e32 v17, v57, v17
- v_mac_f32_e32 v17, v58, v18
- v_mac_f32_e32 v7, v42, v99
- v_mac_f32_e32 v7, v36, v100
- v_mac_f32_e32 v7, v22, v5
- v_mac_f32_e32 v7, v23, v6
- v_mov_b32_e32 v14, v13
- v_mov_b32_e32 v13, v12
- v_mul_f32_e32 v13, v57, v13
- v_mac_f32_e32 v13, v58, v14
- v_mul_f32_e32 v14, v57, v131
- v_mac_f32_e32 v14, v58, v132
- v_mac_f32_e32 v14, v43, v135
- v_mac_f32_e32 v15, v43, v158
- v_mac_f32_e32 v14, v47, v136
- v_mac_f32_e32 v15, v47, v159
- v_mac_f32_e32 v15, v48, v0
- v_mac_f32_e32 v14, v48, v139
- v_mac_f32_e32 v14, v44, v140
- v_mac_f32_e32 v15, v44, v1
- v_mac_f32_e32 v14, v33, v147
- v_mac_f32_e32 v15, v33, v156
- v_mac_f32_e32 v14, v34, v148
- v_mac_f32_e32 v15, v34, v157
- v_mac_f32_e32 v14, v28, v160
- v_mac_f32_e32 v15, v28, v161
- v_mac_f32_e32 v14, v59, v49
- v_mac_f32_e32 v15, v59, v245
- v_mac_f32_e32 v14, v60, v50
- v_mac_f32_e32 v15, v60, v246
- v_mac_f32_e32 v14, v35, v67
- v_mac_f32_e32 v15, v35, v249
- v_mac_f32_e32 v14, v41, v68
- v_mac_f32_e32 v15, v41, v250
- v_mac_f32_e32 v14, v42, v65
- v_mac_f32_e32 v15, v42, v29
- v_mac_f32_e32 v14, v36, v66
- v_mac_f32_e32 v15, v36, v30
- v_mac_f32_e32 v14, v22, v55
- v_mac_f32_e32 v15, v22, v61
- v_mac_f32_e32 v14, v23, v56
- v_mac_f32_e32 v15, v23, v62
- v_mac_f32_e32 v14, v19, v8
- v_mac_f32_e32 v15, v19, v9
- buffer_load_dword v8, off, s[12:15], s16 offset:172
- s_waitcnt vmcnt(0)
- buffer_load_dword v9, off, s[12:15], s16 offset:176
- buffer_load_dword v24, off, s[12:15], s16 offset:92
- s_waitcnt vmcnt(0)
- buffer_load_dword v25, off, s[12:15], s16 offset:96
- buffer_load_dword v152, off, s[12:15], s16 offset:236
- s_waitcnt vmcnt(0)
- buffer_load_dword v153, off, s[12:15], s16 offset:240
- v_mul_f32_e32 v11, v57, v173
- v_mac_f32_e32 v11, v58, v174
- v_mac_f32_e32 v11, v43, v177
- v_mac_f32_e32 v11, v47, v178
- v_mac_f32_e32 v11, v48, v181
- v_mac_f32_e32 v11, v44, v182
- v_mac_f32_e32 v11, v33, v185
- v_mac_f32_e32 v11, v34, v186
- v_mac_f32_e32 v11, v28, v162
- v_mac_f32_e32 v11, v59, v95
- v_mac_f32_e32 v11, v60, v96
- v_mac_f32_e32 v11, v35, v85
- v_mac_f32_e32 v11, v41, v86
- v_mac_f32_e32 v11, v42, v75
- v_mac_f32_e32 v11, v36, v76
- v_mac_f32_e32 v11, v22, v69
- v_mac_f32_e32 v11, v23, v70
- s_mov_b32 m0, -1
- v_mac_f32_e32 v16, v28, v3
- buffer_load_dword v2, off, s[12:15], s16 offset:204
- s_waitcnt vmcnt(0)
- buffer_load_dword v3, off, s[12:15], s16 offset:208
- v_mac_f32_e32 v16, v59, v45
- v_mac_f32_e32 v16, v60, v46
- v_mac_f32_e32 v16, v35, v20
- v_mac_f32_e32 v16, v41, v21
- buffer_load_dword v20, off, s[12:15], s16 offset:68
- s_waitcnt vmcnt(0)
- buffer_load_dword v21, off, s[12:15], s16 offset:72
- v_mac_f32_e32 v16, v42, v253
- v_mac_f32_e32 v16, v36, v254
- v_mac_f32_e32 v16, v22, v243
- v_mac_f32_e32 v16, v23, v244
- v_mac_f32_e32 v7, v19, v121
- v_mac_f32_e32 v11, v19, v97
- v_mac_f32_e32 v16, v19, v241
- s_movk_i32 s9, 0xe00
- s_movk_i32 s8, 0xe0
- v_mov_b32_e32 v0, s1
- s_mov_b32 s7, 0x27d000
- s_mov_b32 s5, 0x2ae000
- s_mov_b32 s4, 0x2df000
- v_mac_f32_e32 v13, v43, v152
- v_mac_f32_e32 v13, v47, v153
- buffer_load_dword v152, off, s[12:15], s16 offset:244
- s_waitcnt vmcnt(0)
- buffer_load_dword v153, off, s[12:15], s16 offset:248
- v_mul_f32_e32 v18, v57, v2
- v_mac_f32_e32 v18, v58, v3
- buffer_load_dword v2, off, s[12:15], s16 offset:196
- s_waitcnt vmcnt(0)
- buffer_load_dword v3, off, s[12:15], s16 offset:200
- v_mac_f32_e32 v13, v48, v152
- v_mac_f32_e32 v13, v44, v153
- buffer_load_dword v152, off, s[12:15], s16 offset:228
- s_waitcnt vmcnt(0)
- buffer_load_dword v153, off, s[12:15], s16 offset:232
- v_mac_f32_e32 v13, v33, v168
- v_mac_f32_e32 v13, v34, v169
- v_mac_f32_e32 v13, v28, v165
- v_mac_f32_e32 v13, v59, v127
- v_mac_f32_e32 v13, v60, v128
- v_mac_f32_e32 v13, v35, v143
- v_mac_f32_e32 v13, v41, v144
- v_mac_f32_e32 v13, v42, v101
- v_mac_f32_e32 v13, v36, v102
- v_mac_f32_e32 v13, v22, v123
- v_mac_f32_e32 v13, v23, v124
- v_mac_f32_e32 v13, v19, v122
- v_mac_f32_e32 v18, v43, v2
- v_mac_f32_e32 v18, v47, v3
- buffer_load_dword v2, off, s[12:15], s16 offset:188
- s_waitcnt vmcnt(0)
- buffer_load_dword v3, off, s[12:15], s16 offset:192
- v_mac_f32_e32 v17, v43, v152
- v_mac_f32_e32 v17, v47, v153
- buffer_load_dword v152, off, s[12:15], s16 offset:220
- s_waitcnt vmcnt(0)
- buffer_load_dword v153, off, s[12:15], s16 offset:224
- v_mac_f32_e32 v18, v48, v2
- v_mac_f32_e32 v18, v44, v3
- buffer_load_dword v2, off, s[12:15], s16 offset:180
- s_waitcnt vmcnt(0)
- buffer_load_dword v3, off, s[12:15], s16 offset:184
- v_mac_f32_e32 v17, v48, v152
- v_mac_f32_e32 v17, v44, v153
- v_mac_f32_e32 v17, v33, v166
- v_mac_f32_e32 v17, v34, v167
- v_mac_f32_e32 v17, v28, v163
- v_mac_f32_e32 v17, v59, v129
- v_mac_f32_e32 v17, v60, v130
- v_mac_f32_e32 v17, v35, v145
- v_mac_f32_e32 v17, v41, v146
- v_mac_f32_e32 v17, v42, v103
- v_mac_f32_e32 v17, v36, v104
- v_mac_f32_e32 v17, v22, v125
- v_mac_f32_e32 v17, v23, v126
- v_mac_f32_e32 v17, v19, v98
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v18, v33, v2
- v_mac_f32_e32 v18, v34, v3
- v_mac_f32_e32 v18, v28, v4
- buffer_load_dword v4, off, s[12:15], s16 offset:164
- s_waitcnt vmcnt(0)
- buffer_load_dword v5, off, s[12:15], s16 offset:168
- v_mul_f32_e32 v3, v57, v105
- v_mac_f32_e32 v3, v58, v106
- v_mac_f32_e32 v3, v43, v109
- v_mac_f32_e32 v3, v47, v110
- v_mac_f32_e32 v3, v48, v113
- v_mac_f32_e32 v3, v44, v114
- v_mac_f32_e32 v3, v33, v117
- v_mac_f32_e32 v3, v34, v118
- v_mac_f32_e32 v3, v28, v8
- v_mac_f32_e32 v18, v59, v247
- v_mac_f32_e32 v18, v60, v248
- v_mac_f32_e32 v18, v35, v251
- v_mac_f32_e32 v18, v41, v252
- v_mac_f32_e32 v18, v42, v31
- v_mac_f32_e32 v18, v36, v32
- buffer_load_dword v29, off, s[12:15], s16 offset:4
- s_waitcnt vmcnt(0)
- buffer_load_dword v30, off, s[12:15], s16 offset:8
- s_waitcnt vmcnt(0)
- buffer_load_dword v31, off, s[12:15], s16 offset:12
- s_waitcnt vmcnt(0)
- buffer_load_dword v32, off, s[12:15], s16 offset:16
- v_mac_f32_e32 v3, v59, v239
- v_mac_f32_e32 v3, v60, v240
- v_mac_f32_e32 v3, v35, v237
- v_mac_f32_e32 v3, v41, v238
- v_mac_f32_e32 v3, v42, v217
- v_mac_f32_e32 v3, v36, v218
- v_mac_f32_e32 v18, v22, v63
- v_mac_f32_e32 v3, v22, v215
- v_mac_f32_e32 v18, v23, v64
- v_mac_f32_e32 v3, v23, v216
- v_mac_f32_e32 v18, v19, v242
- v_mac_f32_e32 v3, v19, v219
- v_mov_b32_e32 v2, 0x414
- v_mul_f32_e32 v4, v57, v4
- v_mac_f32_e32 v4, v58, v5
- buffer_load_dword v5, off, s[12:15], s16 offset:156
- s_waitcnt vmcnt(0)
- buffer_load_dword v6, off, s[12:15], s16 offset:160
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v4, v43, v5
- v_mac_f32_e32 v4, v47, v6
- buffer_load_dword v5, off, s[12:15], s16 offset:148
- s_waitcnt vmcnt(0)
- buffer_load_dword v6, off, s[12:15], s16 offset:152
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v4, v48, v5
- v_mac_f32_e32 v4, v44, v6
- buffer_load_dword v5, off, s[12:15], s16 offset:140
- s_waitcnt vmcnt(0)
- buffer_load_dword v6, off, s[12:15], s16 offset:144
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v4, v33, v5
- v_mac_f32_e32 v4, v34, v6
- v_mac_f32_e32 v4, v28, v9
- buffer_load_dword v9, off, s[12:15], s16 offset:132
- v_mul_f32_e32 v5, v57, v107
- v_mac_f32_e32 v5, v58, v108
- v_mac_f32_e32 v5, v43, v111
- v_mac_f32_e32 v5, v47, v112
- v_mac_f32_e32 v5, v48, v115
- s_waitcnt vmcnt(0)
- buffer_load_dword v10, off, s[12:15], s16 offset:136
- v_mac_f32_e32 v5, v44, v116
- v_mac_f32_e32 v5, v33, v119
- v_mac_f32_e32 v5, v34, v120
- v_mac_f32_e32 v4, v59, v221
- v_mac_f32_e32 v4, v60, v222
- v_mac_f32_e32 v4, v35, v225
- v_mac_f32_e32 v4, v41, v226
- v_mac_f32_e32 v4, v42, v229
- v_mac_f32_e32 v4, v36, v230
- v_mac_f32_e32 v4, v22, v233
- v_mac_f32_e32 v4, v23, v234
- v_mac_f32_e32 v4, v19, v220
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v5, v28, v9
- buffer_load_dword v8, off, s[12:15], s16 offset:100
- s_waitcnt vmcnt(0)
- buffer_load_dword v9, off, s[12:15], s16 offset:104
- v_mac_f32_e32 v5, v59, v205
- v_mac_f32_e32 v5, v60, v206
- v_mac_f32_e32 v5, v35, v209
- v_mac_f32_e32 v5, v41, v210
- v_mac_f32_e32 v5, v42, v211
- v_mac_f32_e32 v5, v36, v212
- v_mac_f32_e32 v5, v22, v213
- v_mac_f32_e32 v5, v23, v214
- v_mac_f32_e32 v5, v19, v207
- s_waitcnt vmcnt(0)
- v_mul_f32_e32 v6, v57, v8
- v_mac_f32_e32 v6, v58, v9
- buffer_load_dword v8, off, s[12:15], s16 offset:108
- s_waitcnt vmcnt(0)
- buffer_load_dword v9, off, s[12:15], s16 offset:112
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v6, v43, v8
- v_mac_f32_e32 v6, v47, v9
- buffer_load_dword v8, off, s[12:15], s16 offset:116
- s_waitcnt vmcnt(0)
- buffer_load_dword v9, off, s[12:15], s16 offset:120
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v6, v48, v8
- v_mac_f32_e32 v6, v44, v9
- buffer_load_dword v8, off, s[12:15], s16 offset:124
- s_waitcnt vmcnt(0)
- buffer_load_dword v9, off, s[12:15], s16 offset:128
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v6, v33, v8
- v_mac_f32_e32 v6, v34, v9
- v_mac_f32_e32 v6, v28, v10
- buffer_load_dword v9, off, s[12:15], s16 offset:60
- s_waitcnt vmcnt(0)
- buffer_load_dword v10, off, s[12:15], s16 offset:64
- v_mul_f32_e32 v8, v57, v77
- v_mac_f32_e32 v8, v58, v78
- v_mac_f32_e32 v8, v43, v81
- v_mac_f32_e32 v8, v47, v82
- v_mac_f32_e32 v8, v48, v87
- v_mac_f32_e32 v8, v44, v88
- v_mac_f32_e32 v8, v33, v91
- v_mac_f32_e32 v8, v34, v92
- v_mac_f32_e32 v8, v28, v24
- v_mac_f32_e32 v6, v59, v223
- v_mac_f32_e32 v8, v59, v195
- v_mac_f32_e32 v6, v60, v224
- v_mac_f32_e32 v8, v60, v196
- v_mac_f32_e32 v6, v35, v227
- v_mac_f32_e32 v8, v35, v199
- v_mac_f32_e32 v6, v41, v228
- v_mac_f32_e32 v8, v41, v200
- v_mac_f32_e32 v6, v42, v231
- v_mac_f32_e32 v8, v42, v201
- v_mac_f32_e32 v6, v36, v232
- v_mac_f32_e32 v8, v36, v202
- v_mac_f32_e32 v6, v22, v235
- v_mac_f32_e32 v8, v22, v203
- v_mac_f32_e32 v6, v23, v236
- v_mac_f32_e32 v8, v23, v204
- v_mac_f32_e32 v6, v19, v208
- v_mac_f32_e32 v8, v19, v197
- s_waitcnt vmcnt(0)
- v_mul_f32_e32 v9, v57, v9
- v_mac_f32_e32 v9, v58, v10
- v_mac_f32_e32 v9, v43, v20
- v_mac_f32_e32 v9, v47, v21
- buffer_load_dword v20, off, s[12:15], s16 offset:84
- s_waitcnt vmcnt(0)
- buffer_load_dword v21, off, s[12:15], s16 offset:88
- v_mul_f32_e32 v10, v57, v79
- v_mac_f32_e32 v10, v58, v80
- v_mac_f32_e32 v10, v43, v83
- v_mac_f32_e32 v10, v47, v84
- v_mac_f32_e32 v10, v48, v89
- v_mac_f32_e32 v10, v44, v90
- v_mac_f32_e32 v10, v33, v93
- v_mac_f32_e32 v10, v34, v94
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v9, v48, v20
- v_mac_f32_e32 v9, v44, v21
- buffer_load_dword v20, off, s[12:15], s16 offset:76
- s_waitcnt vmcnt(0)
- buffer_load_dword v21, off, s[12:15], s16 offset:80
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v9, v33, v20
- v_mac_f32_e32 v9, v34, v21
- v_mac_f32_e32 v9, v28, v25
- buffer_load_dword v24, off, s[12:15], s16 offset:52
- s_waitcnt vmcnt(0)
- buffer_load_dword v25, off, s[12:15], s16 offset:56
- buffer_load_dword v20, off, s[12:15], s16 offset:20
- s_waitcnt vmcnt(0)
- buffer_load_dword v21, off, s[12:15], s16 offset:24
- v_mac_f32_e32 v9, v59, v71
- v_mac_f32_e32 v9, v60, v72
- v_mac_f32_e32 v9, v35, v37
- v_mac_f32_e32 v9, v41, v38
- v_mac_f32_e32 v9, v42, v51
- v_mac_f32_e32 v9, v36, v52
- v_mac_f32_e32 v9, v22, v29
- v_mac_f32_e32 v9, v23, v30
- v_mac_f32_e32 v9, v19, v198
- v_mov_b32_e32 v52, 0x5c4
- v_mul_f32_e32 v12, v57, v24
- v_mac_f32_e32 v12, v58, v25
- buffer_load_dword v24, off, s[12:15], s16 offset:44
- s_waitcnt vmcnt(0)
- buffer_load_dword v25, off, s[12:15], s16 offset:48
- v_mac_f32_e32 v10, v28, v20
- v_mac_f32_e32 v10, v59, v187
- v_mac_f32_e32 v10, v60, v188
- v_mac_f32_e32 v10, v35, v189
- v_mac_f32_e32 v10, v41, v190
- v_mac_f32_e32 v10, v42, v191
- v_mac_f32_e32 v10, v36, v192
- v_mac_f32_e32 v10, v22, v193
- v_mac_f32_e32 v10, v23, v194
- v_mac_f32_e32 v10, v19, v26
- v_mov_b32_e32 v20, 0x4ec
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v12, v43, v24
- v_mac_f32_e32 v12, v47, v25
- buffer_load_dword v24, off, s[12:15], s16 offset:36
- s_waitcnt vmcnt(0)
- buffer_load_dword v25, off, s[12:15], s16 offset:40
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v12, v48, v24
- v_mac_f32_e32 v12, v44, v25
- buffer_load_dword v24, off, s[12:15], s16 offset:28
- s_waitcnt vmcnt(0)
- buffer_load_dword v25, off, s[12:15], s16 offset:32
- s_waitcnt vmcnt(0)
- v_mac_f32_e32 v12, v33, v24
- v_mac_f32_e32 v12, v34, v25
- v_mac_f32_e32 v12, v28, v21
- v_mac_f32_e32 v12, v59, v73
- v_mac_f32_e32 v12, v60, v74
- v_mac_f32_e32 v12, v35, v39
- v_mac_f32_e32 v12, v41, v40
- v_mac_f32_e32 v12, v42, v53
- v_mac_f32_e32 v12, v36, v54
- v_mac_f32_e32 v12, v22, v31
- v_mac_f32_e32 v12, v23, v32
- v_mac_f32_e32 v12, v19, v27
- v_mov_b32_e32 v19, 0
- ds_read2_b32 v[24:25], v19 offset0:213 offset1:214
- ds_read2_b32 v[26:27], v2 offset1:1
- ds_read2_b32 v[28:29], v20 offset1:1
- v_mov_b32_e32 v2, 0x5dc
- v_mov_b32_e32 v20, 0x6b4
- ds_read2_b32 v[30:31], v2 offset1:1
- ds_read2_b32 v[32:33], v20 offset1:1
- ds_read2_b32 v[34:35], v155 offset1:1
- ds_read2_b64 v[20:23], v19 offset0:9 offset1:36
- ds_read2_b32 v[44:45], v19 offset0:103 offset1:104
- v_mov_b32_e32 v2, 0x69c
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v4, v34, v26
- s_waitcnt lgkmcnt(1)
- v_mac_f32_e32 v7, v34, v20
- v_mac_f32_e32 v11, v34, v22
- v_mac_f32_e32 v7, v35, v21
- v_mac_f32_e32 v11, v35, v23
- ds_read2_b64 v[20:23], v19 offset0:63 offset1:90
- v_mac_f32_e32 v4, v35, v27
- v_mov_b32_e32 v26, 0x41c
- v_mac_f32_e32 v6, v34, v28
- v_mac_f32_e32 v6, v35, v29
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v14, v34, v20
- v_mac_f32_e32 v16, v34, v22
- v_mac_f32_e32 v14, v35, v21
- v_mac_f32_e32 v16, v35, v23
- ds_read2_b64 v[20:23], v19 offset0:117 offset1:144
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v3, v34, v20
- v_mac_f32_e32 v5, v34, v22
- v_mac_f32_e32 v3, v35, v21
- v_mac_f32_e32 v5, v35, v23
- ds_read2_b64 v[20:23], v19 offset0:171 offset1:198
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v8, v34, v20
- v_mac_f32_e32 v8, v35, v21
- ds_read2_b32 v[20:21], v19 offset0:45 offset1:46
- ds_read2_b32 v[36:37], v19 offset0:47 offset1:48
- ds_read2_b32 v[38:39], v19 offset0:49 offset1:50
- v_mac_f32_e32 v10, v34, v22
- v_mac_f32_e32 v10, v35, v23
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v13, v34, v20
- v_mac_f32_e32 v13, v35, v21
- ds_read2_b32 v[40:41], v19 offset0:51 offset1:52
- ds_read2_b32 v[20:21], v19 offset0:99 offset1:100
- ds_read2_b32 v[42:43], v19 offset0:101 offset1:102
- s_waitcnt lgkmcnt(1)
- v_mac_f32_e32 v17, v34, v20
- v_mac_f32_e32 v17, v35, v21
- ds_read2_b32 v[46:47], v19 offset0:105 offset1:106
- ds_read2_b32 v[20:21], v19 offset0:153 offset1:154
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v15, v34, v20
- v_mac_f32_e32 v15, v35, v21
- ds_read2_b32 v[20:21], v19 offset0:207 offset1:208
- ds_read2_b32 v[48:49], v19 offset0:209 offset1:210
- ds_read2_b32 v[50:51], v19 offset0:211 offset1:212
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v18, v34, v20
- v_mac_f32_e32 v18, v35, v21
- ds_read2_b32 v[20:21], v52 offset1:1
- ds_read2_b32 v[22:23], v2 offset1:1
- ds_read2_b32 v[26:27], v26 offset1:1
- v_add_u32_e32 v2, vcc, 0x720, v155
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v9, v34, v20
- s_waitcnt lgkmcnt(1)
- v_mac_f32_e32 v12, v34, v22
- v_mac_f32_e32 v9, v35, v21
- v_mac_f32_e32 v12, v35, v23
- ds_read2_b32 v[28:29], v155 offset0:2 offset1:230
- ds_read2_b64 v[20:23], v19 offset0:10 offset1:37
- ds_read2_b32 v[34:35], v155 offset0:228 offset1:229
- ds_read_b32 v56, v155 offset:1832
- ds_read2_b32 v[1:2], v2 offset1:1
- s_waitcnt lgkmcnt(4)
- v_mac_f32_e32 v13, v28, v36
- s_waitcnt lgkmcnt(3)
- v_mac_f32_e32 v7, v28, v20
- v_mac_f32_e32 v11, v28, v22
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v7, v34, v21
- v_mac_f32_e32 v11, v34, v23
- ds_read2_b64 v[20:23], v19 offset0:64 offset1:91
- v_mac_f32_e32 v17, v28, v42
- v_mac_f32_e32 v4, v28, v26
- v_mac_f32_e32 v13, v34, v37
- v_mov_b32_e32 v26, 0x5cc
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v14, v28, v20
- v_mac_f32_e32 v16, v28, v22
- v_mac_f32_e32 v14, v34, v21
- v_mac_f32_e32 v16, v34, v23
- ds_read2_b64 v[20:23], v19 offset0:118 offset1:145
- v_mov_b32_e32 v42, 0x6a4
- v_mac_f32_e32 v17, v34, v43
- v_mac_f32_e32 v4, v34, v27
- v_mac_f32_e32 v18, v28, v48
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v3, v28, v20
- v_mac_f32_e32 v5, v28, v22
- v_mac_f32_e32 v3, v34, v21
- v_mac_f32_e32 v5, v34, v23
- ds_read2_b64 v[20:23], v19 offset0:172 offset1:199
- v_mac_f32_e32 v18, v34, v49
- v_mac_f32_e32 v13, v35, v38
- v_mac_f32_e32 v17, v35, v44
- v_mac_f32_e32 v18, v35, v50
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v8, v28, v20
- v_mac_f32_e32 v10, v28, v22
- v_mac_f32_e32 v8, v34, v21
- v_mac_f32_e32 v10, v34, v23
- ds_read2_b64 v[20:23], v19 offset0:11 offset1:38
- v_mac_f32_e32 v13, v29, v39
- v_mac_f32_e32 v17, v29, v45
- v_mac_f32_e32 v18, v29, v51
- v_mac_f32_e32 v13, v1, v40
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v7, v35, v20
- v_mac_f32_e32 v11, v35, v22
- v_mac_f32_e32 v7, v29, v21
- v_mac_f32_e32 v11, v29, v23
- ds_read2_b64 v[20:23], v19 offset0:65 offset1:92
- v_mac_f32_e32 v17, v1, v46
- v_mac_f32_e32 v18, v1, v24
- v_mac_f32_e32 v13, v2, v41
- v_mac_f32_e32 v17, v2, v47
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v14, v35, v20
- v_mac_f32_e32 v16, v35, v22
- v_mac_f32_e32 v14, v29, v21
- v_mac_f32_e32 v16, v29, v23
- ds_read2_b64 v[20:23], v19 offset0:119 offset1:146
- v_mac_f32_e32 v18, v2, v25
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v3, v35, v20
- v_mac_f32_e32 v5, v35, v22
- v_mac_f32_e32 v3, v29, v21
- v_mac_f32_e32 v5, v29, v23
- ds_read2_b64 v[20:23], v19 offset0:173 offset1:200
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v8, v35, v20
- v_mac_f32_e32 v10, v35, v22
- v_mac_f32_e32 v8, v29, v21
- v_mac_f32_e32 v10, v29, v23
- ds_read2_b32 v[20:21], v19 offset0:155 offset1:156
- ds_read2_b32 v[22:23], v19 offset0:157 offset1:158
- ds_read2_b32 v[36:37], v19 offset0:159 offset1:160
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v15, v28, v20
- v_mov_b32_e32 v20, 0x4f4
- v_mac_f32_e32 v15, v34, v21
- ds_read2_b32 v[20:21], v20 offset1:1
- ds_read2_b32 v[26:27], v26 offset1:1
- ds_read2_b32 v[42:43], v42 offset1:1
- s_waitcnt lgkmcnt(4)
- v_mac_f32_e32 v15, v35, v22
- v_mov_b32_e32 v22, 0x4fc
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v6, v28, v20
- s_waitcnt lgkmcnt(1)
- v_mac_f32_e32 v9, v28, v26
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v12, v28, v42
- v_mov_b32_e32 v20, 0x424
- v_mac_f32_e32 v6, v34, v21
- v_mac_f32_e32 v9, v34, v27
- v_mac_f32_e32 v12, v34, v43
- v_mov_b32_e32 v28, 0x5d4
- ds_read2_b32 v[20:21], v20 offset1:1
- ds_read2_b32 v[26:27], v22 offset1:1
- ds_read2_b32 v[42:43], v28 offset1:1
- v_mov_b32_e32 v22, 0x42c
- v_mac_f32_e32 v15, v29, v23
- s_waitcnt lgkmcnt(2)
- v_mac_f32_e32 v4, v35, v20
- s_waitcnt lgkmcnt(1)
- v_mac_f32_e32 v6, v35, v26
- v_mov_b32_e32 v20, 0x6ac
- v_mov_b32_e32 v26, 0x504
- ds_read2_b32 v[48:49], v20 offset1:1
- ds_read2_b32 v[52:53], v22 offset1:1
- ds_read2_b32 v[54:55], v26 offset1:1
- v_mac_f32_e32 v4, v29, v21
- ds_read2_b64 v[20:23], v19 offset0:12 offset1:39
- s_waitcnt lgkmcnt(4)
- v_mac_f32_e32 v9, v35, v42
- s_waitcnt lgkmcnt(3)
- v_mac_f32_e32 v12, v35, v48
- v_mac_f32_e32 v6, v29, v27
- v_mac_f32_e32 v9, v29, v43
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v7, v1, v20
- v_mac_f32_e32 v11, v1, v22
- v_mac_f32_e32 v7, v2, v21
- v_mac_f32_e32 v11, v2, v23
- ds_read2_b64 v[20:23], v19 offset0:66 offset1:93
- v_mac_f32_e32 v12, v29, v49
- v_mac_f32_e32 v15, v1, v36
- v_mac_f32_e32 v4, v1, v52
- v_mac_f32_e32 v6, v1, v54
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v14, v1, v20
- v_mac_f32_e32 v16, v1, v22
- v_mac_f32_e32 v14, v2, v21
- v_mac_f32_e32 v16, v2, v23
- ds_read2_b64 v[20:23], v19 offset0:120 offset1:147
- v_mac_f32_e32 v9, v1, v30
- v_mac_f32_e32 v12, v1, v32
- v_mac_f32_e32 v15, v2, v37
- v_mac_f32_e32 v4, v2, v53
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v3, v1, v20
- v_mac_f32_e32 v5, v1, v22
- v_mac_f32_e32 v3, v2, v21
- v_mac_f32_e32 v5, v2, v23
- ds_read2_b64 v[20:23], v19 offset0:174 offset1:201
- v_mac_f32_e32 v6, v2, v55
- v_mac_f32_e32 v9, v2, v31
- v_mac_f32_e32 v12, v2, v33
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v8, v1, v20
- v_mov_b32_e32 v20, 0x92492493
- v_mul_hi_i32 v20, s6, v20
- v_mac_f32_e32 v10, v1, v22
- v_mac_f32_e32 v8, v2, v21
- v_mac_f32_e32 v10, v2, v23
- v_add_u32_e32 v1, vcc, s6, v20
- v_lshrrev_b32_e32 v2, 31, v1
- v_ashrrev_i32_e32 v1, 7, v1
- v_add_u32_e32 v28, vcc, v2, v1
- v_lshlrev_b32_e32 v1, 4, v28
- v_ashrrev_i32_e32 v2, 31, v1
- v_lshlrev_b64 v[20:21], 2, v[1:2]
- v_mov_b32_e32 v22, s3
- v_add_u32_e32 v24, vcc, s2, v20
- v_addc_u32_e32 v25, vcc, v22, v21, vcc
- ds_read2_b32 v[20:21], v19 offset0:26 offset1:53
- v_mov_b32_e32 v2, s3
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v7, v56, v20
- v_mac_f32_e32 v13, v56, v21
- ds_read2_b32 v[20:21], v19 offset0:80 offset1:107
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v11, v56, v20
- v_or_b32_e32 v20, 4, v1
- v_mac_f32_e32 v17, v56, v21
- v_ashrrev_i32_e32 v21, 31, v20
- v_lshlrev_b64 v[20:21], 2, v[20:21]
- v_add_u32_e32 v20, vcc, s2, v20
- v_addc_u32_e32 v21, vcc, v2, v21, vcc
- flat_load_dwordx4 v[20:23], v[20:21]
- flat_load_dwordx4 v[24:27], v[24:25]
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_add_f32_e32 v2, v7, v24
- v_add_f32_e32 v7, v13, v25
- ds_read2_b32 v[24:25], v19 offset0:134 offset1:161
- v_add_f32_e32 v13, v17, v27
- v_add_u32_e32 v17, vcc, 0x3c8, v19
- v_add_f32_e32 v11, v11, v26
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v14, v56, v24
- v_mac_f32_e32 v15, v56, v25
- ds_read2_b32 v[24:25], v19 offset0:188 offset1:215
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v16, v56, v24
- v_mac_f32_e32 v18, v56, v25
- ds_read2_b32 v[24:25], v17 offset1:27
- v_add_u32_e32 v17, vcc, 0x4a0, v19
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v3, v56, v24
- v_mac_f32_e32 v4, v56, v25
- ds_read2_b32 v[24:25], v17 offset1:27
- v_add_u32_e32 v17, vcc, 0x578, v19
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v5, v56, v24
- v_mac_f32_e32 v6, v56, v25
- ds_read2_b32 v[24:25], v17 offset1:27
- v_add_u32_e32 v17, vcc, 0x650, v19
- v_mul_lo_i32 v19, v28, s9
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v8, v56, v24
- v_mac_f32_e32 v9, v56, v25
- ds_read2_b32 v[24:25], v17 offset1:27
- v_add_u32_e32 v17, vcc, v19, v151
- v_mul_lo_i32 v17, v17, s8
- s_waitcnt lgkmcnt(0)
- v_mac_f32_e32 v10, v56, v24
- v_add_u32_e32 v24, vcc, v17, v255
- v_mac_f32_e32 v12, v56, v25
- v_ashrrev_i32_e32 v25, 31, v24
- v_lshlrev_b64 v[25:26], 2, v[24:25]
- v_add_u32_e32 v25, vcc, s0, v25
- v_addc_u32_e32 v26, vcc, v0, v26, vcc
- v_add_u32_e32 v27, vcc, 0xc400, v24
- v_ashrrev_i32_e32 v28, 31, v27
- v_lshlrev_b64 v[27:28], 2, v[27:28]
- v_add_u32_e32 v27, vcc, s0, v27
- v_addc_u32_e32 v28, vcc, v0, v28, vcc
- v_max_f32_e32 v0, 0, v2
- flat_store_dword v[25:26], v0
- v_max_f32_e32 v0, 0, v7
- flat_store_dword v[27:28], v0
- v_add_u32_e32 v27, vcc, 0x18800, v24
- v_ashrrev_i32_e32 v28, 31, v27
- v_lshlrev_b64 v[27:28], 2, v[27:28]
- v_mov_b32_e32 v0, s1
- v_add_u32_e32 v27, vcc, s0, v27
- v_addc_u32_e32 v28, vcc, v0, v28, vcc
- v_max_f32_e32 v0, 0, v11
- flat_store_dword v[27:28], v0
- v_add_u32_e32 v27, vcc, 0x24c00, v24
- v_ashrrev_i32_e32 v28, 31, v27
- v_lshlrev_b64 v[27:28], 2, v[27:28]
- v_mov_b32_e32 v0, s1
- v_add_u32_e32 v27, vcc, s0, v27
- v_addc_u32_e32 v28, vcc, v0, v28, vcc
- v_max_f32_e32 v0, 0, v13
- flat_store_dword v[27:28], v0
- v_add_u32_e32 v27, vcc, 0x31000, v24
- v_ashrrev_i32_e32 v28, 31, v27
- v_lshlrev_b64 v[27:28], 2, v[27:28]
- v_mov_b32_e32 v0, s1
- v_add_u32_e32 v27, vcc, s0, v27
- v_addc_u32_e32 v28, vcc, v0, v28, vcc
- v_add_u32_e32 v13, vcc, 0x3d400, v24
- v_add_f32_e32 v0, v14, v20
- v_ashrrev_i32_e32 v14, 31, v13
- v_lshlrev_b64 v[13:14], 2, v[13:14]
- v_max_f32_e32 v0, 0, v0
- flat_store_dword v[27:28], v0
- v_mov_b32_e32 v0, s1
- v_add_u32_e32 v13, vcc, s0, v13
- v_addc_u32_e32 v14, vcc, v0, v14, vcc
- v_add_f32_e32 v0, v15, v21
- v_max_f32_e32 v0, 0, v0
- flat_store_dword v[13:14], v0
- v_add_u32_e32 v13, vcc, 0x49800, v24
- v_ashrrev_i32_e32 v14, 31, v13
- v_lshlrev_b64 v[13:14], 2, v[13:14]
- v_mov_b32_e32 v7, s1
- v_add_u32_e32 v13, vcc, s0, v13
- v_add_f32_e32 v0, v16, v22
- v_addc_u32_e32 v14, vcc, v7, v14, vcc
- v_add_u32_e32 v15, vcc, 0x55c00, v24
- v_max_f32_e32 v0, 0, v0
- v_ashrrev_i32_e32 v16, 31, v15
- flat_store_dword v[13:14], v0
- v_lshlrev_b64 v[13:14], 2, v[15:16]
- v_add_f32_e32 v2, v18, v23
- v_max_f32_e32 v0, 0, v2
- v_mov_b32_e32 v2, s1
- v_add_u32_e32 v13, vcc, s0, v13
- v_addc_u32_e32 v14, vcc, v2, v14, vcc
- flat_store_dword v[13:14], v0
- v_or_b32_e32 v13, 8, v1
- v_ashrrev_i32_e32 v14, 31, v13
- v_lshlrev_b64 v[13:14], 2, v[13:14]
- v_mov_b32_e32 v0, s3
- v_add_u32_e32 v17, vcc, s2, v13
- v_addc_u32_e32 v18, vcc, v0, v14, vcc
- v_add_u32_e32 v13, vcc, 0x62000, v24
- v_ashrrev_i32_e32 v14, 31, v13
- v_lshlrev_b64 v[13:14], 2, v[13:14]
- v_mov_b32_e32 v0, s1
- v_add_u32_e32 v21, vcc, s0, v13
- v_addc_u32_e32 v22, vcc, v0, v14, vcc
- v_or_b32_e32 v0, 12, v1
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- v_mov_b32_e32 v2, s3
- v_add_u32_e32 v0, vcc, s2, v0
- v_addc_u32_e32 v1, vcc, v2, v1, vcc
- flat_load_dwordx4 v[13:16], v[0:1]
- flat_load_dwordx4 v[17:20], v[17:18]
- v_mov_b32_e32 v2, s1
- s_waitcnt vmcnt(0) lgkmcnt(0)
- v_add_f32_e32 v0, v3, v17
- v_max_f32_e32 v0, 0, v0
- flat_store_dword v[21:22], v0
- v_add_u32_e32 v0, vcc, 0x6e400, v24
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- v_add_u32_e32 v0, vcc, s0, v0
- v_addc_u32_e32 v1, vcc, v2, v1, vcc
- v_add_f32_e32 v2, v4, v18
- v_max_f32_e32 v2, 0, v2
- flat_store_dword v[0:1], v2
- v_add_u32_e32 v0, vcc, 0x7a800, v24
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- v_mov_b32_e32 v2, s1
- v_add_u32_e32 v0, vcc, s0, v0
- v_addc_u32_e32 v1, vcc, v2, v1, vcc
- v_add_f32_e32 v2, v5, v19
- v_max_f32_e32 v2, 0, v2
- flat_store_dword v[0:1], v2
- v_add_f32_e32 v2, v6, v20
- v_add_u32_e32 v1, vcc, 0x86c00, v24
- v_max_f32_e32 v3, 0, v2
- v_ashrrev_i32_e32 v2, 31, v1
- v_add_u32_e32 v0, vcc, 0x93000, v24
- v_lshlrev_b64 v[1:2], 2, v[1:2]
- v_mov_b32_e32 v4, s1
- v_add_u32_e32 v1, vcc, s0, v1
- v_addc_u32_e32 v2, vcc, v4, v2, vcc
- flat_store_dword v[1:2], v3
- v_add_f32_e32 v1, v8, v13
- v_max_f32_e32 v2, 0, v1
- v_ashrrev_i32_e32 v1, 31, v0
- v_lshlrev_b64 v[0:1], 2, v[0:1]
- v_mov_b32_e32 v3, s1
- v_add_u32_e32 v0, vcc, s0, v0
- v_addc_u32_e32 v1, vcc, v3, v1, vcc
- flat_store_dword v[0:1], v2
- v_add_f32_e32 v0, v9, v14
- v_max_f32_e32 v2, 0, v0
- v_add_u32_e32 v0, vcc, s7, v25
- v_addc_u32_e32 v1, vcc, 0, v26, vcc
- flat_store_dword v[0:1], v2
- v_add_f32_e32 v0, v10, v15
- v_max_f32_e32 v2, 0, v0
- v_add_u32_e32 v0, vcc, s5, v25
- v_addc_u32_e32 v1, vcc, 0, v26, vcc
- flat_store_dword v[0:1], v2
- v_add_f32_e32 v0, v12, v16
- v_max_f32_e32 v2, 0, v0
- v_add_u32_e32 v0, vcc, s4, v25
- v_addc_u32_e32 v1, vcc, 0, v26, vcc
- flat_store_dword v[0:1], v2
- s_endpgm
- .Lfunc_end2:
- .size fuse_conv2d_relu_kernel2, .Lfunc_end2-fuse_conv2d_relu_kernel2
Add Comment
Please sign in to add a comment.