; [scrape artifact — Pastebin banner, not part of the kernel source]
; "Not a member of Pastebin yet? Sign Up, it unlocks many cool features!"
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel0 — AMDGPU GCN3 (gfx803), HSA code object v2.
; Compiler-generated (TVM-style fused conv2d+relu pipeline stage; name
; suggests a data-layout/copy stage — TODO confirm against the host code).
;
; Kernarg layout (16 bytes, loaded via s[4:5]):
;   +0x0  dword x2 : destination base pointer (stored through at BB0_4)
;   +0x8  dword x2 : source base pointer      (loaded from at BB0_3)
; Inputs: s6 = workgroup id X; v0 = workitem id X (flat, 256/wg per
;         s_lshl_b32 s0, s6, 8).
; Note: the v_mul_hi_i32 + shift + add sequences below are the standard
; strength-reduced "divide by constant" lowering emitted by LLVM; the
; magic constants (0x8fb823ef, 0x28b30361, ...) are reciprocals, not data.
;-----------------------------------------------------------------------
	.text
	.hsa_code_object_version 2,1
	.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
	.globl	fuse_conv2d_relu_kernel0
	.p2align	8
	.type	fuse_conv2d_relu_kernel0,@function
	.amdgpu_hsa_kernel fuse_conv2d_relu_kernel0
fuse_conv2d_relu_kernel0:
	.amd_kernel_code_t
		amd_code_version_major = 1
		amd_code_version_minor = 1
		amd_machine_kind = 1
		amd_machine_version_major = 8
		amd_machine_version_minor = 0
		amd_machine_version_stepping = 3
		kernel_code_entry_byte_offset = 256
		kernel_code_prefetch_byte_size = 0
		max_scratch_backing_memory_byte_size = 0
		granulated_workitem_vgpr_count = 1
		granulated_wavefront_sgpr_count = 1
		priority = 0
		float_mode = 192
		priv = 0
		enable_dx10_clamp = 1
		debug_mode = 0
		enable_ieee_mode = 1
		enable_sgpr_private_segment_wave_byte_offset = 0
		user_sgpr_count = 6
		enable_trap_handler = 1
		enable_sgpr_workgroup_id_x = 1
		enable_sgpr_workgroup_id_y = 0
		enable_sgpr_workgroup_id_z = 0
		enable_sgpr_workgroup_info = 0
		enable_vgpr_workitem_id = 0
		enable_exception_msb = 0
		granulated_lds_size = 0
		enable_exception = 0
		enable_sgpr_private_segment_buffer = 1
		enable_sgpr_dispatch_ptr = 0
		enable_sgpr_queue_ptr = 0
		enable_sgpr_kernarg_segment_ptr = 1
		enable_sgpr_dispatch_id = 0
		enable_sgpr_flat_scratch_init = 0
		enable_sgpr_private_segment_size = 0
		enable_sgpr_grid_workgroup_count_x = 0
		enable_sgpr_grid_workgroup_count_y = 0
		enable_sgpr_grid_workgroup_count_z = 0
		enable_ordered_append_gds = 0
		private_element_size = 1
		is_ptr64 = 1
		is_dynamic_callstack = 0
		is_debug_enabled = 0
		is_xnack_enabled = 0
		workitem_private_segment_byte_size = 0
		workgroup_group_segment_byte_size = 0   ; no LDS used by this kernel
		gds_segment_byte_size = 0
		kernarg_segment_byte_size = 16          ; two 8-byte pointers
		workgroup_fbarrier_count = 0
		wavefront_sgpr_count = 10
		workitem_vgpr_count = 7
		reserved_vgpr_first = 0
		reserved_vgpr_count = 0
		reserved_sgpr_first = 0
		reserved_sgpr_count = 0
		debug_wavefront_private_segment_offset_sgpr = 0
		debug_private_segment_buffer_sgpr = 0
		kernarg_segment_alignment = 4
		group_segment_alignment = 4
		private_segment_alignment = 4
		wavefront_size = 6                      ; log2 encoding: 64 lanes
		call_convention = -1
		runtime_loader_kernel_symbol = 0
	.end_amd_kernel_code_t
; Guard: only threads with global index < 0x25bd8 (= 154584) do any work.
	v_sub_u32_e32 v1, vcc, 0x25bd8, v0
	s_lshl_b32 s0, s6, 8                        ; s0 = workgroup_id * 256
	v_cmp_lt_i32_e32 vcc, s0, v1
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz BB0_5
BB0_1:                                          ; second bounds check: index mod 0xe4 < 0xe2
	s_mul_i32 s0, s6, 28
	v_add_u32_e32 v1, vcc, s0, v0
	v_mov_b32_e32 v0, 0x8fb823ef                ; magic reciprocal for div-by-constant
	v_mul_hi_i32 v0, v1, v0
	s_movk_i32 s0, 0xe4
	v_add_u32_e32 v0, vcc, v0, v1
	v_lshrrev_b32_e32 v2, 31, v0
	v_ashrrev_i32_e32 v0, 7, v0
	v_add_u32_e32 v0, vcc, v2, v0
	v_mul_lo_i32 v0, v0, s0
	v_mov_b32_e32 v2, 0xe2
	v_subrev_u32_e32 v0, vcc, v0, v1            ; v0 = v1 mod 0xe4
	v_cmp_lt_i32_e32 vcc, v0, v2
	s_and_saveexec_b64 s[2:3], vcc
	s_cbranch_execz BB0_5
BB0_2:                                          ; decompose linear index into multi-dim coords
	s_mul_i32 s6, s6, s0
	v_add_u32_e32 v1, vcc, s6, v1
	v_mov_b32_e32 v2, 0x28b30361
	v_mov_b32_e32 v4, 0x8fb823ef
	v_mul_hi_i32 v4, v1, v4
	v_mul_hi_i32 v2, v1, v2
	s_movk_i32 s0, 0xe2
	s_load_dwordx2 s[6:7], s[4:5], 0x0          ; s[6:7] = destination base pointer
	v_add_u32_e32 v4, vcc, v4, v1
	v_lshrrev_b32_e32 v3, 31, v2
	v_ashrrev_i32_e32 v2, 13, v2
	v_add_u32_e32 v2, vcc, v2, v3
	v_mov_b32_e32 v3, 0x55555556                ; reciprocal of 3
	v_lshrrev_b32_e32 v6, 31, v4
	v_ashrrev_i32_e32 v4, 7, v4
	v_mul_hi_i32 v3, v2, v3
	v_add_u32_e32 v4, vcc, v6, v4
	v_mov_b32_e32 v6, 0x487ede05
	v_mul_hi_i32 v6, v4, v6
	v_lshrrev_b32_e32 v5, 31, v3
	v_add_u32_e32 v3, vcc, v3, v5
	v_mul_lo_i32 v3, v3, 3
	v_lshrrev_b32_e32 v5, 31, v6
	v_ashrrev_i32_e32 v6, 6, v6
	v_add_u32_e32 v5, vcc, v6, v5
	v_mov_b32_e32 v6, 0x6c880903
	v_mul_lo_i32 v5, v5, s0
	v_mul_hi_i32 v6, v1, v6
	v_subrev_u32_e32 v3, vcc, v3, v2            ; v3 = coord mod 3
	v_cmp_lt_i32_e64 s[2:3], 0, v0
	v_subrev_u32_e32 v1, vcc, v5, v4            ; v1 = coord mod 0xe2
	v_lshrrev_b32_e32 v2, 31, v6
	v_add_u32_sdwa v2, vcc, sext(v6), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
	v_add_u32_e32 v5, vcc, -1, v1
	v_mov_b32_e32 v6, 0xe0
	v_mov_b32_e32 v4, 0xe1
; Interior test: load source only when 0 < v0 < 0xe1 and 0 < v1 <= 0xe0
; (edge threads keep v4 = 0 — presumably zero padding; TODO confirm).
	v_cmp_lt_u32_e32 vcc, v5, v6
	v_cmp_ne_u32_e64 s[0:1], v0, v4
	s_and_b64 s[2:3], s[2:3], vcc
	v_mov_b32_e32 v4, 0                         ; default value for out-of-range threads
	s_and_b64 s[2:3], s[0:1], s[2:3]
	s_and_saveexec_b64 s[0:1], s[2:3]
	s_cbranch_execz BB0_4
BB0_3:                                          ; gather: v4 = src[flattened source index]
	s_load_dwordx2 s[2:3], s[4:5], 0x8          ; s[2:3] = source base pointer
	s_mov_b32 s4, 0xc400
	v_mul_lo_i32 v4, v3, s4
	s_mov_b32 s4, 0x24c00
	v_mul_lo_i32 v5, v2, s4
	s_movk_i32 s4, 0xe0
	v_mul_lo_i32 v6, v1, s4
	v_add_u32_e32 v5, vcc, v5, v0
	v_add_u32_e32 v5, vcc, v5, v6
	v_add_u32_e32 v4, vcc, v4, v5
	v_add_u32_e32 v4, vcc, 0xffffff1f, v4       ; -0xe1: shift for border offset
	v_ashrrev_i32_e32 v5, 31, v4
	v_lshlrev_b64 v[4:5], 2, v[4:5]             ; element index -> byte offset (x4)
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v6, s3
	v_add_u32_e32 v4, vcc, s2, v4
	v_addc_u32_e32 v5, vcc, v6, v5, vcc
	flat_load_dword v4, v[4:5]
BB0_4:                                          ; scatter: dst[flattened dest index] = v4
	s_or_b64 exec, exec, s[0:1]                 ; rejoin divergent lanes
	s_mov_b32 s0, 0xc948
	v_mul_lo_i32 v3, v3, s0
	s_mov_b32 s0, 0x25bd8
	v_mul_lo_i32 v2, v2, s0
	s_movk_i32 s0, 0xe4
	v_mul_lo_i32 v1, v1, s0
	v_add_u32_e32 v0, vcc, v2, v0
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v2, s7
	v_add_u32_e32 v0, vcc, v0, v1
	v_add_u32_e32 v0, vcc, v3, v0
	v_ashrrev_i32_e32 v1, 31, v0
	v_lshlrev_b64 v[0:1], 2, v[0:1]             ; element index -> byte offset (x4)
	v_add_u32_e32 v0, vcc, s6, v0
	v_addc_u32_e32 v1, vcc, v2, v1, vcc
	s_waitcnt vmcnt(0)                          ; wait for the flat_load before storing
	flat_store_dword v[0:1], v4
BB0_5:
	s_endpgm
.Lfunc_end0:
	.size fuse_conv2d_relu_kernel0, .Lfunc_end0-fuse_conv2d_relu_kernel0
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel1 — AMDGPU GCN3 (gfx803), HSA code object v2.
; Compiler-generated companion stage of the fused conv2d+relu pipeline
; (single in-place buffer shuffle: loads one dword and stores it back to
; the same address, with a predicated load defaulting to 0 — presumably
; boundary zero-fill; TODO confirm against the host-side schedule).
;
; Kernarg layout (8 bytes via s[4:5]): +0x0 dword x2 = buffer base ptr.
; Inputs: s6 = workgroup id X; v0 = workitem id X (64/wg per
;         s_lshl_b32 s0, s6, 6). Guard: element index < 0x96f6.
; The v_mul_hi_i32 magic constants implement division by compile-time
; divisors (LLVM strength reduction), as in kernel0.
;-----------------------------------------------------------------------
	.globl	fuse_conv2d_relu_kernel1
	.p2align	8
	.type	fuse_conv2d_relu_kernel1,@function
	.amdgpu_hsa_kernel fuse_conv2d_relu_kernel1
fuse_conv2d_relu_kernel1:
	.amd_kernel_code_t
		amd_code_version_major = 1
		amd_code_version_minor = 1
		amd_machine_kind = 1
		amd_machine_version_major = 8
		amd_machine_version_minor = 0
		amd_machine_version_stepping = 3
		kernel_code_entry_byte_offset = 256
		kernel_code_prefetch_byte_size = 0
		max_scratch_backing_memory_byte_size = 0
		granulated_workitem_vgpr_count = 1
		granulated_wavefront_sgpr_count = 1
		priority = 0
		float_mode = 192
		priv = 0
		enable_dx10_clamp = 1
		debug_mode = 0
		enable_ieee_mode = 1
		enable_sgpr_private_segment_wave_byte_offset = 0
		user_sgpr_count = 6
		enable_trap_handler = 1
		enable_sgpr_workgroup_id_x = 1
		enable_sgpr_workgroup_id_y = 0
		enable_sgpr_workgroup_id_z = 0
		enable_sgpr_workgroup_info = 0
		enable_vgpr_workitem_id = 0
		enable_exception_msb = 0
		granulated_lds_size = 0
		enable_exception = 0
		enable_sgpr_private_segment_buffer = 1
		enable_sgpr_dispatch_ptr = 0
		enable_sgpr_queue_ptr = 0
		enable_sgpr_kernarg_segment_ptr = 1
		enable_sgpr_dispatch_id = 0
		enable_sgpr_flat_scratch_init = 0
		enable_sgpr_private_segment_size = 0
		enable_sgpr_grid_workgroup_count_x = 0
		enable_sgpr_grid_workgroup_count_y = 0
		enable_sgpr_grid_workgroup_count_z = 0
		enable_ordered_append_gds = 0
		private_element_size = 1
		is_ptr64 = 1
		is_dynamic_callstack = 0
		is_debug_enabled = 0
		is_xnack_enabled = 0
		workitem_private_segment_byte_size = 0
		workgroup_group_segment_byte_size = 0   ; no LDS used by this kernel
		gds_segment_byte_size = 0
		kernarg_segment_byte_size = 8           ; one 8-byte pointer
		workgroup_fbarrier_count = 0
		wavefront_sgpr_count = 9
		workitem_vgpr_count = 8
		reserved_vgpr_first = 0
		reserved_vgpr_count = 0
		reserved_sgpr_first = 0
		reserved_sgpr_count = 0
		debug_wavefront_private_segment_offset_sgpr = 0
		debug_private_segment_buffer_sgpr = 0
		kernarg_segment_alignment = 4
		group_segment_alignment = 4
		private_segment_alignment = 4
		wavefront_size = 6                      ; log2 encoding: 64 lanes
		call_convention = -1
		runtime_loader_kernel_symbol = 0
	.end_amd_kernel_code_t
; Guard: only (tid >> 2) indices below 0x96f6 do any work.
	v_lshrrev_b32_e32 v1, 2, v0
	v_sub_u32_e32 v2, vcc, 0x96f6, v1
	s_lshl_b32 s0, s6, 6                        ; s0 = workgroup_id * 64
	v_cmp_lt_i32_e32 vcc, s0, v2
	s_and_saveexec_b64 s[2:3], vcc
	s_cbranch_execz BB1_4
BB1_1:                                          ; address computation (index decompose + refold)
	v_add_u32_e32 v1, vcc, s0, v1
	v_mov_b32_e32 v2, 0x28b30361                ; magic reciprocal (div-by-constant)
	v_mul_hi_i32 v2, v1, v2
	v_mov_b32_e32 v4, 0x8fb823ef                ; magic reciprocal (div-by-constant)
	v_mul_hi_i32 v6, v1, v4
	s_mulk_i32 s6, 0xffc7                       ; s6 *= -57 (sign-extended imm)
	v_lshrrev_b32_e32 v3, 31, v2
	v_ashrrev_i32_e32 v2, 11, v2
	v_add_u32_e32 v2, vcc, v2, v3
	v_mov_b32_e32 v3, 0x55555556                ; reciprocal of 3
	v_mul_hi_i32 v3, v2, v3
	v_and_b32_e32 v7, 3, v0                     ; v7 = tid mod 4 (sub-lane within group of 4)
	s_movk_i32 s2, 0xe2
	s_mov_b32 s3, 0xc948
	v_lshrrev_b32_e32 v5, 31, v3
	v_add_u32_e32 v3, vcc, v3, v5
	v_add_u32_e32 v5, vcc, v6, v1
	v_lshrrev_b32_e32 v6, 31, v5
	v_ashrrev_i32_e32 v5, 5, v5
	v_add_u32_e32 v5, vcc, v6, v5
	v_mov_b32_e32 v6, 0x487ede05
	v_mul_lo_i32 v3, v3, 3
	v_mul_hi_i32 v6, v5, v6
	s_load_dwordx2 s[0:1], s[4:5], 0x0          ; s[0:1] = buffer base pointer
	v_subrev_u32_e32 v2, vcc, v3, v2            ; v2 = coord mod 3
	v_lshrrev_b32_e32 v3, 31, v6
	v_ashrrev_i32_e32 v6, 6, v6
	v_add_u32_e32 v3, vcc, v6, v3
	v_add_u32_e32 v6, vcc, s6, v1
	v_mul_hi_i32 v4, v6, v4
	v_mul_lo_i32 v3, v3, s2
	v_mul_lo_i32 v2, v2, s3
	s_movk_i32 s3, 0xe4
	v_add_u32_e32 v0, vcc, v4, v6
	v_lshrrev_b32_e32 v4, 31, v0
	v_ashrrev_i32_e32 v0, 5, v0
	v_add_u32_e32 v0, vcc, v4, v0
	v_mul_lo_i32 v0, v0, 57
	v_mov_b32_e32 v4, 0x6c880903
	v_mul_hi_i32 v4, v1, v4
	v_subrev_u32_e32 v3, vcc, v3, v5            ; v3 = coord mod 0xe2
	v_subrev_u32_e32 v0, vcc, v0, v6            ; v0 = coord mod 57
	v_mul_lo_i32 v3, v3, s3
	v_lshlrev_b32_e32 v0, 2, v0
	v_lshrrev_b32_e32 v6, 31, v4
	v_ashrrev_i32_e32 v4, 14, v4
	v_add_u32_e32 v4, vcc, v4, v6
	v_or_b32_e32 v0, v0, v7                     ; fold sub-lane (0..3) into low bits
	v_mov_b32_e32 v6, 0x25bd8
	v_mad_i32_i24 v0, v4, v6, v0                ; v0 = v4*0x25bd8 + v0
	v_add_u32_e32 v0, vcc, v0, v3
	v_add_u32_e32 v0, vcc, v2, v0
	v_mul_lo_i32 v2, v5, 57
	v_sub_u32_e32 v3, vcc, s2, v7               ; v3 = 0xe2 - sublane: in-bounds limit
	v_subrev_u32_e32 v1, vcc, v2, v1
	v_lshlrev_b32_e32 v1, 2, v1
	v_cmp_lt_i32_e32 vcc, v1, v3                ; interior test for the predicated load
	v_ashrrev_i32_e32 v1, 31, v0
	v_lshlrev_b64 v[0:1], 2, v[0:1]             ; element index -> byte offset (x4)
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v2, s1
	v_add_u32_e64 v0, s[0:1], s0, v0
	v_addc_u32_e64 v1, s[0:1], v2, v1, s[0:1]
	v_mov_b32_e32 v2, 0                         ; default 0 for out-of-range lanes
	s_and_saveexec_b64 s[0:1], vcc
BB1_2:                                          ; predicated load from the computed address
	flat_load_dword v2, v[0:1]
BB1_3:
	s_or_b64 exec, exec, s[0:1]                 ; rejoin divergent lanes
	s_waitcnt vmcnt(0) lgkmcnt(0)
	flat_store_dword v[0:1], v2                 ; store back (0 where lanes were masked off)
BB1_4:
	s_endpgm
.Lfunc_end1:
	.size fuse_conv2d_relu_kernel1, .Lfunc_end1-fuse_conv2d_relu_kernel1
;-----------------------------------------------------------------------
; fuse_conv2d_relu_kernel2 — AMDGPU GCN3 (gfx803), HSA code object v2.
; The main fused conv2d + ReLU compute kernel: stages filter/input tiles
; through 5376 bytes of LDS, runs a barrier-synchronized multiply-
; accumulate loop over 3 outer iterations (s[2:3] counts 0..2, inner
; offset s5 steps 0,12,24), then applies ReLU (v_max_f32 with 0) and
; stores 16 output dwords per lane.
;
; Kernarg layout (32 bytes via s[4:5]):
;   +0x00 dword x2 : pointer A (loaded in BB2_1 — weight/tile source)
;   +0x08 dword x2 : pointer B (s[12:13], streamed into LDS in BB2_4)
;   +0x10 dword x2 : pointer C (s[8:9], bias/residual input AND relu
;                    output region — read via flat_load_dwordx4, written
;                    via flat_store_dword; TODO confirm roles vs host)
;   +0x18 dword x2 : pointer D (s[10:11], second dwordx4 input stream)
; Inputs: s6 = workgroup id X; v0 = workitem id X (first 27 lanes also
; preload tiles in BB2_1).  v0..v15 hold 16 f32 accumulators in the loop.
; NOTE(review): instruction order, waitcnt placement, and ds_read2/write2
; offsets are load-bearing — do not reorder.
;-----------------------------------------------------------------------
	.globl	fuse_conv2d_relu_kernel2
	.p2align	8
	.type	fuse_conv2d_relu_kernel2,@function
	.amdgpu_hsa_kernel fuse_conv2d_relu_kernel2
fuse_conv2d_relu_kernel2:
	.amd_kernel_code_t
		amd_code_version_major = 1
		amd_code_version_minor = 1
		amd_machine_kind = 1
		amd_machine_version_major = 8
		amd_machine_version_minor = 0
		amd_machine_version_stepping = 3
		kernel_code_entry_byte_offset = 256
		kernel_code_prefetch_byte_size = 0
		max_scratch_backing_memory_byte_size = 0
		granulated_workitem_vgpr_count = 8
		granulated_wavefront_sgpr_count = 2
		priority = 0
		float_mode = 192
		priv = 0
		enable_dx10_clamp = 1
		debug_mode = 0
		enable_ieee_mode = 1
		enable_sgpr_private_segment_wave_byte_offset = 0
		user_sgpr_count = 6
		enable_trap_handler = 1
		enable_sgpr_workgroup_id_x = 1
		enable_sgpr_workgroup_id_y = 0
		enable_sgpr_workgroup_id_z = 0
		enable_sgpr_workgroup_info = 0
		enable_vgpr_workitem_id = 0
		enable_exception_msb = 0
		granulated_lds_size = 0
		enable_exception = 0
		enable_sgpr_private_segment_buffer = 1
		enable_sgpr_dispatch_ptr = 0
		enable_sgpr_queue_ptr = 0
		enable_sgpr_kernarg_segment_ptr = 1
		enable_sgpr_dispatch_id = 0
		enable_sgpr_flat_scratch_init = 0
		enable_sgpr_private_segment_size = 0
		enable_sgpr_grid_workgroup_count_x = 0
		enable_sgpr_grid_workgroup_count_y = 0
		enable_sgpr_grid_workgroup_count_z = 0
		enable_ordered_append_gds = 0
		private_element_size = 1
		is_ptr64 = 1
		is_dynamic_callstack = 0
		is_debug_enabled = 0
		is_xnack_enabled = 0
		workitem_private_segment_byte_size = 0
		workgroup_group_segment_byte_size = 5376 ; LDS tile buffer
		gds_segment_byte_size = 0
		kernarg_segment_byte_size = 32          ; four 8-byte pointers
		workgroup_fbarrier_count = 0
		wavefront_sgpr_count = 21
		workitem_vgpr_count = 34
		reserved_vgpr_first = 0
		reserved_vgpr_count = 0
		reserved_sgpr_first = 0
		reserved_sgpr_count = 0
		debug_wavefront_private_segment_offset_sgpr = 0
		debug_private_segment_buffer_sgpr = 0
		kernarg_segment_alignment = 4
		group_segment_alignment = 4
		private_segment_alignment = 4
		wavefront_size = 6                      ; log2 encoding: 64 lanes
		call_convention = -1
		runtime_loader_kernel_symbol = 0
	.end_amd_kernel_code_t
	s_load_dwordx2 s[12:13], s[4:5], 0x8        ; pointer B
	s_load_dwordx2 s[8:9], s[4:5], 0x10         ; pointer C
	s_load_dwordx2 s[10:11], s[4:5], 0x18       ; pointer D
; Phase 1 (BB2_1): first 27 lanes preload a tile from pointer A into LDS.
	v_cmp_gt_i32_e32 vcc, 27, v0
	s_and_saveexec_b64 s[0:1], vcc
	s_cbranch_execz BB2_2
BB2_1:
	v_mov_b32_e32 v1, 0x92492493                ; magic reciprocal (div by 7-family const)
	v_mul_hi_i32 v1, s6, v1
	v_mov_b32_e32 v2, 0x38e38e39                ; magic reciprocal (div by 9-family const)
	v_mul_hi_u32 v2, v0, v2
	s_load_dwordx2 s[2:3], s[4:5], 0x0          ; pointer A
	v_add_u32_e32 v1, vcc, s6, v1
	v_lshrrev_b32_e32 v3, 31, v1
	v_ashrrev_i32_e32 v1, 6, v1
	v_add_u32_e32 v1, vcc, v3, v1
	v_mul_lo_i32 v1, v1, 48
	v_lshrrev_b32_e32 v2, 1, v2
	v_mul_u32_u24_e32 v3, 9, v2
	s_waitcnt lgkmcnt(0)
	v_mov_b32_e32 v4, s3
	v_add_u32_e32 v1, vcc, v1, v2
	v_mul_lo_i32 v1, v1, 9
	v_subrev_u32_e32 v2, vcc, v3, v0            ; v2 = tid mod 9
	v_mov_b32_e32 v6, s3
	s_movk_i32 s4, 0x5e8
	v_add_u32_e32 v1, vcc, v1, v2               ; v1 = base element index for this lane
	v_ashrrev_i32_e32 v2, 31, v1
	v_lshlrev_b64 v[2:3], 2, v[1:2]
	v_add_u32_e32 v2, vcc, s2, v2
	v_addc_u32_e32 v3, vcc, v4, v3, vcc
; Build 64-bit addresses for the strided elements (+27, +54, ... +0x144).
	v_add_u32_e32 v12, vcc, 27, v1
	v_ashrrev_i32_e32 v13, 31, v12
	v_add_u32_e32 v4, vcc, 0xd8, v1
	v_add_u32_e32 v5, vcc, 0xbd, v1
	v_add_u32_e32 v7, vcc, 0xa2, v1
	v_add_u32_e32 v8, vcc, 0x87, v1
	v_add_u32_e32 v9, vcc, 0x6c, v1
	v_add_u32_e32 v10, vcc, 0x51, v1
	v_add_u32_e32 v11, vcc, 54, v1
	v_lshlrev_b64 v[12:13], 2, v[12:13]
	v_add_u32_e32 v12, vcc, s2, v12
	v_addc_u32_e32 v13, vcc, v6, v13, vcc
	v_add_u32_e32 v17, vcc, 0xf3, v1
	v_ashrrev_i32_e32 v18, 31, v17
	v_add_u32_e32 v14, vcc, 0x144, v1
	v_add_u32_e32 v15, vcc, 0x129, v1
	v_add_u32_e32 v16, vcc, 0x10e, v1
	v_lshlrev_b64 v[17:18], 2, v[17:18]
	v_add_u32_e32 v19, vcc, s2, v17
	v_ashrrev_i32_e32 v17, 31, v16
	v_mov_b32_e32 v1, s3
	v_addc_u32_e32 v20, vcc, v1, v18, vcc
	v_lshlrev_b64 v[16:17], 2, v[16:17]
	v_add_u32_e32 v21, vcc, s2, v16
	v_ashrrev_i32_e32 v16, 31, v15
	v_addc_u32_e32 v22, vcc, v1, v17, vcc
	v_lshlrev_b64 v[15:16], 2, v[15:16]
	v_add_u32_e32 v17, vcc, s2, v15
	v_ashrrev_i32_e32 v15, 31, v14
	v_addc_u32_e32 v18, vcc, v1, v16, vcc
	v_lshlrev_b64 v[14:15], 2, v[14:15]
	v_ashrrev_i32_e32 v6, 31, v5
	v_add_u32_e32 v14, vcc, s2, v14
	v_addc_u32_e32 v15, vcc, v1, v15, vcc
	v_lshlrev_b64 v[5:6], 2, v[5:6]
	flat_load_dword v1, v[19:20]
	flat_load_dword v19, v[21:22]
	flat_load_dword v18, v[17:18]
	flat_load_dword v20, v[14:15]
	flat_load_dword v21, v[12:13]
	v_add_u32_e32 v12, vcc, s2, v5
	v_ashrrev_i32_e32 v5, 31, v4
	v_mov_b32_e32 v13, s3
	v_addc_u32_e32 v13, vcc, v13, v6, vcc
	v_lshlrev_b64 v[4:5], 2, v[4:5]
	v_mov_b32_e32 v6, s3
	v_add_u32_e32 v4, vcc, s2, v4
	v_addc_u32_e32 v5, vcc, v6, v5, vcc
	v_add_u32_e32 v14, vcc, s4, v2
	v_addc_u32_e32 v15, vcc, 0, v3, vcc
	s_movk_i32 s4, 0x654
	v_add_u32_e32 v16, vcc, s4, v2
	flat_load_dword v22, v[12:13]
	v_addc_u32_e32 v17, vcc, 0, v3, vcc
	flat_load_dword v24, v[4:5]
	flat_load_dword v4, v[2:3]
	flat_load_dword v16, v[16:17]
	flat_load_dword v17, v[14:15]
	v_lshlrev_b32_e32 v23, 2, v0                ; LDS write address = tid * 4
	s_mov_b32 m0, -1                            ; required before LDS access on GCN3
	v_ashrrev_i32_e32 v12, 31, v11
	s_waitcnt vmcnt(2) lgkmcnt(2)
	ds_write2_b32 v23, v4, v21 offset1:27
	v_lshlrev_b64 v[4:5], 2, v[11:12]
	v_ashrrev_i32_e32 v11, 31, v10
	v_add_u32_e32 v4, vcc, s2, v4
	v_addc_u32_e32 v5, vcc, v6, v5, vcc
	v_lshlrev_b64 v[10:11], 2, v[10:11]
	v_add_u32_e32 v12, vcc, s2, v10
	v_ashrrev_i32_e32 v10, 31, v9
	v_addc_u32_e32 v13, vcc, v6, v11, vcc
	v_lshlrev_b64 v[9:10], 2, v[9:10]
	v_add_u32_e32 v14, vcc, s2, v9
	v_ashrrev_i32_e32 v9, 31, v8
	v_addc_u32_e32 v15, vcc, v6, v10, vcc
	v_lshlrev_b64 v[8:9], 2, v[8:9]
	v_add_u32_e32 v10, vcc, s2, v8
	v_ashrrev_i32_e32 v8, 31, v7
	v_addc_u32_e32 v11, vcc, v6, v9, vcc
	v_lshlrev_b64 v[6:7], 2, v[7:8]
	v_add_u32_e32 v6, vcc, s2, v6
	v_mov_b32_e32 v8, s3
	v_addc_u32_e32 v7, vcc, v8, v7, vcc
	flat_load_dword v4, v[4:5]
	flat_load_dword v5, v[12:13]
	flat_load_dword v8, v[14:15]
	flat_load_dword v9, v[10:11]
	flat_load_dword v6, v[6:7]
	s_movk_i32 s2, 0x57c
; Drain loads pairwise into LDS at row stride 27 dwords.
	s_waitcnt vmcnt(3) lgkmcnt(3)
	ds_write2_b32 v23, v4, v5 offset0:54 offset1:81
	s_waitcnt vmcnt(1) lgkmcnt(2)
	ds_write2_b32 v23, v8, v9 offset0:108 offset1:135
	s_waitcnt vmcnt(0) lgkmcnt(2)
	ds_write2_b32 v23, v6, v22 offset0:162 offset1:189
	ds_write2_b32 v23, v24, v1 offset0:216 offset1:243
	v_add_u32_e32 v1, vcc, 0x438, v23
	ds_write2_b32 v1, v19, v18 offset1:27
	v_add_u32_e32 v1, vcc, s2, v2
	v_addc_u32_e32 v2, vcc, 0, v3, vcc
	flat_load_dword v1, v[1:2]
	v_add_u32_e32 v2, vcc, 0x510, v23
	s_waitcnt vmcnt(0) lgkmcnt(0)
	ds_write2_b32 v2, v20, v1 offset1:27
	v_add_u32_e32 v1, vcc, 0x5e8, v23
	ds_write2_b32 v1, v17, v16 offset1:27
BB2_2:                                          ; Phase 2: per-lane index setup, zero accumulators
	s_or_b64 exec, exec, s[0:1]                 ; rejoin after preload divergence
	v_mov_b32_e32 v1, 0x1f7047dd                ; magic reciprocal (div-by-constant)
	v_mul_hi_u32 v1, v0, v1
	v_mov_b32_e32 v2, 0xe4
	v_cmp_lt_i32_e64 s[0:1], v0, v2
	s_movk_i32 s2, 0x70
	v_subrev_u32_e32 v2, vcc, v1, v0
	v_lshrrev_b32_e32 v2, 1, v2
	v_add_u32_e32 v1, vcc, v1, v2
	v_mov_b32_e32 v2, 0x92492493
	v_mul_hi_i32 v2, s6, v2
	v_mov_b32_e32 v5, 0x24924925
	v_lshrrev_b32_e32 v1, 5, v1
	v_sub_u32_e32 v3, vcc, 0xe2, v1
	v_add_u32_e32 v2, vcc, s6, v2
	v_lshrrev_b32_e32 v4, 31, v2
	v_ashrrev_i32_e32 v2, 6, v2
	v_add_u32_e32 v17, vcc, v4, v2              ; v17 = workgroup row block
	v_mul_lo_i32 v2, v17, s2
	v_lshrrev_b32_e32 v4, 5, v0
	v_mul_hi_u32 v18, v4, v5
	s_movk_i32 s4, 0x6c0
	v_sub_u32_e32 v19, vcc, s6, v2
	v_lshlrev_b32_e32 v2, 1, v19
	v_cmp_lt_i32_e64 s[2:3], v2, v3
	v_mul_u32_u24_e32 v3, 0xe0, v18
	v_subrev_u32_e32 v16, vcc, v3, v0           ; v16 = column within the 0xe0-wide row
	v_mul_u32_u24_e32 v3, 57, v1
	v_add_u32_e32 v1, vcc, v2, v1
	v_mul_lo_i32 v1, v1, 57
	v_subrev_u32_e32 v2, vcc, v3, v0
	s_and_b64 s[0:1], s[0:1], s[2:3]            ; combined in-bounds predicate for the loop
	v_mov_b32_e32 v12, 0
	v_add_u32_e32 v20, vcc, v1, v2              ; v20 = per-lane global load index
	v_lshlrev_b32_e32 v1, 4, v0
	v_add_u32_e32 v21, vcc, s4, v1              ; v21 = LDS write addr (0x6c0 + tid*16)
	v_lshlrev_b32_e32 v1, 2, v18
	v_add_u32_e32 v0, vcc, v1, v0
	v_lshlrev_b32_e32 v0, 2, v0
	v_add_u32_e32 v22, vcc, s4, v0              ; v22 = LDS read base for the MAC loop
	s_mov_b64 s[2:3], 0                         ; outer-loop counter (0..2)
	s_mov_b32 s4, 0
	v_mov_b32_e32 v13, 0                        ; v0..v15: 16 f32 accumulators = 0
	v_mov_b32_e32 v14, 0
	v_mov_b32_e32 v15, 0
	v_mov_b32_e32 v11, 0
	v_mov_b32_e32 v10, 0
	v_mov_b32_e32 v9, 0
	v_mov_b32_e32 v8, 0
	v_mov_b32_e32 v7, 0
	v_mov_b32_e32 v6, 0
	v_mov_b32_e32 v5, 0
	v_mov_b32_e32 v4, 0
	v_mov_b32_e32 v3, 0
	v_mov_b32_e32 v2, 0
	v_mov_b32_e32 v1, 0
	v_mov_b32_e32 v0, 0
BB2_3:                                          ; Phase 3: outer loop — refill LDS, then MAC
	s_waitcnt lgkmcnt(0)
	s_barrier                                   ; all lanes see previous LDS contents retired
	s_and_saveexec_b64 s[6:7], s[0:1]
	s_cbranch_execz BB2_5
BB2_4:                                          ; in-bounds lanes stream 4 dwords from ptr B to LDS
	s_mul_i32 s5, s2, 0x3252
	v_add_u32_e32 v23, vcc, s5, v20
	v_lshlrev_b32_e32 v23, 2, v23
	v_ashrrev_i32_e32 v24, 31, v23
	v_lshlrev_b64 v[23:24], 2, v[23:24]
	v_mov_b32_e32 v25, s13
	v_add_u32_e32 v23, vcc, s12, v23
	v_addc_u32_e32 v24, vcc, v25, v24, vcc
	flat_load_dwordx4 v[23:26], v[23:24]
	s_mov_b32 m0, -1
	s_waitcnt vmcnt(0) lgkmcnt(0)
	ds_write2_b64 v21, v[23:24], v[25:26] offset1:1
BB2_5:
	s_or_b64 exec, exec, s[6:7]
	s_waitcnt lgkmcnt(0)
	s_barrier                                   ; LDS refill visible to all lanes
	s_mov_b32 s5, 0                             ; inner offset: 0, 12, 24
	v_mov_b32_e32 v23, v22
BB2_6:                                          ; inner MAC loop: 3 iterations (s5 += 12 until 36)
	s_add_i32 s6, s4, s5
	s_mov_b32 m0, -1
	v_mov_b32_e32 v32, s6
	ds_read2_b32 v[24:25], v32 offset1:1
	ds_read_b32 v33, v23 offset:8
	ds_read2_b32 v[26:27], v32 offset0:27 offset1:28
	ds_read2_b32 v[28:29], v32 offset0:54 offset1:55
	ds_read2_b32 v[30:31], v23 offset1:1
	s_add_i32 s7, s6, 0x438
	s_add_i32 s14, s6, 0x4a4
	s_add_i32 s15, s6, 0x510
	s_add_i32 s16, s6, 0x57c
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v12, v30, v24                 ; acc += input * weight (and below)
	v_mac_f32_e32 v13, v30, v26
	v_mac_f32_e32 v14, v30, v28
	v_mac_f32_e32 v12, v31, v25
	v_mac_f32_e32 v13, v31, v27
	v_mac_f32_e32 v14, v31, v29
	ds_read2_b32 v[24:25], v32 offset0:81 offset1:82
	ds_read2_b32 v[26:27], v32 offset0:108 offset1:109
	ds_read2_b32 v[28:29], v32 offset0:135 offset1:136
	s_add_i32 s17, s6, 0x5e8
	s_add_i32 s18, s6, 0x654
	s_waitcnt lgkmcnt(2)
	v_mac_f32_e32 v15, v30, v24
	s_waitcnt lgkmcnt(1)
	v_mac_f32_e32 v11, v30, v26
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v10, v30, v28
	v_mac_f32_e32 v15, v31, v25
	v_mac_f32_e32 v11, v31, v27
	v_mac_f32_e32 v10, v31, v29
	ds_read2_b32 v[24:25], v32 offset0:162 offset1:163
	ds_read2_b32 v[26:27], v32 offset0:189 offset1:190
	ds_read2_b32 v[28:29], v32 offset0:216 offset1:217
	s_add_i32 s5, s5, 12
	s_cmp_lg_u32 s5, 36                         ; loop condition, consumed at s_cbranch_scc1
	s_waitcnt lgkmcnt(2)
	v_mac_f32_e32 v9, v30, v24
	s_waitcnt lgkmcnt(1)
	v_mac_f32_e32 v8, v30, v26
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v7, v30, v28
	v_mov_b32_e32 v24, s7
	v_mov_b32_e32 v26, s14
	v_mac_f32_e32 v9, v31, v25
	v_mac_f32_e32 v8, v31, v27
	v_mac_f32_e32 v7, v31, v29
	ds_read2_b32 v[24:25], v24 offset1:1
	ds_read2_b32 v[26:27], v26 offset1:1
	ds_read2_b32 v[28:29], v32 offset0:243 offset1:244
	v_add_u32_e32 v23, vcc, 0x390, v23
	s_waitcnt lgkmcnt(2)
	v_mac_f32_e32 v5, v30, v24
	s_waitcnt lgkmcnt(1)
	v_mac_f32_e32 v4, v30, v26
	v_mov_b32_e32 v24, s15
	v_mac_f32_e32 v5, v31, v25
	v_mac_f32_e32 v4, v31, v27
	v_mov_b32_e32 v26, s16
	ds_read2_b32 v[24:25], v24 offset1:1
	s_waitcnt lgkmcnt(1)
	v_mac_f32_e32 v6, v30, v28
	v_mov_b32_e32 v28, s17
	v_mac_f32_e32 v6, v31, v29
	ds_read2_b32 v[26:27], v26 offset1:1
	ds_read2_b32 v[28:29], v28 offset1:1
	s_waitcnt lgkmcnt(2)
	v_mac_f32_e32 v3, v30, v24
	v_mov_b32_e32 v24, s18
	v_mac_f32_e32 v3, v31, v25
	ds_read2_b32 v[24:25], v24 offset1:1
	s_waitcnt lgkmcnt(2)
	v_mac_f32_e32 v2, v30, v26
	v_mac_f32_e32 v2, v31, v27
	s_waitcnt lgkmcnt(1)
	v_mac_f32_e32 v1, v30, v28
	v_mac_f32_e32 v1, v31, v29
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v0, v30, v24
	v_mac_f32_e32 v0, v31, v25
	ds_read2_b32 v[24:25], v32 offset0:2 offset1:29
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v12, v33, v24                 ; third tap (v33) for each accumulator
	v_mac_f32_e32 v13, v33, v25
	ds_read2_b32 v[24:25], v32 offset0:56 offset1:83
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v14, v33, v24
	v_mac_f32_e32 v15, v33, v25
	ds_read2_b32 v[24:25], v32 offset0:110 offset1:137
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v11, v33, v24
	v_mac_f32_e32 v10, v33, v25
	ds_read2_b32 v[24:25], v32 offset0:164 offset1:191
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v9, v33, v24
	v_mac_f32_e32 v8, v33, v25
	ds_read2_b32 v[24:25], v32 offset0:218 offset1:245
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v7, v33, v24
	v_add_u32_e32 v24, vcc, 0x440, v32
	v_mac_f32_e32 v6, v33, v25
	ds_read2_b32 v[24:25], v24 offset1:27
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v5, v33, v24
	v_add_u32_e32 v24, vcc, 0x518, v32
	v_mac_f32_e32 v4, v33, v25
	ds_read2_b32 v[24:25], v24 offset1:27
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v3, v33, v24
	v_add_u32_e32 v24, vcc, 0x5f0, v32
	v_mac_f32_e32 v2, v33, v25
	ds_read2_b32 v[24:25], v24 offset1:27
	s_waitcnt lgkmcnt(0)
	v_mac_f32_e32 v1, v33, v24
	v_mac_f32_e32 v0, v33, v25
	s_cbranch_scc1 BB2_6                        ; SCC still set by s_cmp_lg_u32 above
	s_add_u32 s2, s2, 1
	s_addc_u32 s3, s3, 0
	s_add_i32 s4, s4, 36
	s_cmp_eq_u64 s[2:3], 3
	s_cbranch_scc0 BB2_3
; Phase 4: epilogue — add the streamed dwordx4 inputs, apply ReLU
; (v_max_f32 with 0), and scatter 16 outputs at plane stride 0xc400.
	s_movk_i32 s0, 0x700
	v_mul_lo_i32 v20, v17, s0
	s_movk_i32 s0, 0xe0
	v_mov_b32_e32 v22, s11
	v_mov_b32_e32 v23, s11
	v_add_u32_e32 v19, vcc, v19, v20
	v_lshlrev_b32_e32 v20, 4, v17
	v_lshlrev_b32_e32 v19, 1, v19
	v_ashrrev_i32_e32 v21, 31, v20
	v_add_u32_e32 v19, vcc, v18, v19
	v_lshlrev_b64 v[17:18], 2, v[20:21]
	v_add_u32_e32 v17, vcc, s10, v17            ; ptr D + 16*v17*4: first dwordx4 source
	v_mul_lo_i32 v19, v19, s0
	v_or_b32_e32 v21, 4, v20
	v_addc_u32_e32 v18, vcc, v22, v18, vcc
	v_ashrrev_i32_e32 v22, 31, v21
	v_lshlrev_b64 v[21:22], 2, v[21:22]
	v_add_u32_e32 v21, vcc, s10, v21            ; ptr D, second dwordx4 source (+4 elements)
	v_addc_u32_e32 v22, vcc, v23, v22, vcc
	v_add_u32_e32 v23, vcc, v19, v16            ; v23 = base output element index
	v_ashrrev_i32_e32 v24, 31, v23
	v_lshlrev_b64 v[24:25], 2, v[23:24]
	v_mov_b32_e32 v16, s9
	v_add_u32_e32 v24, vcc, s8, v24             ; output slot 0 address (ptr C)
	v_addc_u32_e32 v25, vcc, v16, v25, vcc
	v_add_u32_e32 v26, vcc, 0xc400, v23         ; output slot 1: +1 plane
	v_ashrrev_i32_e32 v27, 31, v26
	v_lshlrev_b64 v[26:27], 2, v[26:27]
	v_add_u32_e32 v26, vcc, s8, v26
	v_addc_u32_e32 v27, vcc, v16, v27, vcc
	flat_load_dwordx4 v[16:19], v[17:18]        ; first 4 addend values
	v_mov_b32_e32 v28, s9
	v_mov_b32_e32 v29, s9
	s_mov_b32 s0, 0x27d000
	s_waitcnt vmcnt(0) lgkmcnt(0)
	v_add_f32_e32 v30, v12, v16
	v_add_u32_e32 v12, vcc, 0x18800, v23        ; slot 2: +2 planes
	v_add_f32_e32 v31, v13, v17
	v_ashrrev_i32_e32 v13, 31, v12
	v_lshlrev_b64 v[12:13], 2, v[12:13]
	v_add_u32_e32 v16, vcc, s8, v12
	v_addc_u32_e32 v17, vcc, v28, v13, vcc
	v_add_u32_e32 v12, vcc, 0x24c00, v23        ; slot 3
	v_ashrrev_i32_e32 v13, 31, v12
	v_lshlrev_b64 v[12:13], 2, v[12:13]
	v_add_f32_e32 v32, v14, v18
	v_add_u32_e32 v18, vcc, s8, v12
	v_add_f32_e32 v33, v15, v19
	v_addc_u32_e32 v19, vcc, v29, v13, vcc
	v_add_u32_e32 v12, vcc, 0x31000, v23        ; slot 4
	v_ashrrev_i32_e32 v13, 31, v12
	v_lshlrev_b64 v[12:13], 2, v[12:13]
	v_add_u32_e32 v28, vcc, s8, v12
	v_mov_b32_e32 v14, s9
	v_addc_u32_e32 v29, vcc, v14, v13, vcc
	v_max_f32_e32 v12, 0, v30                   ; ReLU
	v_max_f32_e32 v13, 0, v31                   ; ReLU
	flat_store_dword v[24:25], v12
	flat_store_dword v[26:27], v13
	flat_load_dwordx4 v[12:15], v[21:22]        ; second 4 addend values
	v_max_f32_e32 v21, 0, v33                   ; ReLU
	s_waitcnt vmcnt(0) lgkmcnt(0)
	v_add_f32_e32 v11, v11, v12
	v_max_f32_e32 v12, 0, v32                   ; ReLU
	v_max_f32_e32 v11, 0, v11                   ; ReLU
	flat_store_dword v[16:17], v12
	flat_store_dword v[18:19], v21
	flat_store_dword v[28:29], v11
	v_add_u32_e32 v11, vcc, 0x3d400, v23        ; slot 5
	v_ashrrev_i32_e32 v12, 31, v11
	v_lshlrev_b64 v[11:12], 2, v[11:12]
	v_add_f32_e32 v10, v10, v13
	v_mov_b32_e32 v16, s9
	v_add_u32_e32 v11, vcc, s8, v11
	v_addc_u32_e32 v12, vcc, v16, v12, vcc
	v_max_f32_e32 v10, 0, v10                   ; ReLU
	flat_store_dword v[11:12], v10
	v_add_f32_e32 v12, v8, v15
	v_add_u32_e32 v8, vcc, 0x49800, v23         ; slot 6
	v_add_f32_e32 v11, v9, v14
	v_ashrrev_i32_e32 v9, 31, v8
	v_lshlrev_b64 v[8:9], 2, v[8:9]
	v_mov_b32_e32 v10, s9
	v_add_u32_e32 v8, vcc, s8, v8
	v_addc_u32_e32 v9, vcc, v10, v9, vcc
	v_max_f32_e32 v11, 0, v11                   ; ReLU
	v_add_u32_e32 v10, vcc, 0x55c00, v23        ; slot 7
	flat_store_dword v[8:9], v11
	v_ashrrev_i32_e32 v11, 31, v10
	v_lshlrev_b64 v[8:9], 2, v[10:11]
	v_mov_b32_e32 v10, s9
	v_add_u32_e32 v8, vcc, s8, v8
	v_addc_u32_e32 v9, vcc, v10, v9, vcc
	v_max_f32_e32 v12, 0, v12                   ; ReLU
	flat_store_dword v[8:9], v12
	v_or_b32_e32 v8, 8, v20                     ; third dwordx4 source (+8 elements)
	v_ashrrev_i32_e32 v9, 31, v8
	v_lshlrev_b64 v[8:9], 2, v[8:9]
	v_mov_b32_e32 v10, s11
	v_add_u32_e32 v12, vcc, s10, v8
	v_addc_u32_e32 v13, vcc, v10, v9, vcc
	v_add_u32_e32 v8, vcc, 0x62000, v23         ; slot 8
	v_ashrrev_i32_e32 v9, 31, v8
	v_lshlrev_b64 v[8:9], 2, v[8:9]
	v_add_u32_e32 v16, vcc, s8, v8
	v_mov_b32_e32 v10, s9
	v_or_b32_e32 v8, 12, v20                    ; fourth dwordx4 source (+12 elements)
	v_addc_u32_e32 v17, vcc, v10, v9, vcc
	v_ashrrev_i32_e32 v9, 31, v8
	v_lshlrev_b64 v[8:9], 2, v[8:9]
	v_mov_b32_e32 v10, s11
	v_add_u32_e32 v8, vcc, s10, v8
	v_addc_u32_e32 v9, vcc, v10, v9, vcc
	flat_load_dwordx4 v[8:11], v[8:9]
	flat_load_dwordx4 v[12:15], v[12:13]
	s_waitcnt vmcnt(1) lgkmcnt(1)
	v_add_f32_e32 v3, v3, v8
	s_waitcnt vmcnt(0) lgkmcnt(0)
	v_add_f32_e32 v7, v7, v12
	v_max_f32_e32 v7, 0, v7                     ; ReLU
	flat_store_dword v[16:17], v7
	v_add_u32_e32 v16, vcc, 0x6e400, v23        ; slot 9
	v_ashrrev_i32_e32 v17, 31, v16
	v_lshlrev_b64 v[16:17], 2, v[16:17]
	v_add_f32_e32 v6, v6, v13
	v_mov_b32_e32 v7, s9
	v_add_u32_e32 v16, vcc, s8, v16
	v_addc_u32_e32 v17, vcc, v7, v17, vcc
	v_max_f32_e32 v6, 0, v6                     ; ReLU
	flat_store_dword v[16:17], v6
	v_add_u32_e32 v6, vcc, 0x7a800, v23         ; slot 10
	v_ashrrev_i32_e32 v7, 31, v6
	v_lshlrev_b64 v[6:7], 2, v[6:7]
	v_add_f32_e32 v5, v5, v14
	v_mov_b32_e32 v12, s9
	v_add_u32_e32 v6, vcc, s8, v6
	v_addc_u32_e32 v7, vcc, v12, v7, vcc
	v_max_f32_e32 v5, 0, v5                     ; ReLU
	flat_store_dword v[6:7], v5
	v_add_u32_e32 v6, vcc, 0x86c00, v23         ; slot 11
	v_ashrrev_i32_e32 v7, 31, v6
	v_add_u32_e32 v5, vcc, 0x93000, v23         ; slot 12
	v_lshlrev_b64 v[6:7], 2, v[6:7]
	v_add_f32_e32 v4, v4, v15
	v_add_u32_e32 v6, vcc, s8, v6
	v_max_f32_e32 v4, 0, v4                     ; ReLU
	v_addc_u32_e32 v7, vcc, v12, v7, vcc
	flat_store_dword v[6:7], v4
	v_ashrrev_i32_e32 v6, 31, v5
	v_max_f32_e32 v7, 0, v3                     ; ReLU
	v_lshlrev_b64 v[3:4], 2, v[5:6]
	v_mov_b32_e32 v5, s9
	v_add_u32_e32 v3, vcc, s8, v3
	v_addc_u32_e32 v4, vcc, v5, v4, vcc
	v_add_f32_e32 v2, v2, v9
	flat_store_dword v[3:4], v7
	v_max_f32_e32 v4, 0, v2                     ; ReLU
	v_add_u32_e32 v2, vcc, s0, v24              ; slot 13: base + 0x27d000 bytes
	v_addc_u32_e32 v3, vcc, 0, v25, vcc
	v_add_f32_e32 v1, v1, v10
	s_mov_b32 s0, 0x2ae000
	flat_store_dword v[2:3], v4
	v_max_f32_e32 v3, 0, v1                     ; ReLU
	v_add_u32_e32 v1, vcc, s0, v24              ; slot 14: base + 0x2ae000 bytes
	v_addc_u32_e32 v2, vcc, 0, v25, vcc
	v_add_f32_e32 v0, v0, v11
	s_mov_b32 s0, 0x2df000
	flat_store_dword v[1:2], v3
	v_max_f32_e32 v2, 0, v0                     ; ReLU
	v_add_u32_e32 v0, vcc, s0, v24              ; slot 15: base + 0x2df000 bytes
	v_addc_u32_e32 v1, vcc, 0, v25, vcc
	flat_store_dword v[0:1], v2
	s_endpgm
.Lfunc_end2:
	.size fuse_conv2d_relu_kernel2, .Lfunc_end2-fuse_conv2d_relu_kernel2
; [scrape artifact — Pastebin footer, not part of the kernel source]
; "Add Comment / Please, Sign In to add comment"