[llvm] [AMDGPU][True16][Codegen] remove another packed build_vector pattern from true16 (PR #149861)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 12:29:31 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Remove a 16bit-vgpr32 build_vector pattern from true16 mode. This stops ISel from generating the illegal "vgpr_32 = COPY vgpr_16".
ISel will use the vgpr16 build_vector pattern in true16 mode instead.
---
Patch is 1.95 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149861.diff
38 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+12-14)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+3930-3639)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+388-357)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+822-819)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+77-54)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+33-28)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+1809-1756)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+204-201)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+220-201)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+3423-1758)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+2115-2334)
- (modified) llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll (-5)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+25-38)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+19-26)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+19-26)
- (modified) llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll (+8-19)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+36-44)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+13-5)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+36-38)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+93-115)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+42-78)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+49-91)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+52-88)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+190-228)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+246-120)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll (+39-45)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+12-24)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+22-40)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+22-40)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+22-40)
- (modified) llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll (+28-46)
- (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+78-80)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b4501226732..06e40236f65b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2174,7 +2174,6 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
-
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -3637,13 +3636,24 @@ def : GCNPat <
>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in
+let True16Predicate = p in {
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
+// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
+ (Ty !if(!eq(Ty, i16),
+ (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
+ (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3669,18 +3679,6 @@ def : GCNPat <
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
>;
-
-// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
-// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
-def : GCNPat <
- (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
- (Ty !if(!eq(Ty, i16),
- (Ty (trunc (srl VGPR_32:$b, (i32 16)))),
- (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
- (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
->;
-
-
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
let True16Predicate = NotHasTrue16BitInsts in {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..46b82d3a3d651 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -25961,22 +25961,64 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
-; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v63, v31 :: v_dual_mov_b32 v62, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v61, v29 :: v_dual_mov_b32 v60, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v59, v27 :: v_dual_mov_b32 v58, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, v25 :: v_dual_mov_b32 v56, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v23 :: v_dual_mov_b32 v54, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v21 :: v_dual_mov_b32 v52, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v19 :: v_dual_mov_b32 v50, v18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v17 :: v_dual_mov_b32 v48, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v47, v15 :: v_dual_mov_b32 v46, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v45, v13 :: v_dual_mov_b32 v44, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v43, v11 :: v_dual_mov_b32 v42, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32
@@ -25984,539 +26026,551 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v47, v36, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v34 :: v_dual_cndmask_b32 v46, v35, v32
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v13, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v45, v33, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v14, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v15, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v11
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v12.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v15, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v14, v32 :: v_dual_and_b32 v12, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_cndmask_b32 v43, v15, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v14, v32, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v10.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v9, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v12, v14 :: v_dual_and_b32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VA...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/149861
More information about the llvm-commits
mailing list