[llvm] [AMDGPU][True16][CodeGen] buildvector pattern in true16 (PR #118904)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 5 16:43:40 PST 2024


https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/118904

buildvector pattern in true16

>From 77096111a430997751127307b907b990efa52a69 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Dec 2024 17:03:38 -0500
Subject: [PATCH] buildvector pattern in True16

---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  15 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  20 +
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 786 +++++++++---------
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  82 +-
 4 files changed, 440 insertions(+), 463 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3be865f03df1fd..707ad0caa83042 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -700,9 +700,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
     return true;
 
   // TODO: This should probably be a combine somewhere
-  // (build_vector $src0, undef)  -> copy $src0
   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
   if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+    if (Subtarget->useRealTrue16Insts() && IsVector) {
+      // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
+      // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
+      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
+          .addReg(Undef)
+          .addReg(Src0)
+          .addImm(AMDGPU::lo16);
+      MI.eraseFromParent();
+      return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
+             RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
+    }
+    // (build_vector $src0, undef)  -> copy $src0
     MI.setDesc(TII.get(AMDGPU::COPY));
     MI.removeOperand(2);
     const auto &RC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc25d75131cc35..c822a3ea0d8667 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3275,6 +3275,8 @@ def : GCNPat <
   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
 >;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
   (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3284,6 +3286,7 @@ def : GCNPat <
   (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;
+}
 
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3293,6 +3296,8 @@ def : GCNPat <
 }
 
 let SubtargetPredicate = HasVOP3PInsts in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat <
   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
   (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3322,12 +3327,25 @@ def : GCNPat <
   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 // Take the lower 16 bits from each VGPR_32 and concat them
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
   (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
 >;
 
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
+  (REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
+>;
+// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
+def : GCNPat <
+  (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+>;
+}
 
 // Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
 // Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3353,6 +3371,8 @@ def : GCNPat <
 
 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
 // Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector>
     (Ty !if(!eq(Ty, i16),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9a647f04d43da6..230a62f447938c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -9341,7 +9341,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fadd_v3bf16:
@@ -13109,7 +13109,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fsub_v3bf16:
@@ -13778,7 +13778,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fmul_v3bf16:
@@ -17856,7 +17856,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_minnum_v3bf16:
@@ -21475,7 +21475,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
@@ -28670,8 +28670,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
@@ -28774,9 +28774,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -28897,18 +28897,18 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
@@ -30480,8 +30480,8 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -31001,7 +31001,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -31921,7 +31921,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -32755,7 +32755,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -33278,7 +33278,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -34047,8 +34047,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -34745,9 +34745,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_cndmask_b32 v0, v2, v1
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_v2bf16:
@@ -34851,13 +34851,14 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v2bf16:
@@ -35034,14 +35035,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s2
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s1
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX11FAKE16-LABEL: s_select_v2bf16:
@@ -35148,19 +35149,19 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s0
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -36868,33 +36869,32 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s1
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s1, s2, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, s0
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s4
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s5
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX11FAKE16-LABEL: s_vselect_v4bf16:
@@ -37070,29 +37070,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v7.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v4bf16:
@@ -37360,51 +37359,52 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v15.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v11
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v16, v17, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v13.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v18.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v19.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v17, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, v13.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v17.l
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.h, v7.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v13, v15, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v6.l
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v10, v14, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, v5.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v8bf16:
@@ -38016,101 +38016,101 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v49.l, v26.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v50.l, v18.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v33.l, v30.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v34.l, v22.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v53.l, v24.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v54.l, v16.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v35.l, v29.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v36.l, v21.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v51.l, v25.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v33, v34, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v52.l, v17.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v37.l, v28.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v38.l, v20.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v36, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v48.l, v19.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v37, v38, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v22.l, v36.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v35.l
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v20.l, v48.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v39.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v21.l, v38.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v29.l, v37.l
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v39, v48, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v49, v50, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v19.l, v50.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v27.l, v49.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v32.l, v23.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v51, v52, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v18.l, v52.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v53, v54, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v26.l, v51.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v54.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v30, v22, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v25.l, v53.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v29, v21, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.h, v13.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v22, v30, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v28, v20, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v21, v29, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v27, v19, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v24, v16, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v25, v17, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.h, v9.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v20, v28, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v26, v18, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v10.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.h, v11.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v16, v19, v27, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.h, v7.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v6.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v12.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v26, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v18.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v25, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, v16.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v17.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v31
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v10, v23, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v3, v23, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v32, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.h, v8.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v16bf16:
@@ -39620,249 +39620,241 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    s_clause 0x1f
 ; GFX11TRUE16-NEXT:    scratch_load_u16 v31, off, s32
-; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:128
-; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:64
-; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:60
-; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:120
-; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:56
-; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:116
-; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:52
-; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:112
-; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:48
-; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:108
-; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:44
-; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:104
-; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
-; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
-; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:96
-; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:32
-; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:92
-; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:28
-; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:88
-; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:24
-; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:84
-; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:20
-; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:80
-; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:16
-; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
-; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:12
-; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:72
-; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
-; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
-; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
+; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:64
+; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:128
+; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:60
+; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:56
+; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:120
+; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:52
+; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:116
+; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:48
+; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:112
+; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:44
+; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:108
+; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:40
+; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:104
+; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:36
+; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:100
+; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:32
+; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:96
+; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:28
+; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:92
+; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:24
+; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:88
+; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:20
+; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:84
+; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:16
+; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:80
+; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:12
+; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:8
+; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:72
+; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:4
+; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:68
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v96.l, v32.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v32
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v33.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v32, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v98.l, v34.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v35.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 16, v34
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v100.l, v36.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v101.l, v37.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v36
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v102.l, v38.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v103.l, v39.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v112.l, v48.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v113.l, v49.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v38
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v34, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v114.l, v50.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 16, v50
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v115.l, v51.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v51
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v116.l, v52.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v117.l, v53.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v52
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v26, v37, v36, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v118.l, v54.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v54
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v119.l, v55.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v128.l, v64.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v129.l, v65.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v130.l, v66.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v131.l, v67.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v132.l, v68.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v133.l, v69.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v134.l, v70.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v135.l, v71.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v146.l, v82.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v147.l, v83.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v96, v96, v97, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v85.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v98, v98, v99, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v144.l, v80.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v145.l, v81.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v26, v100, v101, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v24, v102, v103, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v55
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v53
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v24, v39, v38, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v22, v112, v113, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v20, v114, v115, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v48
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v49
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v39
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v22, v49, v48, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v48.l, v115.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v49.l, v114.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v20, v51, v50, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v18, v116, v117, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v50.l, v117.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v51.l, v116.l
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v16, v118, v119, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v18, v53, v52, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v52.l, v119.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v53.l, v118.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v38.l, v113.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v39.l, v112.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v16, v55, v54, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v14, v128, v129, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v130, v131, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v37
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v132, v133, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v36.l, v103.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v14, v65, v64, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v37.l, v102.l
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v134, v135, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v33
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v67, v66, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v144, v145, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v35
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v146, v147, vcc_lo
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v69, v68, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v64
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 16, v65
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 16, v66
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v71, v70, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v67
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v54.l, v129.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v55.l, v128.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v81, v80, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v68
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v69
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v64.l, v131.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v83, v82, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v65.l, v130.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 16, v70
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v71
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v85, v84, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v31, 1, v31
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v66.l, v133.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v67.l, v132.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v80
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v87.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v30, v97, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v84.l, v85.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v28, v99, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v87, v86, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v85.l, v87.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v81
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v68.l, v135.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v69.l, v134.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v82
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v31, v97, v96, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v83
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v70.l, v145.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v71.l, v144.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v81.l, v146.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v29, v99, v98, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v80.l, v147.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v84
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v85
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v86
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v27, v101, v100, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v87
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.h, v29.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v25, v36, v37, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v12.h, v25.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v23, v38, v39, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v21, v48, v49, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.h, v21.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v19, v50, v51, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v17, v52, v53, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v32.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.h, v31.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.h, v17.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v36, v54, v55, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v26.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.h, v27.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v37, v64, v65, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, v22.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.h, v23.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.h, v37.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v38, v66, v67, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v18.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.h, v19.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v39, v68, v69, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v14.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v34.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.h, v36.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.h, v39.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v48, v70, v71, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v10.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v20.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.h, v38.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v49, v80, v81, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v30, v84, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v6.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v12.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v12.l, v24.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, v49.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v30, v33, v30, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v28, v85, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v14, v29, v98, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v15, v31, v96, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v28, v35, v28, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v30.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.h, v48.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v28.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
@@ -40521,8 +40513,8 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v0, v8, vcc_lo
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v1, 0x7060302
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v3, 16
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fma_v3bf16:
@@ -41492,7 +41484,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 72e86f1f6f9992..e5df5d3e77a239 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -545,14 +545,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v4.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -664,14 +658,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -743,13 +731,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v5.l, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v3, v4, s0, 0x7fff
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -815,20 +803,14 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v5, 0x7fff
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v5
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v5
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v6.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v4, v5
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v4.l
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -957,14 +939,8 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v4.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -1143,27 +1119,15 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v6
-; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v6
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v6
 ; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v5, 0xffff8000, v5, v6
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v1.l, v4.l
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v7.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v7.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v8.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1329,22 +1293,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v4.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v4.l, v6.l
 ; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v5.l, v7.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
-; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i16:



More information about the llvm-commits mailing list