[llvm] [AMDGPU][True16][CodeGen] si-fix-sgpr-copies legalize size mismatched V2S copy with subreg case (PR #161290)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 1 20:56:36 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/161290
>From 1807334ce3500d937b5401af53c1c626e610f63a Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 29 Sep 2025 14:49:57 -0400
Subject: [PATCH 1/2] additional case for mismatched size copy
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 87 +++++----
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
.../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 48 ++---
.../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 104 ++++------
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 124 ++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 140 +++-----------
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 160 +++-------------
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 180 +++---------------
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 30 +++
9 files changed, 255 insertions(+), 620 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea866342c2..3bc89d7853098 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7565,6 +7565,52 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
+/// Legalize a size mismatch between 16-bit and 32-bit registers in a V2S copy
+/// by lowering the copy itself. Cases:
+///   1. sreg32 = copy vgpr16
+///        => vgpr32 = REG_SEQUENCE(vgpr16, lo16, undef, hi16)
+///   2. sreg32 = copy .lo16:vgpr32  /  sreg32 = copy .hi16:vgpr32
+///        => vgpr16 = copy .lo16/.hi16:vgpr32
+///           vgpr32 = REG_SEQUENCE(vgpr16, lo16, undef, hi16)
+///   3. sgpr16 = copy vgpr32/... is skipped (isel does not generate sgpr16).
+/// This can be removed once sgpr16 is in place.
+///
+/// \returns true if \p Copy was rewritten: every use of its original
+/// destination then refers to the new VGPR, and that register's users have
+/// been passed to addUsersToMoveToVALUWorklist. Returns false, leaving
+/// \p Copy untouched, when neither case 1 nor case 2 applies.
+bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
+                                     MachineRegisterInfo &MRI,
+                                     SIInstrWorklist &Worklist) const {
+  const Register DstReg = Copy.getOperand(0).getReg();
+  Register SrcReg = Copy.getOperand(1).getReg();
+  // A subregister index is a plain unsigned value, not a Register.
+  const unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
+  const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
+
+  // Case 1: the source class can serve as the lo16 part of the new
+  // destination class, so the copy is replaced outright.
+  const bool ReplaceCopy =
+      RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16) != nullptr;
+  // Case 2: a 16-bit half of a 32-bit VGPR is copied into a 32-bit register.
+  const bool CopiesHalfOfVGPR32 =
+      NewDstRC == &AMDGPU::VGPR_32RegClass &&
+      (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16);
+  if (!ReplaceCopy && !CopiesHalfOfVGPR32)
+    return false;
+
+  const bool KeepCopy = !ReplaceCopy;
+  if (KeepCopy) {
+    // Retarget the existing copy so that it extracts the requested half into
+    // a fresh 16-bit VGPR; that register then feeds the REG_SEQUENCE below.
+    const Register HalfReg =
+        MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+    Copy.getOperand(0).setReg(HalfReg);
+    SrcReg = HalfReg;
+  }
+
+  MachineBasicBlock &MBB = *Copy.getParent();
+  const DebugLoc &DL = Copy.getDebugLoc();
+  const Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+  const Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+
+  // The hi16 half of the widened value is left undefined.
+  BuildMI(MBB, &Copy, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+  // Insert after the copy: in case 2 the REG_SEQUENCE reads the copy's result.
+  BuildMI(MBB, std::next(Copy.getIterator()), DL, get(AMDGPU::REG_SEQUENCE),
+          NewDstReg)
+      .addReg(SrcReg)
+      .addImm(AMDGPU::lo16)
+      .addReg(Undef)
+      .addImm(AMDGPU::hi16);
+
+  // DL refers into Copy, so it must not be used past this point in case 1.
+  if (!KeepCopy)
+    Copy.eraseFromParent();
+
+  MRI.replaceRegWith(DstReg, NewDstReg);
+  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+  return true;
+}
+
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -8083,6 +8129,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
+ // If this is a v2s copy between 16bit and 32bit reg,
+ // replace the vgpr copy with a reg_sequence
+ if (ST.useRealTrue16Insts() && Inst.isCopy() &&
+ Inst.getOperand(1).getReg().isVirtual() &&
+ RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
+ if (legalizeV2SCopyt16(Inst, MRI, Worklist))
+ return;
+ }
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
@@ -8105,38 +8160,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence/extract_subreg
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Inst.getOperand(1).getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Inst.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
- AMDGPU::lo16)) {
- Inst.getOperand(1).setSubReg(AMDGPU::lo16);
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- }
- }
-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 31a2d55e1baad..05e98ad2c63eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1375,6 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
+ bool legalizeV2SCopyt16(MachineInstr &Inst, MachineRegisterInfo &MRI,
+ SIInstrWorklist &Worklist) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 47cb6bd3b3bb6..03672fb760586 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -4913,12 +4913,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -8342,12 +8340,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -12629,12 +12625,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -16043,12 +16037,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -19655,12 +19647,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -23094,12 +23084,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -25911,12 +25899,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -29258,12 +29244,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -31057,12 +31041,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31074,12 +31056,12 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -31103,11 +31085,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
@@ -31123,7 +31105,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
@@ -31168,9 +31150,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -32879,12 +32861,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32896,12 +32876,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -32925,11 +32905,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
@@ -32945,7 +32925,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
@@ -32990,9 +32970,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2cc7c448b2e11..afb2e96a0079f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -5328,15 +5328,11 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -9137,15 +9133,11 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -13805,15 +13797,11 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -17607,15 +17595,11 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -21568,15 +21552,11 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -25389,15 +25369,11 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -28498,15 +28474,11 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -32224,15 +32196,11 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
@@ -34283,15 +34251,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -34301,15 +34265,15 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -34334,14 +34298,14 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
@@ -34356,7 +34320,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
@@ -34364,7 +34328,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
@@ -34404,11 +34368,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s10 :: v_dual_mov_b32 v37, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
@@ -36279,15 +36243,11 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -36297,15 +36257,15 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -36330,14 +36290,14 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
@@ -36352,7 +36312,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
@@ -36360,7 +36320,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
@@ -36400,11 +36360,11 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s10 :: v_dual_mov_b32 v37, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index c35e183fa787f..6dc830c9e38c1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -5805,19 +5805,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -10044,19 +10038,13 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -15153,19 +15141,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -19382,19 +19364,13 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -23764,19 +23740,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -28015,19 +27985,13 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -31492,19 +31456,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -35639,19 +35597,13 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -37964,19 +37916,13 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -37984,17 +37930,17 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -38021,16 +37967,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
@@ -38045,7 +37991,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
@@ -38055,7 +38001,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
@@ -38096,12 +38042,12 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s12 :: v_dual_mov_b32 v39, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s10 :: v_dual_mov_b32 v49, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s13
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
@@ -40168,19 +40114,13 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -40188,17 +40128,17 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -40225,16 +40165,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
@@ -40249,7 +40189,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
@@ -40259,7 +40199,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
@@ -40300,12 +40240,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s12 :: v_dual_mov_b32 v39, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s10 :: v_dual_mov_b32 v49, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s13
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 29005a42d8860..fb0d5ee292d86 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6286,23 +6286,15 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -10946,23 +10938,15 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -16527,23 +16511,15 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -21183,23 +21159,15 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -25980,23 +25948,15 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -30655,23 +30615,15 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -34516,23 +34468,15 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -39081,23 +39025,15 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
@@ -41806,28 +41742,20 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
@@ -41836,11 +41764,11 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -41868,7 +41796,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
@@ -41877,10 +41805,10 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
@@ -41907,7 +41835,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
@@ -41949,13 +41877,13 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v49, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s12 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s40
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
@@ -44258,28 +44186,20 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
@@ -44288,11 +44208,11 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -44320,7 +44240,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
@@ -44329,10 +44249,10 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
@@ -44359,7 +44279,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
@@ -44401,13 +44321,13 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v49, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s12 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s40
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8ee5b966f40b8..8f0e2f74db866 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6779,27 +6779,17 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -11885,27 +11875,17 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -17915,27 +17895,17 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -23006,27 +22976,17 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -28216,27 +28176,17 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -33336,27 +33286,17 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -37545,27 +37485,17 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -42545,27 +42475,17 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
@@ -45566,30 +45486,20 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
@@ -45600,11 +45510,11 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -45634,7 +45544,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
@@ -45645,10 +45555,10 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
@@ -45677,7 +45587,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
@@ -45720,14 +45630,14 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s14 :: v_dual_mov_b32 v53, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s12 :: v_dual_mov_b32 v55, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s10 :: v_dual_mov_b32 v65, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s42
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
@@ -48280,30 +48190,20 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
@@ -48314,11 +48214,11 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -48348,7 +48248,7 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
@@ -48359,10 +48259,10 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
@@ -48391,7 +48291,7 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
@@ -48434,14 +48334,14 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s14 :: v_dual_mov_b32 v53, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s12 :: v_dual_mov_b32 v55, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s10 :: v_dual_mov_b32 v65, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s42
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 967f1a9b442b0..e663be7a6210e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -7240,31 +7240,19 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -12840,31 +12828,19 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -19290,31 +19266,19 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -24867,31 +24831,19 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -30472,31 +30424,19 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -36089,31 +36029,19 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -40632,31 +40560,19 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -46109,31 +46025,19 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
@@ -49421,32 +49325,20 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
@@ -49459,11 +49351,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -49495,7 +49387,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
@@ -49508,10 +49400,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
@@ -49542,7 +49434,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
@@ -49586,15 +49478,15 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s14 :: v_dual_mov_b32 v65, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s10 :: v_dual_mov_b32 v69, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s44
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -52368,32 +52260,20 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
@@ -52406,11 +52286,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -52442,7 +52322,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
@@ -52455,10 +52335,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
@@ -52489,7 +52369,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
@@ -52533,15 +52413,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s14 :: v_dual_mov_b32 v65, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s10 :: v_dual_mov_b32 v69, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s44
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 043bcc343d265..d3702617f2f20 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -131,6 +131,36 @@ body: |
%2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
...
+---
+name: copy_vgpr16_sreg32_lo16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_lo16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].lo16
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.lo16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
+---
+name: copy_vgpr16_sreg32_hi16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_hi16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].hi16
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.hi16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
---
name: copy_vgpr16_sreg32_usedby_salu32
body: |
>From bf55a5093efc38cb727486e598c4ab1e5ae9d99e Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 1 Oct 2025 22:39:14 -0400
Subject: [PATCH 2/2] tmp
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 115 ++++++++++--------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +-
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 24 +---
...-to-valu-pseudo-scalar-trans-f16-true16.ll | 10 +-
4 files changed, 75 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3bc89d7853098..d40165e323562 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7528,6 +7528,13 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
unsigned Opcode = MI.getOpcode();
MachineBasicBlock *MBB = MI.getParent();
+ switch (Opcode) {
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
+ legalizeSpecialInst_t16(MI, MRI);
+ return;
+ }
+
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
OpIdx >= get(Opcode).getNumOperands() ||
@@ -7565,50 +7572,66 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
-// Legalize size mismatches between 16bit and 32bit registers in v2s copy
-// lowering (lower the copy itself). Including cases:
-// 1. sreg32 = copy vgpr16 => vgpr32 = REG_SEQUENCE(vgpr16, lo16)
-// 2. sreg32 = copy .lo16:vgpr32 / sreg32 = copy .hi16:vgpr32
-// => vgpr16 = copy .hi/lo16:vgpr32
-// vgpr32 = REG_SEQUENCE(vgpr16, lo16)
+// Legalize operands of special instructions with a size mismatch between
+// 16-bit and 32-bit registers during moveToVALU lowering in true16 mode. This
+// is caused by isel placing 16-bit values in both vgpr16 and sreg32. Cases:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
// 3. sgpr16 = copy vgpr32/... (skipped, isel do not generate sgpr16)
+//
+// Reg_sequence / Insert_subreg
+// dst32 = reg_sequence(vgpr32, lo16/hi16) /
+// dst32 = insert_subreg(vgpr32, lo16/hi16)
+// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16) /
+// dst32 = insert_subreg(.lo16:vgpr32, lo16/hi16)
+//
// This can be removed after we have sgpr16 in place.
-bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
- MachineRegisterInfo &MRI,
- SIInstrWorklist &Worklist) const {
- Register DstReg = Copy.getOperand(0).getReg();
- Register SrcReg = Copy.getOperand(1).getReg();
- Register SrcSubReg = Copy.getOperand(1).getSubReg();
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
- bool KeepCopy;
-
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- KeepCopy = 0;
- } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
- (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
- KeepCopy = 1;
- Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- Copy.getOperand(0).setReg(NewDstReg);
- SrcReg = NewDstReg;
- } else
- return false;
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ switch (Opcode) {
+ case AMDGPU::COPY: {
+ Register SrcReg = Inst.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+ return;
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Copy.getParent(), &Copy, Copy.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Copy.getParent(), std::next(Copy.getIterator()), Copy.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(SrcReg)
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- if (!KeepCopy)
- Copy.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return true;
+ bool SetSubReg = false;
+ Register SrcSubReg = Inst.getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+ } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+ (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+ SetSubReg = true;
+ } else
+ return;
+
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::IMPLICIT_DEF), Undef);
+ Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+ if (SetSubReg)
+ Inst.getOperand(1).setSubReg(SrcSubReg);
+
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+ Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+ } break;
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG: {
+ for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+ Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+ auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+ if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+ MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+ (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+ Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+ }
+ }
+ } break;
+ }
}
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -8129,14 +8152,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- if (legalizeV2SCopyt16(Inst, MRI, Worklist))
- return;
- }
+ if (ST.useRealTrue16Insts())
+ legalizeSpecialInst_t16(Inst, MRI);
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 05e98ad2c63eb..9728ed4b6e002 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1375,8 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
- bool legalizeV2SCopyt16(MachineInstr &Inst, MachineRegisterInfo &MRI,
- SIInstrWorklist &Worklist) const;
+ void legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index d3702617f2f20..7e81d83907089 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -138,8 +138,7 @@ body: |
; GCN-LABEL: name: copy_vgpr16_sreg32_lo16_usedby_salu16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].lo16
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].lo16, %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = COPY %0.lo16:vgpr_32
@@ -153,8 +152,7 @@ body: |
; GCN-LABEL: name: copy_vgpr16_sreg32_hi16_usedby_salu16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].hi16
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].hi16, %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = COPY %0.hi16:vgpr_32
@@ -187,24 +185,6 @@ body: |
%2:vgpr_16 = V_TRUNC_F16_t16_e64 0, %1:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
----
-name: S_FMAC_F16
-body: |
- bb.0:
- ; GCN-LABEL: name: S_FMAC_F16
- ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
- ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- %0:vgpr_16 = IMPLICIT_DEF
- %1:sgpr_lo16 = COPY %0:vgpr_16
- %2:sreg_32 = COPY %0:vgpr_16
- %3:sreg_32 = COPY %1:sgpr_lo16
- %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
-...
-
---
name: legalize_with_multi_user
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index 3768634c1691c..d65398171840c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -37,7 +37,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -60,7 +60,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -83,7 +83,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -106,7 +106,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
More information about the llvm-commits
mailing list