[llvm] [AMDGPU][True16][CodeGen] si-fix-sgpr-copies legalize size mismatched V2S copy with subreg case (PR #161290)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 10:08:59 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/161290
>From 4a168c6ac550899d78dd80c70e8167e7fd222442 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 29 Sep 2025 14:49:57 -0400
Subject: [PATCH] additional case for mismatched size copy
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 99 +++++++++++++------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
.../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 32 +++---
.../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 72 ++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 76 ++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 76 ++++++--------
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 80 ++++++---------
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 84 ++++++----------
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 46 ++++++---
llvm/test/CodeGen/AMDGPU/frem.ll | 90 +++++++----------
...-to-valu-pseudo-scalar-trans-f16-true16.ll | 10 +-
11 files changed, 313 insertions(+), 354 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..60949de157313 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7616,6 +7616,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
return;
unsigned Opcode = MI.getOpcode();
+ if (Opcode == AMDGPU::REG_SEQUENCE) {
+ legalizeSpecialInst_t16(MI, MRI);
+ return;
+ }
+
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7654,6 +7659,65 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
+// Legalize size-mismatched special instructions (16-bit vs. 32-bit operands)
+// during moveToVALU lowering in true16 mode. The mismatch arises because isel
+// places 16-bit values in both vgpr16 and sreg32. Cases handled:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
+// 3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
+//
+// Reg_sequence
+// dst32 = reg_sequence(vgpr32, lo16/hi16)
+// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
+// This can be removed once sgpr16 is in place.
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); // computed for every opcode, but only read in the COPY case
+ switch (Opcode) {
+ case AMDGPU::COPY: {
+ Register SrcReg = Inst.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+ return; // only copies from a virtual VGPR source are rewritten
+
+ bool SetSubReg = false;
+ Register SrcSubReg = Inst.getOperand(1).getSubReg(); // NOTE(review): holds a subregister index (compared with lo16/hi16 below), not a register
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) { // presumably case 1 above; body is empty, control falls through to the rewrite
+ } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+ (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) { // case 2: source already names a 16-bit half
+ SetSubReg = true;
+ } else
+ return; // neither pattern matched; the COPY is left untouched
+
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::IMPLICIT_DEF), Undef); // undefined vgpr16, used as the hi16 input below
+ Inst.setDesc(get(AMDGPU::REG_SEQUENCE)); // the COPY is mutated in place rather than replaced
+ if (SetSubReg)
+ Inst.getOperand(1).setSubReg(SrcSubReg); // NOTE(review): SrcSubReg was read from this same operand above, so this looks like a no-op - confirm
+
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16)); // operand list becomes: dst, src, lo16, Undef, hi16
+ Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+ } break;
+ case AMDGPU::REG_SEQUENCE: {
+ for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) { // walk the (register, subreg-index) operand pairs that follow operand 0
+ Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+ auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+ if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+ MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) && // NOTE(review): evaluated before the SubReg test on the next line - confirm that ordering is intended
+ (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+ Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16); // source operand is narrowed to its lo16 half; any subreg already on it is overwritten
+ }
+ }
+ } break;
+ }
+}
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -8192,6 +8256,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
+ if (ST.useRealTrue16Insts())
+ legalizeSpecialInst_t16(Inst, MRI);
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
@@ -8222,38 +8289,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence/extract_subreg
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Inst.getOperand(1).getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Inst.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
- AMDGPU::lo16)) {
- Inst.getOperand(1).setSubReg(AMDGPU::lo16);
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- }
- }
-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..b88835d2691ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1460,6 +1460,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
+ void legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 78c1971c50d14..257cd16bb8608 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -33473,7 +33473,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -33486,19 +33485,18 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
@@ -33514,11 +33512,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
@@ -33534,7 +33532,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
@@ -33581,9 +33579,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
@@ -35284,7 +35282,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -35297,19 +35294,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
@@ -35325,11 +35321,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
@@ -35345,7 +35341,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
@@ -35392,9 +35388,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 9cff9c4a9dc65..777a83e5fcd8c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -36755,7 +36755,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -36765,26 +36764,23 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
@@ -36797,14 +36793,14 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -36819,7 +36815,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
@@ -36827,7 +36823,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
@@ -36871,11 +36867,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s10 :: v_dual_mov_b32 v37, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
@@ -38736,7 +38732,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -38746,26 +38741,23 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
@@ -38778,14 +38770,14 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -38800,7 +38792,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
@@ -38808,7 +38800,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
@@ -38852,11 +38844,11 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s10 :: v_dual_mov_b32 v37, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s8 :: v_dual_mov_b32 v39, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 530ff4f30fd05..e004c87ba2844 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -40461,7 +40461,6 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -40469,32 +40468,27 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
@@ -40505,16 +40499,16 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
@@ -40529,7 +40523,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
@@ -40539,7 +40533,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
@@ -40586,12 +40580,12 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s12 :: v_dual_mov_b32 v39, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s10 :: v_dual_mov_b32 v49, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s13
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
@@ -42643,7 +42637,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -42651,32 +42644,27 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
@@ -42687,16 +42675,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
@@ -42711,7 +42699,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
@@ -42721,7 +42709,7 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
@@ -42768,12 +42756,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s12 :: v_dual_mov_b32 v39, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s10 :: v_dual_mov_b32 v49, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v51, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s13
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 93690270fd797..2cd97ac85771f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -44375,12 +44375,11 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
@@ -44389,30 +44388,23 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
@@ -44420,7 +44412,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
@@ -44429,10 +44421,10 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
@@ -44459,7 +44451,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
@@ -44509,13 +44501,13 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v49, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s12 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s40
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
@@ -46800,12 +46792,11 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
@@ -46814,30 +46805,23 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
@@ -46845,7 +46829,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
@@ -46854,10 +46838,10 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
@@ -46884,7 +46868,7 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
@@ -46934,13 +46918,13 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v49, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s12 :: v_dual_mov_b32 v51, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s8 :: v_dual_mov_b32 v55, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s40
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 0e7bca4f61bfb..d41c05bf0a56f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -48206,10 +48206,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
@@ -48220,40 +48219,31 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
@@ -48264,10 +48254,10 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
@@ -48296,7 +48286,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
@@ -48349,14 +48339,14 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s14 :: v_dual_mov_b32 v53, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s12 :: v_dual_mov_b32 v55, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s10 :: v_dual_mov_b32 v65, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s42
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
@@ -50895,10 +50885,9 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
@@ -50909,40 +50898,31 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
@@ -50953,10 +50933,10 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
@@ -50985,7 +50965,7 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
@@ -51038,14 +51018,14 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s14 :: v_dual_mov_b32 v53, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s12 :: v_dual_mov_b32 v55, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s10 :: v_dual_mov_b32 v65, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s8 :: v_dual_mov_b32 v67, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s42
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 37f049de7a633..fa0975b9bd4f0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -52081,8 +52081,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
@@ -52095,42 +52094,31 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
@@ -52143,10 +52131,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
@@ -52177,7 +52165,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
@@ -52233,15 +52221,15 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s14 :: v_dual_mov_b32 v65, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s10 :: v_dual_mov_b32 v69, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s44
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
@@ -54996,8 +54984,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s26, 16
@@ -55010,42 +54997,31 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s41
@@ -55058,10 +55034,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
@@ -55092,7 +55068,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
@@ -55148,15 +55124,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s14 :: v_dual_mov_b32 v65, s13
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s11
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s10 :: v_dual_mov_b32 v69, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s44
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index f64615dcc78f0..3a00f6e97d292 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -131,6 +131,34 @@ body: |
%2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
...
+---
+name: copy_vgpr16_sreg32_lo16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_lo16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].lo16, %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.lo16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
+---
+name: copy_vgpr16_sreg32_hi16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_hi16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].hi16, %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.hi16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
---
name: copy_vgpr16_sreg32_usedby_salu32
body: |
@@ -158,21 +186,17 @@ body: |
...
---
-name: S_FMAC_F16
+name: reg_sequence_vgpr32_sreg32
body: |
bb.0:
- ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN-LABEL: name: reg_sequence_vgpr32_sreg32
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
- ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF1]].lo16, %subreg.lo16, [[DEF]], %subreg.hi16
%0:vgpr_16 = IMPLICIT_DEF
- %1:sgpr_lo16 = COPY %0:vgpr_16
- %2:sreg_32 = COPY %0:vgpr_16
- %3:sreg_32 = COPY %1:sgpr_lo16
- %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:vgpr_32 = REG_SEQUENCE %2:sreg_32, %subreg.lo16, %0:vgpr_16, %subreg.hi16
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d8cbdb11a911f..5586cc0ed18d7 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -538,11 +538,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v5, v4
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v1
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.l, v4.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_3
; GFX11-TRUE16-NEXT: s_branch .LBB0_8
@@ -622,10 +621,9 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v1
; GFX11-TRUE16-NEXT: .LBB0_8: ; %Flow19
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v0.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l|
@@ -770,12 +768,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2
; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s1, s0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2
+; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s2
; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB0_3
; GFX1150-TRUE16-NEXT: s_branch .LBB0_8
@@ -856,12 +853,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4
; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3
+; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX1150-TRUE16-NEXT: .LBB0_8: ; %Flow19
; GFX1150-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0
; GFX1150-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
@@ -1015,12 +1011,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2
; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s1, s0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2
+; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s2
; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB0_3
; GFX1200-TRUE16-NEXT: s_branch .LBB0_8
@@ -1104,12 +1099,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3
+; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX1200-TRUE16-NEXT: .LBB0_8: ; %Flow19
; GFX1200-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0
; GFX1200-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
@@ -5783,11 +5777,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_3
; GFX11-TRUE16-NEXT: s_branch .LBB9_8
@@ -5867,10 +5860,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0
; GFX11-TRUE16-NEXT: .LBB9_8:
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
@@ -5881,11 +5873,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10
; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v8, v7
+; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v3
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v7.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_11
; GFX11-TRUE16-NEXT: s_branch .LBB9_16
@@ -5965,10 +5956,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v6, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.l, v5
-; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v5, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v5, v3
; GFX11-TRUE16-NEXT: .LBB9_16: ; %Flow54
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
@@ -8956,11 +8946,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else86
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v7, v4
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_3
; GFX11-TRUE16-NEXT: s_branch .LBB10_8
@@ -9040,10 +9029,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v0
; GFX11-TRUE16-NEXT: .LBB10_8:
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
@@ -9054,11 +9042,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10
; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v10, v7
+; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v7.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_11
; GFX11-TRUE16-NEXT: s_branch .LBB10_16
@@ -9138,10 +9125,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v8, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v7.l, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v8
+; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v5
; GFX11-TRUE16-NEXT: .LBB10_16:
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v10, |v1.l|
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v9, |v3.l|
@@ -9149,11 +9135,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18
; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v11, v8
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v1
; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.l, v8.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_19
; GFX11-TRUE16-NEXT: s_branch .LBB10_24
@@ -9233,10 +9218,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v8, v9, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v8.l, v8
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v1
; GFX11-TRUE16-NEXT: .LBB10_24:
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3
@@ -9247,11 +9231,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26
; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v14, v11
+; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v9
; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v11.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_27
; GFX11-TRUE16-NEXT: s_branch .LBB10_32
@@ -9331,10 +9314,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_ldexp_f32 v11, v12, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v11.l, v11
-; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v12
+; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v9
; GFX11-TRUE16-NEXT: .LBB10_32: ; %Flow124
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index 3768634c1691c..d65398171840c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -37,7 +37,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -60,7 +60,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -83,7 +83,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -106,7 +106,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
More information about the llvm-commits
mailing list