[llvm] [AMDGPU][True16][CodeGen] si-fix-sgpr-copies legalize size mismatched V2S copy with subreg case (PR #161290)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 3 06:21:30 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
In true16 mode, isel places 16-bit values in both vgpr16 and sreg32 registers, so si-fix-sgpr-copies needs to legalize register-size-mismatched instructions during the moveToVALU lowering process.
For special instructions, we previously legalized:
```
1. sreg_32 = copy vgpr_16
2. sgpr_lo16 = copy vgpr_32
```
With https://github.com/llvm/llvm-project/pull/144819, case 2 was removed, so this patch removes the case 2 support and adds the missing subreg cases:
```
copy:
1. sreg_32 = copy .lo16:vgpr_32
2. sreg_32 = copy .hi16:vgpr_32
reg_sequence:
vgpr_32 = reg_sequence(sreg32, lo16/hi16, ...)
```
This is a follow-up to https://github.com/llvm/llvm-project/pull/160891
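For illustration, here is a minimal sketch of the two new rewrites in MIR-like notation (virtual register numbers are made up for the example; the exact output depends on the surrounding moveToVALU lowering):
```
; copy with a 16-bit subreg source, before legalization
%2:sreg_32 = COPY %1.hi16
; after: the high half is filled from an IMPLICIT_DEF, and the destination is
; re-created as a vgpr_32 later in moveToVALU
%3:vgpr_16 = IMPLICIT_DEF
%2:vgpr_32 = REG_SEQUENCE %1.hi16, %subreg.lo16, %3, %subreg.hi16

; reg_sequence with a 32-bit VGPR feeding a 16-bit slot, before
%6:vgpr_32 = REG_SEQUENCE %4:vgpr_32, %subreg.lo16, %5, %subreg.hi16
; after: the 32-bit source is read through its lo16 subregister
%6:vgpr_32 = REG_SEQUENCE %4.lo16, %subreg.lo16, %5, %subreg.hi16
```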
---
Patch is 152.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161290.diff
11 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+67-32)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+14-34)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+32-72)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+32-92)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+30-110)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+30-130)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+30-150)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir (+35-11)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+36-54)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll (+5-5)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea866342c2..10283a2206732 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
return;
unsigned Opcode = MI.getOpcode();
+ if (Opcode == AMDGPU::REG_SEQUENCE) {
+ legalizeSpecialInst_t16(MI, MRI);
+ return;
+ }
+
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7565,6 +7570,65 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
+// Legalize operands of size-mismatched special instructions between 16-bit and
+// 32-bit in moveToVALU lowering in true16 mode. This is caused by isel placing
+// 16-bit values in both vgpr16 and sreg32. Cases handled:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
+// 3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
+//
+// Reg_sequence
+// dst32 = reg_sequence(vgpr32, lo16/hi16)
+// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
+// This can be removed after we have sgpr16 in place.
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ switch (Opcode) {
+ case AMDGPU::COPY: {
+ Register SrcReg = Inst.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+ return;
+
+ bool SetSubReg = false;
+ Register SrcSubReg = Inst.getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+ } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+ (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+ SetSubReg = true;
+ } else
+ return;
+
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::IMPLICIT_DEF), Undef);
+ Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+ if (SetSubReg)
+ Inst.getOperand(1).setSubReg(SrcSubReg);
+
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+ Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+ } break;
+ case AMDGPU::REG_SEQUENCE: {
+ for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+ Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+ auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+ if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+ MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+ (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+ Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+ }
+ }
+ } break;
+ }
+}
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -8083,6 +8147,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
+ if (ST.useRealTrue16Insts())
+ legalizeSpecialInst_t16(Inst, MRI);
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
@@ -8105,38 +8172,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence/extract_subreg
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Inst.getOperand(1).getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Inst.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
- AMDGPU::lo16)) {
- Inst.getOperand(1).setSubReg(AMDGPU::lo16);
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- }
- }
-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 31a2d55e1baad..9728ed4b6e002 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1375,6 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
+ void legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 47cb6bd3b3bb6..03672fb760586 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -4913,12 +4913,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -8342,12 +8340,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -12629,12 +12625,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -16043,12 +16037,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -19655,12 +19647,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -23094,12 +23084,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -25911,12 +25899,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -29258,12 +29244,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -31057,12 +31041,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31074,12 +31056,12 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -31103,11 +31085,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
@@ -31123,7 +31105,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
@@ -31168,9 +31150,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -32879,12 +32861,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32896,12 +32876,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -32925,11 +32905,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
@@ -32945,7 +32925,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
@@ -32990,9 +32970,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2cc7c448b2e11..afb2e96a0079f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -5328,15 +5328,11 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/161290