[llvm] [AMDGPU][True16][CodeGen] si-fix-sgpr-copies legalize size mismatched V2S copy with subreg case (PR #161290)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 09:06:11 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/161290
>From 8440f0e1b8388c122351e4051164be97cb3bb36f Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 29 Sep 2025 14:49:57 -0400
Subject: [PATCH] addtional case for mismatched size copy
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 99 +++++++++++++------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 46 ++++++---
...-to-valu-pseudo-scalar-trans-f16-true16.ll | 10 +-
4 files changed, 109 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..60949de157313 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7616,6 +7616,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
return;
unsigned Opcode = MI.getOpcode();
+ if (Opcode == AMDGPU::REG_SEQUENCE) {
+ legalizeSpecialInst_t16(MI, MRI);
+ return;
+ }
+
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7654,6 +7659,65 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
+// Legalize operands of size-mismatches special inst between 16bit and 32bit
+// in moveToVALU lowering in true16 mode. This caused by 16bit
+// placed in both vgpr16 and sreg32 by isel. Including cases:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
+// 3. sgpr16 = copy vgpr32/... (skipped, isel do not generate sgpr16)
+//
+// Reg_sequence
+// dst32 = reg_sequence(vgpr32, lo16/hi16)
+// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
+// This can be removed after we have sgpr16 in place.
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ switch (Opcode) {
+ case AMDGPU::COPY: {
+ Register SrcReg = Inst.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+ return;
+
+ bool SetSubReg = false;
+ Register SrcSubReg = Inst.getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+ } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+ (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+ SetSubReg = true;
+ } else
+ return;
+
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::IMPLICIT_DEF), Undef);
+ Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+ if (SetSubReg)
+ Inst.getOperand(1).setSubReg(SrcSubReg);
+
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+ Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+ } break;
+ case AMDGPU::REG_SEQUENCE: {
+ for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+ Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+ auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+ if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+ MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+ (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+ Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+ }
+ }
+ } break;
+ }
+}
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -8192,6 +8256,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
+ if (ST.useRealTrue16Insts())
+ legalizeSpecialInst_t16(Inst, MRI);
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
Register NewDstReg = Inst.getOperand(1).getReg();
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
@@ -8222,38 +8289,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence/extract_subreg
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Inst.getOperand(1).getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Inst.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
- AMDGPU::lo16)) {
- Inst.getOperand(1).setSubReg(AMDGPU::lo16);
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- }
- }
-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..b88835d2691ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1460,6 +1460,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
+ void legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index f64615dcc78f0..3a00f6e97d292 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -131,6 +131,34 @@ body: |
%2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
...
+---
+name: copy_vgpr16_sreg32_lo16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_lo16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].lo16, %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.lo16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
+---
+name: copy_vgpr16_sreg32_hi16_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: copy_vgpr16_sreg32_hi16_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].hi16, %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0.hi16:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+...
+
---
name: copy_vgpr16_sreg32_usedby_salu32
body: |
@@ -158,21 +186,17 @@ body: |
...
---
-name: S_FMAC_F16
+name: reg_sequence_vgpr32_sreg32
body: |
bb.0:
- ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN-LABEL: name: reg_sequence_vgpr32_sreg32
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
- ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF1]].lo16, %subreg.lo16, [[DEF]], %subreg.hi16
%0:vgpr_16 = IMPLICIT_DEF
- %1:sgpr_lo16 = COPY %0:vgpr_16
- %2:sreg_32 = COPY %0:vgpr_16
- %3:sreg_32 = COPY %1:sgpr_lo16
- %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:vgpr_32 = REG_SEQUENCE %2:sreg_32, %subreg.lo16, %0:vgpr_16, %subreg.hi16
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index 3768634c1691c..d65398171840c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -37,7 +37,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -60,7 +60,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -83,7 +83,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -106,7 +106,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
More information about the llvm-commits
mailing list