[llvm] bf388f8 - [AMDGPU][True16][CodeGen] legalize operands when move16bit SALU to VALU (#133985)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 09:26:44 PDT 2025
Author: Brox Chen
Date: 2025-04-03T12:26:41-04:00
New Revision: bf388f8a43c26264dfa96a91bead440d19f58bc4
URL: https://github.com/llvm/llvm-project/commit/bf388f8a43c26264dfa96a91bead440d19f58bc4
DIFF: https://github.com/llvm/llvm-project/commit/bf388f8a43c26264dfa96a91bead440d19f58bc4.diff
LOG: [AMDGPU][True16][CodeGen] legalize operands when move16bit SALU to VALU (#133985)
This is a follow-up PR to
https://github.com/llvm/llvm-project/pull/132089.
When a V2S copy and its useMI are lowered to VALU, this patch checks
whether the newly generated VALU instruction is a true16 instruction and,
if so, adds a lo16 subreg access on its operands where necessary.
An example MIR looks like:
```
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ...
%2:sreg_32 = COPY %1:vgpr_32
%3:sreg_32 = S_FLOOR_F16 %2:sreg_32, ...
```
This is currently lowered to:
```
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ...
%2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1:vgpr_32, 0, 0, 0 ...
```
After this patch, it is lowered to:
```
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ...
%2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1.lo16:vgpr_32, 0, 0, 0 ...
```
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 260f80a5f532e..61fda0eef6314 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7228,6 +7228,29 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}
+// 16-bit SALU instructions use sgpr32. When one is lowered to a true16 VALU
+// instruction, sgpr32 becomes vgpr32, which is illegal for 16-bit operands, so
+// add a lo16 subreg access. This can be removed once sgpr16 is in place.
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+ return;
+
+ for (MachineOperand &Op : Inst.explicit_operands()) {
+ unsigned OpIdx = Op.getOperandNo();
+ if (!OpIdx)
+ continue;
+ if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+ unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ const TargetRegisterClass *RC = RI.getRegClass(RCID);
+ if (RI.getRegSizeInBits(*RC) == 16) {
+ Op.setSubReg(AMDGPU::lo16);
+ }
+ }
+ }
+}
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -7613,6 +7636,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.add(Inst.getOperand(0))
.add(Inst.getOperand(1));
}
+ legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
@@ -7682,6 +7706,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addImm(0) // omod
.addImm(0); // opsel0
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
Inst.eraseFromParent();
@@ -7747,6 +7772,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// If this is a v2s copy src from vgpr16 to sgpr32,
// replace vgpr copy to subreg_to_reg
+ // This can be removed after we have sgpr16 in place
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
@@ -7785,11 +7811,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
MachineOperand Src = Inst.getOperand(1);
- if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
- Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
- NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
- else
- NewInstr->addOperand(Src);
+ NewInstr->addOperand(Src);
}
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
@@ -7863,6 +7885,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Check useMI of NewInstr. If used by a true16 instruction,
// add a lo16 subreg access if size mismatched
+ // This can be removed after we have sgpr16 in place
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
E = MRI.use_end();
@@ -7878,6 +7901,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
}
fixImplicitOperands(*NewInstr);
+
+ legalizeOperandsVALUt16(*NewInstr, MRI);
+
// Legalize the operands
legalizeOperands(*NewInstr, MDT);
if (NewDstReg)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 79ef1432d512a..d63225c067c9d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1279,6 +1279,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// was moved to VGPR. \returns true if succeeded.
bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
+ /// Fix operands in Inst to fix 16bit SALU to VALU lowering.
+ void legalizeOperandsVALUt16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
+
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
/// in the \p WorkList to the VALU if necessary. If present, \p MDT is
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 419f57972a485..137a9aaea6a77 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -1,6 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
+---
+name: cmp_f16
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: cmp_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+ ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+ %3:sreg_32 = COPY %2:vgpr_16
+ nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+ %4:sreg_32_xm0_xexec = COPY $scc
+ %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
+
---
name: cvt_hi_f32_f16
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 23e4b80b61f69..8bc8eefad6bf7 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -1,19 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
-# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=REAL16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=FAKE16 %s
---
name: fmac_f16
body: |
bb.0:
- ; GCN-LABEL: name: fmac_f16
- ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; REAL16-LABEL: name: fmac_f16
+ ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; FAKE16-LABEL: name: fmac_f16
+ ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = IMPLICIT_DEF
%2:sreg_32 = IMPLICIT_DEF
More information about the llvm-commits
mailing list