[llvm] SGPR 16bit folding in true16 (PR #128929)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 11:50:59 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/128929
>From c00f8164ed8ae6905c0454588d10ae49aa383dc2 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 26 Feb 2025 13:39:42 -0500
Subject: [PATCH] 16bit sgpr folding
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 102 +++++++++++++++++++---
1 file changed, 90 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..c45bcefb0c6b3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.OpToFold;
+ // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
+ // Rework once the VS_16 register class is updated to include proper
+ // 16-bit SGPRs instead of 32-bit ones.
+ if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+ Old.setSubReg(AMDGPU::NoSubRegister);
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
Old.setIsUndef(New->isUndef());
return true;
@@ -947,9 +955,14 @@ void SIFoldOperandsImpl::foldOperand(
return;
// FIXME: Fold operands with subregs.
- if (UseOp->isReg() && OpToFold.isReg() &&
- (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
- return;
+ if (UseOp->isReg() && OpToFold.isReg()) {
+ if (UseOp->isImplicit())
+ return;
+ // Allow folding from SGPRs to 16-bit VGPRs.
+ if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+ UseOp->getSubReg() != AMDGPU::lo16)
+ return;
+ }
// Special case for REG_SEQUENCE: We can't fold literals into
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1030,14 +1043,20 @@ void SIFoldOperandsImpl::foldOperand(
return;
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
- if (!DestReg.isPhysical()) {
- if (DestRC == &AMDGPU::AGPR_32RegClass &&
- TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
- UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
- CopiesToReplace.push_back(UseMI);
- return;
- }
+ if (DestRC == &AMDGPU::AGPR_32RegClass &&
+ TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ CopiesToReplace.push_back(UseMI);
+ return;
+ }
+
+ // Allow immediates COPY'd into sgpr_lo16 to be folded further, while the
+ // COPY remains legal if no further folding happens.
+ if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+ assert(ST->useRealTrue16Insts());
+ MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+ DestRC = &AMDGPU::SGPR_32RegClass;
}
// In order to fold immediates into copies, we need to change the
@@ -1073,9 +1092,43 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+ unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
- UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ unsigned SubRegIdx = OpToFold.getSubReg();
+ // Hack to allow 32-bit SGPRs to be folded into True16 instructions.
+ // Remove this once 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+ // VS_16RegClass.
+ //
+ // Excerpt from AMDGPUGenRegisterInfo.inc
+ // NoSubRegister, //0
+ // hi16, // 1
+ // lo16, // 2
+ // sub0, // 3
+ // ...
+ // sub1, // 11
+ // sub1_hi16, // 12
+ // sub1_lo16, // 13
+ static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+ if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isSGPRReg(*MRI, UseReg)) {
+ // Produce the 32-bit subregister index to which the 16-bit subregister
+ // is aligned.
+ if (SubRegIdx > AMDGPU::sub1) {
+ LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ M |= M.getLane(M.getHighestLane() - 1);
+ SmallVector<unsigned, 4> Indexes;
+ TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+ Indexes);
+ assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+ SubRegIdx = Indexes[0];
+ // 32-bit registers do not have a sub0 index
+ } else if (TII->getOpSize(*UseMI, 1) == 4)
+ SubRegIdx = 0;
+ else
+ SubRegIdx = AMDGPU::sub0;
+ }
+ UseMI->getOperand(1).setSubReg(SubRegIdx);
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
@@ -1713,6 +1766,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;
+ // True16: Fix the malformed 16-bit SGPR COPY produced by peephole-opt.
+ // This code can be removed once proper 16-bit SGPRs are implemented.
+ // Example: Pre-peephole-opt
+ // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+ // %32:sreg_32 = COPY %29:sgpr_lo16
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // Post-peephole-opt and DCE
+ // %32:sreg_32 = COPY %16.lo16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After this transform
+ // %32:sreg_32 = COPY %16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After the fold operands pass
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+ if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+ OpToFold.getSubReg()) {
+ const TargetRegisterClass *DstRC =
+ MRI->getRegClass(MI.getOperand(0).getReg());
+ if (DstRC == &AMDGPU::SReg_32RegClass &&
+ DstRC == MRI->getRegClass(OpToFold.getReg())) {
+ assert(OpToFold.getSubReg() == AMDGPU::lo16);
+ OpToFold.setSubReg(0);
+ }
+ }
+
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
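For context, here is a rough sketch of the kind of fold this patch enables on a
real-True16 target. The virtual register numbers, the V_ADD_F16_t16_e64 use, and
the elided operand lists are illustrative only, not taken from an actual test.

  Before si-fold-operands:
    %1:sreg_32 = S_MOV_B32 ...
    %2:vgpr_32 = COPY %1:sreg_32
    %3:vgpr_16 = V_ADD_F16_t16_e64 ... %2.lo16:vgpr_32 ...
  After the fold (the lo16 subregister is dropped because VS_16 currently
  contains 32-bit SGPRs rather than proper 16-bit ones):
    %3:vgpr_16 = V_ADD_F16_t16_e64 ... %1:sreg_32 ...

For 16-bit subregisters above sub1 (say, a hypothetical %s.sub1_lo16 of an
sreg_64 source), the new code maps the 16-bit index to the aligned 32-bit index
(sub1 in that example) via the covering-subreg query, so the rewritten COPY
reads a whole 32-bit SGPR.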