[llvm] AMDGPU: Factor agpr reg_sequence folding into a function (PR #129002)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 26 21:12:04 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
---
Full diff: https://github.com/llvm/llvm-project/pull/129002.diff
1 file affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+87-79)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a019dbaad02c..f1ba199fbae3f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+ bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
bool tryFoldFoldableCopy(MachineInstr &MI,
MachineOperand *&CurrentKnownM0Val) const;
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
- unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
OpToFold.setIsKill(false);
// Remove kill flags as kills may now be out of order with uses.
- MRI->clearKillFlags(OpToFold.getReg());
-
- // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
- // can only accept VGPR or inline immediate. Recreate a reg_sequence with
- // its initializers right here, so we will rematerialize immediates and
- // avoid copies via different reg classes.
- SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
- getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- const DebugLoc &DL = UseMI->getDebugLoc();
- MachineBasicBlock &MBB = *UseMI->getParent();
-
- UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
- for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
- UseMI->removeOperand(I);
-
- MachineInstrBuilder B(*MBB.getParent(), UseMI);
- DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
- SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
- for (unsigned I = 0; I < Size / 4; ++I) {
- MachineOperand *Def = Defs[I].first;
- TargetInstrInfo::RegSubRegPair CopyToVGPR;
- if (Def->isImm() &&
- TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- int64_t Imm = Def->getImm();
-
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
- B.addReg(Tmp);
- } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
- auto Src = getRegSubRegPair(*Def);
- Def->setIsKill(false);
- if (!SeenAGPRs.insert(Src)) {
- // We cannot build a reg_sequence out of the same registers, they
- // must be copied. Better do it here before copyPhysReg() created
- // several reads to do the AGPR->VGPR->AGPR copy.
- CopyToVGPR = Src;
- } else {
- B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
- Src.SubReg);
- }
- } else {
- assert(Def->isReg());
- Def->setIsKill(false);
- auto Src = getRegSubRegPair(*Def);
-
- // Direct copy from SGPR to AGPR is not possible. To avoid creation
- // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
- // create a copy here and track if we already have such a copy.
- if (TRI->isSGPRReg(*MRI, Src.Reg)) {
- CopyToVGPR = Src;
- } else {
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
- B.addReg(Tmp);
- }
- }
-
- if (CopyToVGPR.Reg) {
- auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
- Register &Vgpr = It->second;
- if (Inserted) {
- Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
- }
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
- B.addReg(Tmp);
- }
-
- B.addImm(Defs[I].second);
- }
- LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
- }
-
- return;
+ MRI->clearKillFlags(UseReg);
+ if (foldCopyToAGPRRegSequence(UseMI))
+ return;
}
unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
return true;
}
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+ // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+ // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+ // initializers right here, so we will rematerialize immediates and avoid
+ // copies via different reg classes.
+ if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+ return false;
+ Register UseReg = CopyMI->getOperand(1).getReg();
+ SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+ return false;
+
+ const DebugLoc &DL = CopyMI->getDebugLoc();
+ MachineBasicBlock &MBB = *CopyMI->getParent();
+
+ CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+ for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+ CopyMI->removeOperand(I);
+
+ MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+ DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+ SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+ for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
+ MachineOperand *Def = Defs[I].first;
+ TargetInstrInfo::RegSubRegPair CopyToVGPR;
+ if (Def->isImm() &&
+ TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ int64_t Imm = Def->getImm();
+
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+ .addImm(Imm);
+ B.addReg(Tmp);
+ } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+ auto Src = getRegSubRegPair(*Def);
+ Def->setIsKill(false);
+ if (!SeenAGPRs.insert(Src)) {
+ // We cannot build a reg_sequence out of the same registers, they
+ // must be copied. Better do it here before copyPhysReg() created
+ // several reads to do the AGPR->VGPR->AGPR copy.
+ CopyToVGPR = Src;
+ } else {
+ B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+ }
+ } else {
+ assert(Def->isReg());
+ Def->setIsKill(false);
+ auto Src = getRegSubRegPair(*Def);
+
+ // Direct copy from SGPR to AGPR is not possible. To avoid creation
+ // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+ // create a copy here and track if we already have such a copy.
+ if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+ CopyToVGPR = Src;
+ } else {
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+ B.addReg(Tmp);
+ }
+ }
+
+ if (CopyToVGPR.Reg) {
+ auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+ Register &Vgpr = It->second;
+ if (Inserted) {
+ Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+ }
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+ .addReg(Vgpr);
+ B.addReg(Tmp);
+ }
+
+ B.addImm(Defs[I].second);
+ }
+ LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+ return true;
+}
+
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
Register DstReg = MI.getOperand(0).getReg();
``````````
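For readers skimming the diff: the new `foldCopyToAGPRRegSequence` helper performs the same rewrite as the inline code it replaces. Given a COPY of a reg_sequence of inline immediates into an AGPR tuple, the COPY is turned into a REG_SEQUENCE of per-lane `V_ACCVGPR_WRITE_B32_e64` defs. A hand-written MIR-style sketch (register numbers and classes are illustrative, not taken from the PR's tests):

```
; Before: the reg_sequence is built in VGPRs and then copied into an AGPR tuple,
; which copyPhysReg would later have to expand lane by lane.
%0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
%2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
%3:areg_64 = COPY %2

; After: the inline immediates are rematerialized directly into AGPRs and the
; COPY itself becomes the REG_SEQUENCE.
%4:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
%5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
%3:areg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
```

SGPR and repeated-AGPR inputs take the `VGPRCopies` path in the helper: a single intermediate VGPR copy is created per source and reused, so the later copy expansion does not emit an SGPR->VGPR->AGPR chain for every lane.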
https://github.com/llvm/llvm-project/pull/129002