[llvm-branch-commits] [llvm] [AMDGPU] Make AMDGPURewriteAGPRCopyMFMA aware of subreg reload (PR #174998)
Christudasan Devadasan via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 12 05:29:22 PST 2026
https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/174998
From b180da617cad7cd5ebfb0db442b9a1ad00f78a2f Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Wed, 7 Jan 2026 10:48:26 +0000
Subject: [PATCH 1/2] [AMDGPU] Make AMDGPURewriteAGPRCopyMFMA aware of subreg
reload
The AMDGPURewriteAGPRCopyMFMA pass is currently not subreg-aware.
In particular, the logic that optimizes spills into COPY
instructions assumes full register reloads. This becomes
problematic when the reload instruction only partially restores
a tuple register. This patch introduces the necessary changes
to make this pass subreg-aware, in preparation for a future patch
that implements subreg reload during RA.
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 3 ++
llvm/lib/CodeGen/TargetRegisterInfo.cpp | 10 +++++
.../AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 42 ++++++++++++++++++-
3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 35b14e8b8fd30..5c35cd338feb6 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -430,6 +430,9 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
return SubRegIndexLaneMasks[SubIdx];
}
+ /// Return the subreg index whose lane mask equals \p LaneMask, or 0 if none.
+ unsigned getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const;
+
/// Try to find one or more subregister indexes to cover \p LaneMask.
///
/// If this is possible, returns true and appends the best matching set of
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index cffb3ed1b8779..2b3924e368ccd 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -546,6 +546,16 @@ TargetRegisterInfo::getRegSizeInBits(Register Reg,
return getRegSizeInBits(*RC);
}
+unsigned
+TargetRegisterInfo::getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const {
+ for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+ if (getSubRegIndexLaneMask(Idx) == LaneMask)
+ return Idx;
+ }
+
+ return 0 /*NoSubRegister*/;
+}
+
bool TargetRegisterInfo::getCoveringSubRegIndexes(
const TargetRegisterClass *RC, LaneBitmask LaneMask,
SmallVectorImpl<unsigned> &NeededIndexes) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index ffbb1c183ca9e..b015198c02e8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -112,6 +112,17 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
+ /// Derives the subregister index from a spill reload pseudo instruction by
+ /// constructing a lane mask that covers the reloaded portion and finding
+ /// the matching subregister.
+ ///
+ /// \p MI the spill reload pseudo instruction containing the offset and
+ /// spill size info
+ /// \p VReg the original virtual register being spilled (mostly a tuple
+ /// register)
+ /// \return the subregister index corresponding to the reload portion.
+ unsigned getSubRegFromReload(MachineInstr &MI, Register VReg) const;
+
/// Replace spill instruction \p SpillMI which loads/stores from/to \p SpillFI
/// with a COPY to the replacement register value \p VReg.
void replaceSpillWithCopyToVReg(MachineInstr &SpillMI, int SpillFI,
@@ -422,6 +433,33 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
return MadeChange;
}
+unsigned
+AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
+ Register Reg) const {
+ unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
+ unsigned SubReg = 0;
+ // SubReg accesses for the tuple registers are of interest here.
+ // Note: We don't support 16-bit subreg reloads. If that assuption is
+ // changed in the future, this function should be revised.
+ if (NumRegs == 1)
+ return SubReg;
+
+ unsigned NumSpilledRegs = TII.getNumSubRegsForSpillOp(MI);
+ // Skip if the entire tuple is reloaded.
+ if (NumRegs == NumSpilledRegs)
+ return SubReg;
+
+ // Construct the covering lanes for the reloaded portion.
+ unsigned SubRegIdx =
+ TII.getNamedOperand(MI, AMDGPU::OpName::offset)->getImm() / 4;
+ // Subreg lane masks are maintained in terms of regunits and each 32-bit
+ // register consists of two regunits.
+ uint64_t Lanes = (1ULL << NumSpilledRegs * 2) - 1;
+ LaneBitmask CoveringLanes = LaneBitmask(Lanes << SubRegIdx * 2);
+ SubReg = TRI.getSubRegIdxFromLaneMask(CoveringLanes);
+ return SubReg;
+}
+
void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
MachineInstr &SpillMI, int SpillFI, Register VReg) const {
const DebugLoc &DL = SpillMI.getDebugLoc();
@@ -431,9 +469,11 @@ void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY), VReg)
.add(SpillMI.getOperand(0));
} else {
+ // Identify the subregs if SpillMI is really a subreg-load.
+ unsigned SubReg = getSubRegFromReload(SpillMI, VReg);
NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY))
.add(SpillMI.getOperand(0))
- .addReg(VReg);
+ .addReg(VReg, 0, SubReg);
}
LIS.ReplaceMachineInstrInMaps(SpillMI, *NewCopy);
From 0ebaff5fb69b54476549043327fb6d92b83e2cfb Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 12 Jan 2026 12:59:34 +0000
Subject: [PATCH 2/2] Incorporate review suggestions: use AMDGPU::NoSubRegister, fix comment typo.
---
llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index b015198c02e8d..de329a4083ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -437,9 +437,9 @@ unsigned
AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
Register Reg) const {
unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
- unsigned SubReg = 0;
+ unsigned SubReg = AMDGPU::NoSubRegister;
// SubReg accesses for the tuple registers are of interest here.
- // Note: We don't support 16-bit subreg reloads. If that assuption is
+ // Note: We don't support 16-bit subreg reloads. If that assumption is
// changed in the future, this function should be revised.
if (NumRegs == 1)
return SubReg;
More information about the llvm-branch-commits
mailing list