[llvm-branch-commits] [llvm] [AMDGPU] Make AMDGPURewriteAGPRCopyMFMA aware of subreg reload (PR #174998)

Mon Jan 12 05:29:22 PST 2026

https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/174998

>From b180da617cad7cd5ebfb0db442b9a1ad00f78a2f Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Wed, 7 Jan 2026 10:48:26 +0000
Subject: [PATCH 1/2] [AMDGPU] Make AMDGPURewriteAGPRCopyMFMA aware of subreg
 reload

The AMDGPURewriteAGPRCopyMFMA pass is currently not subreg-aware.
In particular, the logic that optimizes spills into COPY
instructions assumes full-register reloads. This becomes
problematic when a reload instruction restores only part of
a tuple register. This patch makes the pass subreg-aware, in
preparation for a future patch that implements subreg reloads
during register allocation.
---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |  3 ++
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       | 10 +++++
 .../AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp      | 42 ++++++++++++++++++-
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 35b14e8b8fd30..5c35cd338feb6 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -430,6 +430,9 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
     return SubRegIndexLaneMasks[SubIdx];
   }
 
+  /// Return the subreg index whose lane mask equals \p LaneMask, or 0 if none.
+  unsigned getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const;
+
   /// Try to find one or more subregister indexes to cover \p LaneMask.
   ///
   /// If this is possible, returns true and appends the best matching set of
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index cffb3ed1b8779..2b3924e368ccd 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -546,6 +546,16 @@ TargetRegisterInfo::getRegSizeInBits(Register Reg,
   return getRegSizeInBits(*RC);
 }
 
+unsigned
+TargetRegisterInfo::getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const {
+  for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+    if (getSubRegIndexLaneMask(Idx) == LaneMask)
+      return Idx;
+  }
+
+  return 0 /*NoSubRegister*/;
+}
+
 bool TargetRegisterInfo::getCoveringSubRegIndexes(
     const TargetRegisterClass *RC, LaneBitmask LaneMask,
     SmallVectorImpl<unsigned> &NeededIndexes) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index ffbb1c183ca9e..b015198c02e8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -112,6 +112,17 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
   bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
 
+  /// Derives the subregister index from a spill reload pseudo instruction by
+  /// constructing a lane mask that covers the reloaded portion and finding
+  /// the matching subregister.
+  ///
+  /// \p MI the spill reload pseudo instruction containing the offset and
+  /// spill size info
+  /// \p VReg the original virtual register being spilled (typically a tuple
+  /// register)
+  /// \return the subreg index of the reloaded portion; 0 for a full reload.
+  unsigned getSubRegFromReload(MachineInstr &MI, Register VReg) const;
+
   /// Replace spill instruction \p SpillMI which loads/stores from/to \p SpillFI
   /// with a COPY to the replacement register value \p VReg.
   void replaceSpillWithCopyToVReg(MachineInstr &SpillMI, int SpillFI,
@@ -422,6 +433,33 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
   return MadeChange;
 }
 
+unsigned
+AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
+                                                   Register Reg) const {
+  unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
+  unsigned SubReg = 0;
+  // SubReg accesses for the tuple registers are of interest here.
+  // Note: We don't support 16-bit subreg reloads. If that assuption is
+  // changed in the future, this function should be revised.
+  if (NumRegs == 1)
+    return SubReg;
+
+  unsigned NumSpilledRegs = TII.getNumSubRegsForSpillOp(MI);
+  // Skip if the entire tuple is reloaded.
+  if (NumRegs == NumSpilledRegs)
+    return SubReg;
+
+  // Construct the covering lanes for the reloaded portion.
+  unsigned SubRegIdx =
+      TII.getNamedOperand(MI, AMDGPU::OpName::offset)->getImm() / 4;
+  // Subreg lane masks are maintained in terms of regunits and each 32-bit
+  // register consists of two regunits.
+  uint64_t Lanes = (1ULL << NumSpilledRegs * 2) - 1;
+  LaneBitmask CoveringLanes = LaneBitmask(Lanes << SubRegIdx * 2);
+  SubReg = TRI.getSubRegIdxFromLaneMask(CoveringLanes);
+  return SubReg;
+}
+
 void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
     MachineInstr &SpillMI, int SpillFI, Register VReg) const {
   const DebugLoc &DL = SpillMI.getDebugLoc();
@@ -431,9 +469,11 @@ void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
     NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY), VReg)
                   .add(SpillMI.getOperand(0));
   } else {
+    // Identify the subregs if SpillMI is really a subreg-load.
+    unsigned SubReg = getSubRegFromReload(SpillMI, VReg);
     NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY))
                   .add(SpillMI.getOperand(0))
-                  .addReg(VReg);
+                  .addReg(VReg, 0, SubReg);
   }
 
   LIS.ReplaceMachineInstrInMaps(SpillMI, *NewCopy);

>From 0ebaff5fb69b54476549043327fb6d92b83e2cfb Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 12 Jan 2026 12:59:34 +0000
Subject: [PATCH 2/2] suggestions incorporated.

---
 llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index b015198c02e8d..de329a4083ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -437,9 +437,9 @@ unsigned
 AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
                                                    Register Reg) const {
   unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
-  unsigned SubReg = 0;
+  unsigned SubReg = AMDGPU::NoSubRegister;
   // SubReg accesses for the tuple registers are of interest here.
-  // Note: We don't support 16-bit subreg reloads. If that assuption is
+  // Note: We don't support 16-bit subreg reloads. If that assumption is
   // changed in the future, this function should be revised.
   if (NumRegs == 1)
     return SubReg;