[llvm] [WIP][AMDGPU] Improve the handling of `inreg` arguments (PR #133614)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 29 21:02:59 PDT 2025
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/133614
When SGPRs available for `inreg` argument passing run out, the compiler silently
falls back to using whole VGPRs to pass those arguments. Ideally, instead of
using whole VGPRs, we should pack `inreg` arguments into individual lanes of
VGPRs.
This PR introduces `InregVGPRSpiller`, which handles this packing. It uses
`v_writelane` at the call site to place `inreg` arguments into specific VGPR
lanes, and then extracts them in the callee using `v_readlane`.
Fixes #130443 and #129071.
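To illustrate the intended lowering, here is a hand-written sketch (not actual compiler output; the particular SGPRs, VGPR, and lane numbers are only assumptions for the example):

  ; Call site: pack two overflowing inreg arguments into lanes of one VGPR.
  v_writelane_b32 v0, s4, 0        ; lane 0 carries the first packed argument
  v_writelane_b32 v0, s5, 1        ; lane 1 carries the second packed argument
  s_swappc_b64 s[30:31], s[16:17]  ; call the callee

  ; Callee: unpack the arguments from the lanes back into SGPRs.
  v_readlane_b32 s0, v0, 0
  v_readlane_b32 s1, v0, 1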
>From 7190fa6b16442e24a46a61357fc02f2e268070a3 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 30 Mar 2025 00:02:32 -0400
Subject: [PATCH] [WIP][AMDGPU] Improve the handling of `inreg` arguments
When SGPRs available for `inreg` argument passing run out, the compiler silently
falls back to using whole VGPRs to pass those arguments. Ideally, instead of
using whole VGPRs, we should pack `inreg` arguments into individual lanes of
VGPRs.
This PR introduces `InregVGPRSpiller`, which handles this packing. It uses
`v_writelane` at the call site to place `inreg` arguments into specific VGPR
lanes, and then extracts them in the callee using `v_readlane`.
Fixes #130443 and #129071.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 99 +++++++++++++++++++-
llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll | 28 ++++++
2 files changed, 125 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c8645850fe111..628ea2515482d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2841,6 +2841,86 @@ void SITargetLowering::insertCopiesSplitCSR(
}
}
+class InregVGPRSpiller {
+ CCState &State;
+ const unsigned WaveFrontSize;
+
+ Register CurReg;
+ unsigned CurLane = 0;
+
+protected:
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+ Register getCurReg() const { return CurReg; }
+ unsigned getCurLane() const { return CurLane; }
+
+  InregVGPRSpiller(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+ : State(State),
+ WaveFrontSize(MF.getSubtarget<GCNSubtarget>().getWavefrontSize()),
+ DAG(DAG), MF(MF) {}
+
+ void setReg(Register &Reg) {
+ if (CurReg.isValid()) {
+ State.DeallocateReg(Reg);
+ Reg = CurReg;
+ } else {
+ CurReg = Reg;
+ }
+ }
+
+  void forward() {
+    // All lanes of the current VGPR have been used, so wrap the lane index
+    // and reset the register; the next access will pick a fresh VGPR.
+    CurLane = (CurLane + 1) % WaveFrontSize;
+    if (CurLane == 0)
+      CurReg = 0;
+  }
+};
+
+class InregVGPRSpillerCallee final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+ SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+ setReg(Reg);
+
+ MF.addLiveIn(getCurReg(), &AMDGPU::VGPR_32RegClass);
+
+ // TODO: Do we need the chain here?
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+ DAG.getRegister(getCurReg(), VT),
+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+ forward();
+
+ return Res;
+ }
+};
+
+class InregVGPRSpillerCallSite final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
+                           CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+ SDValue write(const SDLoc &SL, Register &Reg, SDValue V, EVT VT) {
+ setReg(Reg);
+
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
+ DAG.getRegister(getCurReg(), VT), V,
+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+ forward();
+
+ return Res;
+ }
+};
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3043,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
+  InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3211,17 @@ SDValue SITargetLowering::LowerFormalArguments(
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
EVT ValVT = VA.getValVT();
- Reg = MF.addLiveIn(Reg, RC);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ SDValue Val;
+    // If an argument is marked inreg but ends up assigned to a VGPR, it means
+    // we have run out of SGPRs for argument passing. In that case, pack the
+    // inreg arguments into individual lanes of VGPRs rather than occupying
+    // whole VGPRs.
+ if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+ Val = Spiller.read(Chain, DL, Reg, VT);
+ } else {
+ Reg = MF.addLiveIn(Reg, RC);
+ Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ }
if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
@@ -3875,6 +3965,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MVT PtrVT = MVT::i32;
+  InregVGPRSpillerCallSite Spiller(DAG, MF, CCInfo);
+
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3904,6 +3996,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (VA.isRegLoc()) {
+ Register Reg = VA.getLocReg();
+ if (Outs[i].Flags.isInReg() && AMDGPU::VGPR_32RegClass.contains(Reg))
+ Arg = Spiller.write(DL, Reg, Arg, VA.getLocVT());
RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll
new file mode 100644
index 0000000000000..3ec4c86fa87ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+; arg3 would normally be passed in v0 and arg4 in v1. With this change, they should be packed into lanes of a single VGPR and extracted with v_readlane.
+define i32 @callee(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <2 x i32> inreg %arg2, i32 inreg %arg3, i32 inreg %arg4) {
+; CHECK-LABEL: callee:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readlane_b32 s0, v0, 1
+; CHECK-NEXT: v_readlane_b32 s1, v0, 0
+; CHECK-NEXT: s_add_i32 s1, s1, s0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %add = add i32 %arg3, %arg4
+ ret i32 %add
+}
+
+define amdgpu_kernel void @kernel(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p) {
+ %arg0 = load <8 x i32>, ptr %p0
+ %arg1 = load <8 x i32>, ptr %p1
+ %arg2 = load <2 x i32>, ptr %p2
+ %arg3 = load i32, ptr %p3
+ %arg4 = load i32, ptr %p4
+ %ret = call i32 @callee(<8 x i32> %arg0, <8 x i32> %arg1, <2 x i32> %arg2, i32 %arg3, i32 %arg4)
+ store i32 %ret, ptr %p
+ ret void
+}
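For reference, the CHECK lines of the new test can be regenerated with update_llc_test_checks.py in the usual way; the path to llc below assumes an in-tree build directory named `build` (adjust as needed):

  llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll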