[llvm] [WIP][AMDGPU] Improve the handling of `inreg` arguments (PR #133614)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 21:14:48 PDT 2025
================
@@ -2841,6 +2841,96 @@ void SITargetLowering::insertCopiesSplitCSR(
}
}
+/// Classes for spilling inreg VGPR arguments.
+///
+/// When an argument marked inreg is pushed to a VGPR, it indicates that the
+/// available SGPRs for argument passing have been exhausted. In such cases, it
+/// is preferable to pack multiple inreg arguments into individual lanes of
+/// VGPRs instead of assigning each directly to separate VGPRs.
+///
+/// Spilling involves two parts: the caller-side (call site) and the
+/// callee-side. Both must follow the same method for selecting registers and
+/// lanes, ensuring that an argument written at the call site matches exactly
+/// with the one read at the callee.
+
+/// The spilling class for the callee-side that lowers unpacking of incoming
+/// arguments by reading them back, one lane per argument, from a VGPR.
+class InregVPGRSpillerCallee {
+ CCState &State;
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+ Register SrcReg; // Register the lanes are read from; set by the first readLane.
+ SDValue SrcVal; // CopyFromReg of SrcReg; being set also marks SrcReg as chosen.
+ unsigned CurLane = 0; // Next lane to read; incremented on every readLane call.
+
+public:
+ InregVPGRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+ : State(State), DAG(DAG), MF(MF) {}
+
+ SDValue readLane(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+ if (SrcVal) { // A source register was already chosen by an earlier call.
+ State.DeallocateReg(Reg); // Release Reg in CCState; lanes come from SrcReg.
+ } else { // First call: Reg (as a live-in) becomes the source for all reads.
+ Reg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ SrcReg = Reg;
+ SrcVal = DAG.getCopyFromReg(Chain, SL, Reg, VT);
+ }
+ // According to the calling convention, VGPR0-31 are used for passing
+ // function arguments, whether they are regular arguments or 'inreg'
+ // function arguments that get spilled into VGPRs. Therefore, at most 32
+ // 'inreg' arguments can be spilled to VGPRs.
+ assert(CurLane < 32 && "more than expected VGPR inreg arguments");
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+ DAG.getRegister(SrcReg, VT),
+ DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+ }
+};
+
+/// The spilling class for the caller-side that lowers packing of call site
+/// arguments.
+class InregVPGRSpillerCallSite {
+ Register DstReg;
+ SDValue LastWrite;
+ unsigned CurLane = 0;
+
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+public:
+ InregVPGRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF)
+ : DAG(DAG), MF(MF) {}
+
+ void writeLane(const SDLoc &SL, Register &Reg, SDValue Val, EVT VT) {
+ if (DstReg.isValid())
+ Reg = DstReg;
+ else
+ DstReg = Reg;
+ // According to the calling convention, VGPR0-31 are used for passing
+ // function arguments, whether they are regular arguments or 'inreg'
+ // function arguments that get spilled into VGPRs. Therefore, there are at
+ // most 32 'inreg' arguments that can be spilled to VGPRs.
+ assert(CurLane < 32 && "more than expected VGPR inreg arguments");
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32), Val,
+ DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+ if (!LastWrite) {
+ Register VReg = MF.getRegInfo().getLiveInVirtReg(DstReg);
----------------
shiltian wrote:
The mismatch is from here: the first tied-input attached to the intrinsic. Basically we want the intrinsic to write to a specific register. The input `Reg` is actually an `MCRegister`. We need a virtual register here, so we first use `getLiveInVirtReg` to get the virtual register that corresponds to the `MCRegister`, and then use `getRegister` to get an `SDValue`. The generated MIR would be something like below:
```
%0:vgpr_32(s32) = COPY $vgpr0
...
%33:vgpr_32 = V_WRITELANE_B32 killed %28:sreg_32, 0, %0:vgpr_32(tied-def 0)(s32)
%34:vgpr_32 = V_WRITELANE_B32 killed %29:sreg_32, 1, %33:vgpr_32(tied-def 0)
```
This sequence is later lowered to something like below:
```
%34:vgpr_32 = COPY %0:vgpr_32(s32)
%34:vgpr_32 = V_WRITELANE_B32 $sgpr30, 0, %34:vgpr_32(tied-def 0)
%34:vgpr_32 = V_WRITELANE_B32 $sgpr31, 1, %34:vgpr_32(tied-def 0)
```
And further lowered to the following after RA:
```
renamable $vgpr1 = COPY renamable $vgpr0
renamable $vgpr1 = V_WRITELANE_B32 $sgpr30, 0, killed $vgpr1(tied-def 0)
renamable $vgpr1 = V_WRITELANE_B32 $sgpr31, 1, killed $vgpr1(tied-def 0)
```
As we can see here, we are supposed to write to `$vgpr0`, but because the `COPY` copies `$vgpr0` into `$vgpr1`, the `V_WRITELANE_B32` instructions end up writing to `$vgpr1` instead.
We probably should not use a virtual register as the operand of the intrinsic, but that will crash the two-address instruction pass, which expects the operand to be a vreg.
https://github.com/llvm/llvm-project/pull/133614
More information about the llvm-commits
mailing list