[llvm] [WIP][AMDGPU] Improve the handling of `inreg` arguments (PR #133614)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 22:23:35 PDT 2025
================
@@ -2841,6 +2841,91 @@ void SITargetLowering::insertCopiesSplitCSR(
}
}
+/// Classes for spilling inreg VGPR arguments.
+///
+/// When an argument marked inreg is pushed to a VGPR, it indicates that the
+/// available SGPRs for argument passing have been exhausted. In such cases, it
+/// is preferable to pack multiple inreg arguments into individual lanes of
+/// VGPRs instead of assigning each directly to separate VGPRs.
+///
+/// Spilling involves two parts: the caller-side (call site) and the
+/// callee-side. Both must follow the same method for selecting registers and
+/// lanes, ensuring that an argument written at the call site matches exactly
+/// with the one read at the callee.
+///
+/// The spilling class for the callee-side that lowers the unpacking of
+/// incoming 'inreg' arguments that were spilled into VGPR lanes.
+// NOTE(review): "VPGR" in the class name looks like a typo for "VGPR"
+// (the header comment says "inreg VGPR arguments") — confirm and rename
+// consistently with InregVPGRSpillerCallSite if so.
+class InregVPGRSpillerCallee {
+ CCState &State;
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+ // The single VGPR whose lanes hold all packed 'inreg' arguments.
+ Register SrcReg;
+ // CopyFromReg of SrcReg, materialized lazily on the first read().
+ SDValue SrcVal;
+ // Next lane to read from; advances by one per read() call, so the
+ // callee consumes lanes in the same order the call site wrote them.
+ unsigned CurLane = 0;
+
+public:
+ InregVPGRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+ : State(State), DAG(DAG), MF(MF) {}
+
+ /// Read the next packed 'inreg' argument out of a lane of the spill
+ /// VGPR via the amdgcn_readlane intrinsic.
+ ///
+ /// \p Reg is in/out: on the first call it is registered as a live-in
+ /// of \p MF and rewritten to the resulting virtual register; on later
+ /// calls the extra VGPR that calling-convention lowering assigned is
+ /// returned to \p State (DeallocateReg) because every argument shares
+ /// SrcReg's lanes instead of occupying its own register.
+ SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+ if (SrcVal) {
+ State.DeallocateReg(Reg);
+ } else {
+ // First argument: claim the VGPR as a live-in and cache the copy.
+ Reg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ SrcReg = Reg;
+ SrcVal = DAG.getCopyFromReg(Chain, SL, Reg, VT);
+ }
+ // According to the calling convention, only SGPR4–SGPR29 should be used for
+ // passing 'inreg' function arguments. Therefore, the number of 'inreg' VGPR
+ // arguments must not exceed 26.
+ assert(CurLane < 26 && "more than expected VGPR inreg arguments");
+ // Build: readlane(SrcReg, CurLane) — no chain, the read is pure.
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+ DAG.getRegister(SrcReg, VT),
+ DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+ }
+};
+
+/// The spilling class for the caller-side that lowers packing of call site
+/// arguments.
+class InregVPGRSpillerCallSite {
+ CCState &State;
+
+ Register DstReg;
+ SDValue Glue;
+ unsigned CurLane = 0;
+
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+public:
+ InregVPGRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
+ CCState &State)
+ : State(State), DAG(DAG), MF(MF) {}
+
+ std::pair<SDValue, SDValue> write(SDValue Chain, const SDLoc &SL,
+ Register &Reg, SDValue Val, SDValue InGlue,
+ EVT VT) {
+ if (DstReg.isValid()) {
+ Reg = DstReg;
+ } else {
+ DstReg = Reg;
+ Glue = DAG.getCopyToReg(Chain, SL, Reg, Val, InGlue).getValue(1);
+ }
+ // According to the calling convention, only SGPR4–SGPR29 should be used for
+ // passing 'inreg' function arguments. Therefore, the number of 'inreg' VGPR
+ // arguments must not exceed 26.
+ assert(CurLane < 26 && "more than expected VGPR inreg arguments");
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
+ DAG.getRegister(DstReg, VT), Val,
+ DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+ return {DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands), Glue};
----------------
shiltian wrote:
@arsenm I'm not sure how to handle this part correctly. Basically we want to return something similar to `DAG.getCopyToReg`, because in the normal case it has the following handling:
```
Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
InGlue = Chain.getValue(1);
```
I tried something like the following:
```
SDValue write(SDValue Chain, const SDLoc &SL,
Register &Reg, SDValue Val, SDValue InGlue,
EVT VT) {
if (DstReg.isValid()) {
Reg = DstReg;
} else {
DstReg = Reg;
DstVal = DAG.getCopyToReg(Chain, SL, Reg, Val, InGlue);
}
// According to the calling convention, only SGPR4–SGPR29 should be used for
// passing 'inreg' function arguments. Therefore, the number of 'inreg' VGPR
// arguments must not exceed 26.
assert(CurLane < 26 && "more than expected VGPR inreg arguments");
SmallVector<SDValue, 4> Operands{
DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
DAG.getRegister(DstReg, VT), Val,
DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
return DstVal;
}
```
And then:
```
if (ArgIdx >= NumSpecialInputs &&
Outs[ArgIdx - NumSpecialInputs].Flags.isInReg() &&
AMDGPU::VGPR_32RegClass.contains(Reg)) {
Chain = Spiller.write(Chain, DL, Reg, Val, InGlue,
ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
} else {
Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
}
InGlue = Chain.getValue(1);
```
This doesn't seem to work: the writelane node's result is unused and it carries no chain or glue, so it has no observable "effect" from the DAG's point of view and is simply dropped as dead code, as shown in the newly added test.
The code currently in the PR doesn't work either, because reusing the same glue on different values messes up the scheduling.
https://github.com/llvm/llvm-project/pull/133614
More information about the llvm-commits
mailing list