[mlir] [clang] [llvm] [AArch64][SME] Remove immediate argument restriction for svldr and svstr (PR #68565)

Mon Nov 13 05:48:57 PST 2023

================
@@ -4825,6 +4827,113 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
                      Mask);
 }
 
+// Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA.
+// Case 1: If the vector number (vecnum) is an immediate in range, it gets
+// folded into the instruction
+//    ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
+// Case 2: If the vecnum is not an immediate, then it is used to modify the base
+// and tile slice registers
+//    ldr(%tileslice, %ptr, %vecnum)
+//    ->
+//    %svl = rdsvl
+//    %ptr2 = %ptr + %svl * %vecnum
+//    %tileslice2 = %tileslice + %vecnum
+//    ldr [%tileslice2, 0], [%ptr2, 0]
+// Case 3: If the vecnum is an immediate out of range, then the same is done as
+// case 2, but the base and slice registers are modified by the greatest
+// multiple of 15 lower than the vecnum and the remainder is folded into the
+// instruction. This means that successive loads and stores that are offset from
+// each other can share the same base and slice register updates.
+//    ldr(%tileslice, %ptr, 22)
+//    ldr(%tileslice, %ptr, 23)
+//    ->
+//    %svl = rdsvl
+//    %ptr2 = %ptr + %svl * 15
+//    %tileslice2 = %tileslice + 15
+//    ldr [%tileslice2, 7], [%ptr2, 7]
+//    ldr [%tileslice2, 8], [%ptr2, 8]
+// Case 4: If the vecnum is an add of an immediate, then the non-immediate
+// operand is used to modify the base and tile slice registers, like case 2,
+// and the immediate is folded into the instruction.
+//    ldr(%tileslice, %ptr, %vecnum + 7)
+//    ldr(%tileslice, %ptr, %vecnum + 8)
+//    ->
+//    %svl = rdsvl
+//    %ptr2 = %ptr + %svl * %vecnum
+//    %tileslice2 = %tileslice + %vecnum
+//    ldr [%tileslice2, 7], [%ptr2, 7]
+//    ldr [%tileslice2, 8], [%ptr2, 8]
+// Case 5: The vecnum being an add of an immediate out of range is also handled,
+// in which case the same remainder logic as case 3 is used.
+SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
+  SDLoc DL(N);
+
+  SDValue TileSlice = N->getOperand(2);
+  SDValue Base = N->getOperand(3);
+  SDValue VecNum = N->getOperand(4);
+  int Addend = 0;
+
+  // If the vnum is an add, we can fold that add into the instruction if the
+  // operand is an immediate. The range check is performed below.
+  if (VecNum.getOpcode() == ISD::ADD) {
+    if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum.getOperand(1))) {
+      Addend = ImmNode->getSExtValue();
+      VecNum = VecNum.getOperand(0);
+    }
+  }
+
+  SDValue Remainder = DAG.getTargetConstant(Addend, DL, MVT::i32);
+
+  // true if the base and slice registers need to be modified
+  bool NeedsAdd = true;
+  auto ImmNode = dyn_cast<ConstantSDNode>(VecNum);
+  if (ImmNode || Addend != 0) {
+    int Imm = ImmNode ? ImmNode->getSExtValue() + Addend : Addend;
+    Remainder = DAG.getTargetConstant(Imm % 16, DL, MVT::i32);
+    if (Imm >= 0 && Imm <= 15) {
+      // If vnum is an immediate in range then we don't need to modify the tile
+      // slice and base register. We could also get here because Addend != 0 but
+      // vecnum is not an immediate, in which case we still want the base and
+      // slice register to be modified
+      NeedsAdd = !ImmNode;
----------------
sdesmalen-arm wrote:

Maybe it's me, but I find this logic a little tricky to follow. Specifically, the value of `NeedsAdd` here depends on the preceding control flow, which in turn depends on whether `ImmNode` is non-null.

It might be a bit simpler to follow if you progressively break down VecNum in two successive steps.

First, break it down into:
* A variable part (e.g. for `i + 17` that would be `i`)
* A constant (e.g. for `i + 17` that would be `17`)

Second, break down the constant `17` into:
* A base constant (for `17` that would be `15`)
* An immediate (for `17` that would be `2`)

When you then fold the base constant into the variable part, you can avoid the need for `NeedsAdd` because it can be inferred from whether there is a variable part, e.g.:

```
// First split VecNum into a "Variable" and "Constant" part.
int32_t ConstAddend = 0;
SDValue VariableAddend = VecNum;
if (VecNum.getOpcode() == ISD::ADD && isa<ConstantSDNode>(VecNum.getOperand(1))) {
  ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
  VariableAddend = VecNum.getOperand(0);
} else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
  ConstAddend = ImmNode->getSExtValue();
  VariableAddend = SDValue();
}

// Further try to split the constant into an immediate.
int32_t ImmAddend = ConstAddend % 16;
if (int32_t C = (ConstAddend - ImmAddend)) {
  SDValue CVal = DAG.getConstant(C, DL, MVT::i32);
  VariableAddend = VariableAddend ?
    DAG.getNode(ISD::ADD, DL, MVT::i32, {VariableAddend, CVal}) : CVal;
}

if (VariableAddend) {
  // Get the vector length that will be multiplied by VariableAddend
  auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
  ...
}
```

https://github.com/llvm/llvm-project/pull/68565