[mlir] [clang] [llvm] [AArch64][SME] Remove immediate argument restriction for svldr and svstr (PR #68565)
Sander de Smalen via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 13 05:48:57 PST 2023
================
@@ -4825,6 +4827,113 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
Mask);
}
+// Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA.
+// Case 1: If the vector number (vecnum) is an immediate in range, it gets
+// folded into the instruction
+// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
+// Case 2: If the vecnum is not an immediate, then it is used to modify the base
+// and tile slice registers
+// ldr(%tileslice, %ptr, %vecnum)
+// ->
+// %svl = rdsvl
+// %ptr2 = %ptr + %svl * %vecnum
+// %tileslice2 = %tileslice + %vecnum
+// ldr [%tileslice2, 0], [%ptr2, 0]
+// Case 3: If the vecnum is an immediate out of range, then the same is done as
+// case 2, but the base and slice registers are modified by the greatest
+// multiple of 15 lower than the vecnum and the remainder is folded into the
+// instruction. This means that successive loads and stores that are offset from
+// each other can share the same base and slice register updates.
+// ldr(%tileslice, %ptr, 22)
+// ldr(%tileslice, %ptr, 23)
+// ->
+// %svl = rdsvl
+// %ptr2 = %ptr + %svl * 15
+// %tileslice2 = %tileslice + 15
+// ldr [%tileslice2, 7], [%ptr2, 7]
+// ldr [%tileslice2, 8], [%ptr2, 8]
+// Case 4: If the vecnum is an add of an immediate, then the non-immediate
+// operand and the immediate can be folded into the instruction, like case 2.
+// ldr(%tileslice, %ptr, %vecnum + 7)
+// ldr(%tileslice, %ptr, %vecnum + 8)
+// ->
+// %svl = rdsvl
+// %ptr2 = %ptr + %svl * %vecnum
+// %tileslice2 = %tileslice + %vecnum
+// ldr [%tileslice2, 7], [%ptr2, 7]
+// ldr [%tileslice2, 8], [%ptr2, 8]
+// Case 5: The vecnum being an add of an immediate out of range is also handled,
+// in which case the same remainder logic as case 3 is used.
+SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
+ SDLoc DL(N);
+
+ SDValue TileSlice = N->getOperand(2);
+ SDValue Base = N->getOperand(3);
+ SDValue VecNum = N->getOperand(4);
+ int Addend = 0;
+
+ // If the vnum is an add, we can fold that add into the instruction if the
+ // operand is an immediate. The range check is performed below.
+ if (VecNum.getOpcode() == ISD::ADD) {
+ if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum.getOperand(1))) {
+ Addend = ImmNode->getSExtValue();
+ VecNum = VecNum.getOperand(0);
+ }
+ }
+
+ SDValue Remainder = DAG.getTargetConstant(Addend, DL, MVT::i32);
+
+ // true if the base and slice registers need to be modified
+ bool NeedsAdd = true;
+ auto ImmNode = dyn_cast<ConstantSDNode>(VecNum);
+ if (ImmNode || Addend != 0) {
+ int Imm = ImmNode ? ImmNode->getSExtValue() + Addend : Addend;
+ Remainder = DAG.getTargetConstant(Imm % 16, DL, MVT::i32);
+ if (Imm >= 0 && Imm <= 15) {
+ // If vnum is an immediate in range then we don't need to modify the tile
+ // slice and base register. We could also get here because Addend != 0 but
+ // vecnum is not an immediate, in which case we still want the base and
+ // slice register to be modified
+ NeedsAdd = !ImmNode;
----------------
sdesmalen-arm wrote:
Maybe it's me, but I find this logic a little tricky to follow. Specifically, the value of `NeedsAdd` here depends on the preceding control flow, which in turn depends on whether `ImmNode` is set.
It might be a bit simpler to follow if you progressively break down `VecNum` in two successive steps.
First, break it down into:
* A variable part (e.g. for `i + 17` that would be `i`)
* A constant (e.g. for `i + 17` that would be `17`)
Second, break down the constant `17` into:
* A base constant (for `17` that would be `15`)
* An immediate (for `17` that would be `2`)
When you then fold the base constant into the variable part, you can avoid the need for `NeedsAdd` because it can be inferred from whether there is a variable part, e.g.:
```
// First split VecNum into a "Variable" and "Constant" part.
int32_t ConstAddend = 0;
SDValue VariableAddend = VecNum;
if (VecNum.getOpcode() == ISD::ADD && isa<ConstantSDNode>(VecNum.getOperand(1))) {
ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
VariableAddend = VecNum.getOperand(0);
} else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
ConstAddend = ImmNode->getSExtValue();
VariableAddend = SDValue();
}
// Further try to split the constant into an immediate.
int32_t ImmAddend = ConstAddend % 16;
if (int32_t C = (ConstAddend - ImmAddend)) {
SDValue CVal = DAG.getConstant(C, DL, MVT::i32);
VariableAddend = VariableAddend ?
DAG.getNode(ISD::ADD, DL, MVT::i32, {VariableAddend, CVal}) : CVal;
}
if (VariableAddend) {
// Get the vector length that will be multiplied by VariableAddend
auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
...
}
```
https://github.com/llvm/llvm-project/pull/68565
More information about the cfe-commits
mailing list