[llvm] [ARM] Speedups for CombineBaseUpdate. (PR #129725)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 04:54:11 PST 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/129725
>From 0507796604bda9dc44dfa4f75a2610f6acfa3bd3 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 4 Mar 2025 15:23:30 +0000
Subject: [PATCH 1/2] [ARM] Speedups for CombineBaseUpdate.
This attempts to put limits onto CombineBaseUpdate for degenerate cases like
127477. The biggest change is to add a limit to the number of base updates to
check in CombineBaseUpdate. 64 is hopefully plenty high enough for most runtime
unrolled loops to generate postinc where they are beneficial.
It also moves the check for isValidBaseUpdate later so that it only happens if
we will generate a valid instruction. The 1024 limit to hasPredecessorHelper
comes from the X86 backend, which uses the same limit.
I haven't added a test case as it would need to be very big and my attempts at
generating a smaller version did not show anything useful.
Fixes #127477.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 65 ++++++++++++++-----------
1 file changed, 36 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e3dc337bd0843..b5e65d1424644 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -149,6 +149,11 @@ MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
cl::desc("Maximum interleave factor for MVE VLDn to generate."),
cl::init(2));
+cl::opt<unsigned> ArmMaxBaseUpdatesToCheck(
+ "arm-max-base-updates-to-check", cl::Hidden,
+ cl::desc("Maximum number of base-updates to check generating postindex."),
+ cl::init(64));
+
/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
constexpr MVT FlagsVT = MVT::i32;
@@ -15865,6 +15870,21 @@ struct BaseUpdateUser {
unsigned ConstInc;
};
+static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
+ // Check that the add is independent of the load/store.
+ // Otherwise, folding it would create a cycle. Search through Addr
+ // as well, since the User may not be a direct user of Addr and
+ // only share a base pointer.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, 1024) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist, 1024))
+ return false;
+ return true;
+}
+
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
struct BaseUpdateUser &User,
bool SimpleConstIncOnly,
@@ -16066,6 +16086,9 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
if (SimpleConstIncOnly && User.ConstInc != NumBytes)
return false;
+ if (!isValidBaseUpdate(N, User.N))
+ return false;
+
// OK, we found an ADD we can fold into the base update.
// Now, create a _UPD node, taking care of not breaking alignment.
@@ -16214,21 +16237,6 @@ static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
}
}
-static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
- // Check that the add is independent of the load/store.
- // Otherwise, folding it would create a cycle. Search through Addr
- // as well, since the User may not be a direct user of Addr and
- // only share a base pointer.
- SmallPtrSet<const SDNode *, 32> Visited;
- SmallVector<const SDNode *, 16> Worklist;
- Worklist.push_back(N);
- Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist))
- return false;
- return true;
-}
-
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
@@ -16242,6 +16250,10 @@ static SDValue CombineBaseUpdate(SDNode *N,
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
+ // Limit the number of possible base-updates we look at to prevent degenerate
+ // cases.
+ unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
+
SDValue Addr = N->getOperand(AddrOpIdx);
SmallVector<BaseUpdateUser, 8> BaseUpdates;
@@ -16256,8 +16268,11 @@ static SDValue CombineBaseUpdate(SDNode *N,
unsigned ConstInc =
getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
- if (ConstInc || User->getOpcode() == ISD::ADD)
+ if (ConstInc || User->getOpcode() == ISD::ADD) {
BaseUpdates.push_back({User, Inc, ConstInc});
+ if (BaseUpdates.size() > MaxBaseUpdates)
+ break;
+ }
}
// If the address is a constant pointer increment itself, find
@@ -16284,27 +16299,19 @@ static SDValue CombineBaseUpdate(SDNode *N,
unsigned NewConstInc = UserOffset - Offset;
SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
BaseUpdates.push_back({User, NewInc, NewConstInc});
+ if (BaseUpdates.size() > MaxBaseUpdates)
+ break;
}
}
// Try to fold the load/store with an update that matches memory
// access size. This should work well for sequential loads.
- //
- // Filter out invalid updates as well.
unsigned NumValidUpd = BaseUpdates.size();
- for (unsigned I = 0; I < NumValidUpd;) {
+ for (unsigned I = 0; I < NumValidUpd; I++) {
BaseUpdateUser &User = BaseUpdates[I];
- if (!isValidBaseUpdate(N, User.N)) {
- --NumValidUpd;
- std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
- continue;
- }
-
if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
return SDValue();
- ++I;
}
- BaseUpdates.resize(NumValidUpd);
// Try to fold with other users. Non-constant updates are considered
// first, and constant updates are sorted to not break a sequence of
@@ -16360,8 +16367,8 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, 1024) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist, 1024))
continue;
// Find the new opcode for the updating load/store.
>From 5877ba040e8c25facd4eb449e121c3849d57c353 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 5 Mar 2025 12:53:35 +0000
Subject: [PATCH 2/2] Address comments
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b5e65d1424644..1fb68fa85f7b6 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15879,8 +15879,9 @@ static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(N);
Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist, 1024) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist, 1024))
+ const unsigned MaxSteps = 1024;
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
return false;
return true;
}
@@ -16270,7 +16271,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
if (ConstInc || User->getOpcode() == ISD::ADD) {
BaseUpdates.push_back({User, Inc, ConstInc});
- if (BaseUpdates.size() > MaxBaseUpdates)
+ if (BaseUpdates.size() >= MaxBaseUpdates)
break;
}
}
@@ -16299,7 +16300,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
unsigned NewConstInc = UserOffset - Offset;
SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
BaseUpdates.push_back({User, NewInc, NewConstInc});
- if (BaseUpdates.size() > MaxBaseUpdates)
+ if (BaseUpdates.size() >= MaxBaseUpdates)
break;
}
}
@@ -16367,8 +16368,9 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist, 1024) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist, 1024))
+ const unsigned MaxSteps = 1024;
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
continue;
// Find the new opcode for the updating load/store.
More information about the llvm-commits
mailing list