[llvm] [MachinePipeliner] Make Recurrence MII More Accurate (PR #105475)
Michael Marjieh via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 00:35:18 PDT 2024
https://github.com/mmarjieh updated https://github.com/llvm/llvm-project/pull/105475
>From e94a5a46c80a71b123cb8b19b226561a0e559fb6 Mon Sep 17 00:00:00 2001
From: Michael Marjieh <michael.marjieh at mobileye.com>
Date: Tue, 20 Aug 2024 17:07:47 +0300
Subject: [PATCH] [MachinePipeliner] Make Recurrence MII More Accurate
The current RecMII calculation produces a value larger than it needs to be.
This patch refines the calculation so that the computed value is more accurate.
---
llvm/include/llvm/CodeGen/MachinePipeliner.h | 72 ++++++++++++++-----
llvm/lib/CodeGen/MachinePipeliner.cpp | 16 +++--
.../Thumb2/pipeliner-preserve-ties.mir | 2 +-
3 files changed, 64 insertions(+), 26 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 7fe5581faa183d..4c5b151c81f5e8 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
}
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
- bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
+ bool circuit(int V, int S, NodeSetType &NodeSets,
+ const SwingSchedulerDAG *DAG, bool HasBackedge = false);
void unblock(int U);
};
@@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
}
- bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
+ bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
+ bool isSucc = true) const;
/// The distance function, which indicates that operation V of iteration I
/// depends on operations U of iteration I-distance.
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
void computeNodeOrder(NodeSetType &NodeSets);
void checkValidNodeOrder(const NodeSetType &Circuits) const;
bool schedulePipeline(SMSchedule &Schedule);
- bool computeDelta(MachineInstr &MI, unsigned &Delta);
+ bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
MachineInstr *findDefInLoop(Register Reg);
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos, unsigned &NewBase,
@@ -339,24 +341,58 @@ class NodeSet {
using iterator = SetVector<SUnit *>::const_iterator;
NodeSet() = default;
- NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
- Latency = 0;
- for (const SUnit *Node : Nodes) {
- DenseMap<SUnit *, unsigned> SuccSUnitLatency;
- for (const SDep &Succ : Node->Succs) {
- auto SuccSUnit = Succ.getSUnit();
- if (!Nodes.count(SuccSUnit))
+ NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
+ : Nodes(S, E), HasRecurrence(true) {
+ // Calculate the latency of this node set.
+ // Example to demonstrate the calculation:
+ // Given: N0 -> N1 -> N2 -> N0
+ // Edges:
+ // (N0 -> N1, 3)
+ // (N0 -> N1, 5)
+ // (N1 -> N2, 2)
+ // (N2 -> N0, 1)
+ // The total latency which is a lower bound of the recurrence MII is the
+ // longest path from N0 back to N0 given only the edges of this node set.
+ // In this example, the latency is: 5 + 2 + 1 = 8.
+ //
+ // Map each SUnit in the circuit to its maximum distance from the source
+ // node, considering only the nodes of this node set.
+ DenseMap<SUnit *, unsigned> SUnitToDistance;
+ for (auto *Node : Nodes)
+ SUnitToDistance[Node] = 0;
+
+ for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
+ SUnit *U = Nodes[I - 1];
+ SUnit *V = Nodes[I % Nodes.size()];
+ for (const SDep &Succ : U->Succs) {
+ SUnit *SuccSUnit = Succ.getSUnit();
+ if (V != SuccSUnit)
continue;
- unsigned CurLatency = Succ.getLatency();
- unsigned MaxLatency = 0;
- if (SuccSUnitLatency.count(SuccSUnit))
- MaxLatency = SuccSUnitLatency[SuccSUnit];
- if (CurLatency > MaxLatency)
- SuccSUnitLatency[SuccSUnit] = CurLatency;
+ if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
+ SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
+ }
}
- for (auto SUnitLatency : SuccSUnitLatency)
- Latency += SUnitLatency.second;
}
+ // Handle a back-edge between a store and a load
+ SUnit *FirstNode = Nodes[0];
+ SUnit *LastNode = Nodes[Nodes.size() - 1];
+
+ if (LastNode->getInstr()->mayStore() && FirstNode->getInstr()->mayLoad()) {
+ for (auto &PI : LastNode->Preds) {
+ // If we have an order dep between a load and a store that is
+ // potentially loop carried, then a back-edge exists between the last
+ // node and the first node that isn't modeled in the DAG. Handle it
+ // manually as a latency-1 edge from the last node to the first node.
+ if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
+ !DAG->isLoopCarriedDep(LastNode, PI, false))
+ continue;
+ SUnitToDistance[FirstNode] =
+ std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
+ }
+ }
+
+ // The latency is the distance from the source node to itself.
+ Latency = SUnitToDistance[Nodes.front()];
}
bool insert(SUnit *SU) { return Nodes.insert(SU); }
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 416129ff837c33..34eaf211c17a30 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1706,6 +1706,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
+ const SwingSchedulerDAG *DAG,
bool HasBackedge) {
SUnit *SV = &SUnits[V];
bool F = false;
@@ -1719,12 +1720,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
continue;
if (W == S) {
if (!HasBackedge)
- NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
+ NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
F = true;
++NumPaths;
break;
- } else if (!Blocked.test(W)) {
- if (circuit(W, S, NodeSets,
+ }
+ if (!Blocked.test(W)) {
+ if (circuit(W, S, NodeSets, DAG,
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
F = true;
}
@@ -1767,9 +1769,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
Circuits Cir(SUnits, Topo);
// Create the adjacency structure.
Cir.createAdjacencyStructure(this);
- for (int i = 0, e = SUnits.size(); i != e; ++i) {
+ for (int I = 0, E = SUnits.size(); I != E; ++I) {
Cir.reset();
- Cir.circuit(i, i, NodeSets);
+ Cir.circuit(I, I, NodeSets, this);
}
// Change the dependences back so that we've created a DAG again.
@@ -2565,7 +2567,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
-bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
+bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
@@ -2719,7 +2721,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
- bool isSucc) {
+ bool isSucc) const {
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
return false;
diff --git a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
index 08f08c41917b1a..6983c6f97cc81f 100644
--- a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
+++ b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
@@ -222,8 +222,8 @@ body: |
; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
- ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
+ ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)
More information about the llvm-commits
mailing list