[llvm] [MachinePipeliner] Make Recurrence MII More Accurate (PR #105475)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 26 07:35:12 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: Michael Marjieh (mmarjieh)
<details>
<summary>Changes</summary>
The current RecMII calculation produces a value that is larger than it needs to be. This patch refines the calculation.
---
Full diff: https://github.com/llvm/llvm-project/pull/105475.diff
4 Files Affected:
- (modified) llvm/include/llvm/CodeGen/MachinePipeliner.h (+52-18)
- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+19-11)
- (added) llvm/test/CodeGen/PowerPC/sms-recmii.ll (+48)
- (modified) llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir (+1-1)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 7fe5581faa183d..23d3181e2a0ae7 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
}
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
- bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
+ bool circuit(int V, int S, NodeSetType &NodeSets,
+ const SwingSchedulerDAG *DAG, bool HasBackedge = false);
void unblock(int U);
};
@@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
}
- bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
+ bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
+ bool isSucc = true) const;
/// The distance function, which indicates that operation V of iteration I
/// depends on operations U of iteration I-distance.
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
void computeNodeOrder(NodeSetType &NodeSets);
void checkValidNodeOrder(const NodeSetType &Circuits) const;
bool schedulePipeline(SMSchedule &Schedule);
- bool computeDelta(MachineInstr &MI, unsigned &Delta);
+ bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
MachineInstr *findDefInLoop(Register Reg);
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos, unsigned &NewBase,
@@ -339,24 +341,56 @@ class NodeSet {
using iterator = SetVector<SUnit *>::const_iterator;
NodeSet() = default;
- NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
- Latency = 0;
- for (const SUnit *Node : Nodes) {
- DenseMap<SUnit *, unsigned> SuccSUnitLatency;
- for (const SDep &Succ : Node->Succs) {
- auto SuccSUnit = Succ.getSUnit();
- if (!Nodes.count(SuccSUnit))
+ NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
+ : Nodes(S, E), HasRecurrence(true) {
+ // Calculate the latency of this node set.
+ // Example to demonstrate the calculation:
+ // Given: N0 -> N1 -> N2 -> N0
+ // Edges:
+ // (N0 -> N1, 3)
+ // (N0 -> N1, 5)
+ // (N1 -> N2, 2)
+ // (N2 -> N0, 1)
+ // The total latency which is a lower bound of the recurrence MII is the
+ // longest path from N0 back to N0 given only the edges of this node set.
+ // In this example, the latency is: 5 + 2 + 1 = 8.
+ //
+ // Hold a map from each SUnit in the circuit to the maximum distance from the
+ // source node, considering only the edges of this node set.
+ DenseMap<SUnit *, unsigned> SUnitToDistance;
+ for (auto *Node : Nodes)
+ SUnitToDistance[Node] = 0;
+
+ for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
+ SUnit *U = Nodes[I - 1];
+ SUnit *V = Nodes[I % Nodes.size()];
+ for (const SDep &Succ : U->Succs) {
+ SUnit *SuccSUnit = Succ.getSUnit();
+ if (V != SuccSUnit)
continue;
- unsigned CurLatency = Succ.getLatency();
- unsigned MaxLatency = 0;
- if (SuccSUnitLatency.count(SuccSUnit))
- MaxLatency = SuccSUnitLatency[SuccSUnit];
- if (CurLatency > MaxLatency)
- SuccSUnitLatency[SuccSUnit] = CurLatency;
+ if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
+ SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
+ }
}
- for (auto SUnitLatency : SuccSUnitLatency)
- Latency += SUnitLatency.second;
}
+ // Handle a back-edge between a store and a load
+ SUnit *FirstNode = Nodes[0];
+ SUnit *LastNode = Nodes[Nodes.size() - 1];
+
+ for (auto &PI : LastNode->Preds) {
+ // If we have an order dep that is potentially loop carried then a
+ // back-edge exists between the last node and the first node that isn't
+ // modeled in the DAG. Handle it manually by adding 1 to the distance of
+ // the last node.
+ if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
+ !DAG->isLoopCarriedDep(LastNode, PI, false))
+ continue;
+ SUnitToDistance[FirstNode] =
+ std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
+ }
+
+ // The latency is the distance from the source node to itself.
+ Latency = SUnitToDistance[Nodes.front()];
}
bool insert(SUnit *SU) { return Nodes.insert(SU); }
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 416129ff837c33..3f22e78c53f04b 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1680,13 +1680,19 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
Added.set(N);
}
}
- // A chain edge between a store and a load is treated as a back-edge in the
- // adjacency matrix.
+ // Order edges of the following:
+ // 1. Load -> Store
+ // 2. Store -> Load
+ // are treated as a back-edge in the adjacency matrix.
+ // Store after store was handled above.
for (auto &PI : SUnits[i].Preds) {
- if (!SUnits[i].getInstr()->mayStore() ||
+ if (PI.getKind() != SDep::Order ||
!DAG->isLoopCarriedDep(&SUnits[i], PI, false))
continue;
- if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
+ if ((SUnits[i].getInstr()->mayLoad() &&
+ PI.getSUnit()->getInstr()->mayStore()) ||
+ (SUnits[i].getInstr()->mayStore() &&
+ PI.getSUnit()->getInstr()->mayLoad())) {
int N = PI.getSUnit()->NodeNum;
if (!Added.test(N)) {
AdjK[i].push_back(N);
@@ -1706,6 +1712,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
+ const SwingSchedulerDAG *DAG,
bool HasBackedge) {
SUnit *SV = &SUnits[V];
bool F = false;
@@ -1719,12 +1726,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
continue;
if (W == S) {
if (!HasBackedge)
- NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
+ NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
F = true;
++NumPaths;
break;
- } else if (!Blocked.test(W)) {
- if (circuit(W, S, NodeSets,
+ }
+ if (!Blocked.test(W)) {
+ if (circuit(W, S, NodeSets, DAG,
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
F = true;
}
@@ -1767,9 +1775,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
Circuits Cir(SUnits, Topo);
// Create the adjacency structure.
Cir.createAdjacencyStructure(this);
- for (int i = 0, e = SUnits.size(); i != e; ++i) {
+ for (int I = 0, E = SUnits.size(); I != E; ++I) {
Cir.reset();
- Cir.circuit(i, i, NodeSets);
+ Cir.circuit(I, I, NodeSets, this);
}
// Change the dependences back so that we've created a DAG again.
@@ -2565,7 +2573,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
-bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
+bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
@@ -2719,7 +2727,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
- bool isSucc) {
+ bool isSucc) const {
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
return false;
diff --git a/llvm/test/CodeGen/PowerPC/sms-recmii.ll b/llvm/test/CodeGen/PowerPC/sms-recmii.ll
new file mode 100644
index 00000000000000..82b1c9e82fcb78
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-recmii.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
+; RUN: -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s
+
+; Test that the pipeliner doesn't overestimate the recurrence MII when evaluating circuits.
+; CHECK: MII = 16 MAX_II = 26 (rec=16, res=5)
+define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr {
+ %8 = icmp sgt i32 %3, 64
+ tail call void @llvm.assume(i1 %8)
+ %9 = and i32 %3, 1
+ %10 = icmp eq i32 %9, 0
+ tail call void @llvm.assume(i1 %10)
+ %11 = sext i32 %5 to i64
+ %12 = sext i32 %6 to i64
+ %13 = zext nneg i32 %3 to i64
+ %14 = getelementptr i8, ptr %2, i64 %12
+ br label %16
+
+15:
+ ret void
+
+16:
+ %17 = phi i64 [ 0, %7 ], [ %24, %16 ]
+ %18 = getelementptr inbounds i8, ptr %0, i64 %17
+ %19 = load i8, ptr %18, align 1
+ %20 = sext i8 %19 to i64
+ %21 = getelementptr inbounds i8, ptr %1, i64 %20
+ store i8 2, ptr %21, align 1
+ %22 = mul nsw i64 %17, %11
+ %a1 = ashr i64 %22, 2
+ %a2 = add i64 %a1, %v1
+ %a3 = add i64 %20, %a2
+ %a4 = mul nsw i64 %a3, 5
+ %23 = getelementptr i8, ptr %14, i64 %a4
+ %a5 = load i8, ptr %23, align 1
+ %a4_truncated = trunc i64 %a4 to i8
+ %min = call i8 @llvm.smin.i8(i8 %a5, i8 %a4_truncated)
+ %res = mul i8 %min, %a5
+ store i8 %res, ptr %23, align 1
+ %24 = add nuw nsw i64 %17, 1
+ %25 = icmp eq i64 %24, %13
+ br i1 %25, label %15, label %16
+}
+
+declare void @llvm.assume(i1 noundef) #1
+declare i8 @llvm.smin.i8(i8, i8)
+
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
diff --git a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
index 08f08c41917b1a..6983c6f97cc81f 100644
--- a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
+++ b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
@@ -222,8 +222,8 @@ body: |
; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
- ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
+ ; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)
``````````
</details>
https://github.com/llvm/llvm-project/pull/105475
More information about the llvm-commits
mailing list