[llvm] [MachinePipeliner] Make Recurrence MII More Accurate (PR #105475)

Mon Aug 26 07:35:12 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-powerpc

Author: Michael Marjieh (mmarjieh)

<details>
<summary>Changes</summary>

The current RecMII calculation produces a value that is larger than it needs to be. This patch refines the calculation so that the recurrence MII is more accurate.

---
Full diff: https://github.com/llvm/llvm-project/pull/105475.diff


4 Files Affected:

- (modified) llvm/include/llvm/CodeGen/MachinePipeliner.h (+52-18) 
- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+19-11) 
- (added) llvm/test/CodeGen/PowerPC/sms-recmii.ll (+48) 
- (modified) llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir (+1-1) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 7fe5581faa183d..23d3181e2a0ae7 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     }
 
     void createAdjacencyStructure(SwingSchedulerDAG *DAG);
-    bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
+    bool circuit(int V, int S, NodeSetType &NodeSets,
+                 const SwingSchedulerDAG *DAG, bool HasBackedge = false);
     void unblock(int U);
   };
 
@@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
   }
 
-  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
+  bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
+                        bool isSucc = true) const;
 
   /// The distance function, which indicates that operation V of iteration I
   /// depends on operations U of iteration I-distance.
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   void computeNodeOrder(NodeSetType &NodeSets);
   void checkValidNodeOrder(const NodeSetType &Circuits) const;
   bool schedulePipeline(SMSchedule &Schedule);
-  bool computeDelta(MachineInstr &MI, unsigned &Delta);
+  bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
   MachineInstr *findDefInLoop(Register Reg);
   bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
                              unsigned &OffsetPos, unsigned &NewBase,
@@ -339,24 +341,56 @@ class NodeSet {
   using iterator = SetVector<SUnit *>::const_iterator;
 
   NodeSet() = default;
-  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
-    Latency = 0;
-    for (const SUnit *Node : Nodes) {
-      DenseMap<SUnit *, unsigned> SuccSUnitLatency;
-      for (const SDep &Succ : Node->Succs) {
-        auto SuccSUnit = Succ.getSUnit();
-        if (!Nodes.count(SuccSUnit))
+  NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
+      : Nodes(S, E), HasRecurrence(true) {
+    // Calculate the latency of this node set.
+    // Example to demonstrate the calculation:
+    // Given: N0 -> N1 -> N2 -> N0
+    // Edges:
+    // (N0 -> N1, 3)
+    // (N0 -> N1, 5)
+    // (N1 -> N2, 2)
+    // (N2 -> N0, 1)
+    // The total latency which is a lower bound of the recurrence MII is the
+    // longest path from N0 back to N0 given only the edges of this node set.
+    // In this example, the latency is: 5 + 2 + 1 = 8.
+    //
+    // Hold a map from each SUnit in the circle to the maximum distance from the
+    // source node by only considering the nodes.
+    DenseMap<SUnit *, unsigned> SUnitToDistance;
+    for (auto *Node : Nodes)
+      SUnitToDistance[Node] = 0;
+
+    for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
+      SUnit *U = Nodes[I - 1];
+      SUnit *V = Nodes[I % Nodes.size()];
+      for (const SDep &Succ : U->Succs) {
+        SUnit *SuccSUnit = Succ.getSUnit();
+        if (V != SuccSUnit)
           continue;
-        unsigned CurLatency = Succ.getLatency();
-        unsigned MaxLatency = 0;
-        if (SuccSUnitLatency.count(SuccSUnit))
-          MaxLatency = SuccSUnitLatency[SuccSUnit];
-        if (CurLatency > MaxLatency)
-          SuccSUnitLatency[SuccSUnit] = CurLatency;
+        if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
+          SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
+        }
       }
-      for (auto SUnitLatency : SuccSUnitLatency)
-        Latency += SUnitLatency.second;
     }
+    // Handle a back-edge between a store and a load
+    SUnit *FirstNode = Nodes[0];
+    SUnit *LastNode = Nodes[Nodes.size() - 1];
+
+    for (auto &PI : LastNode->Preds) {
+      // If we have an order dep that is potentially loop carried then a
+      // back-edge exists between the last node and the first node that isn't
+      // modeled in the DAG. Handle it manually by adding 1 to the distance of
+      // the last node.
+      if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
+          !DAG->isLoopCarriedDep(LastNode, PI, false))
+        continue;
+      SUnitToDistance[FirstNode] =
+          std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
+    }
+
+    // The latency is the distance from the source node to itself.
+    Latency = SUnitToDistance[Nodes.front()];
   }
 
   bool insert(SUnit *SU) { return Nodes.insert(SU); }
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 416129ff837c33..3f22e78c53f04b 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1680,13 +1680,19 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         Added.set(N);
       }
     }
-    // A chain edge between a store and a load is treated as a back-edge in the
-    // adjacency matrix.
+    // Order edges of the following:
+    // 1. Load -> Store
+    // 2. Store -> Load
+    // are treated as a back-edge in the adjacency matrix.
+    // Store after store was handled above.
     for (auto &PI : SUnits[i].Preds) {
-      if (!SUnits[i].getInstr()->mayStore() ||
+      if (PI.getKind() != SDep::Order ||
           !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
         continue;
-      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
+      if ((SUnits[i].getInstr()->mayLoad() &&
+           PI.getSUnit()->getInstr()->mayStore()) ||
+          (SUnits[i].getInstr()->mayStore() &&
+           PI.getSUnit()->getInstr()->mayLoad())) {
         int N = PI.getSUnit()->NodeNum;
         if (!Added.test(N)) {
           AdjK[i].push_back(N);
@@ -1706,6 +1712,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
 /// Identify an elementary circuit in the dependence graph starting at the
 /// specified node.
 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
+                                          const SwingSchedulerDAG *DAG,
                                           bool HasBackedge) {
   SUnit *SV = &SUnits[V];
   bool F = false;
@@ -1719,12 +1726,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
       continue;
     if (W == S) {
       if (!HasBackedge)
-        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
+        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
       F = true;
       ++NumPaths;
       break;
-    } else if (!Blocked.test(W)) {
-      if (circuit(W, S, NodeSets,
+    }
+    if (!Blocked.test(W)) {
+      if (circuit(W, S, NodeSets, DAG,
                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
         F = true;
     }
@@ -1767,9 +1775,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
   Cir.createAdjacencyStructure(this);
-  for (int i = 0, e = SUnits.size(); i != e; ++i) {
+  for (int I = 0, E = SUnits.size(); I != E; ++I) {
     Cir.reset();
-    Cir.circuit(i, i, NodeSets);
+    Cir.circuit(I, I, NodeSets, this);
   }
 
   // Change the dependences back so that we've created a DAG again.
@@ -2565,7 +2573,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
 
 /// Return true if we can compute the amount the instruction changes
 /// during each iteration. Set Delta to the amount of the change.
-bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
+bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineOperand *BaseOp;
   int64_t Offset;
@@ -2719,7 +2727,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 /// potentially. A dependence is loop carried if the destination defines a value
 /// that may be used or defined by the source in a subsequent iteration.
 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
-                                         bool isSucc) {
+                                         bool isSucc) const {
   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
       Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
     return false;
diff --git a/llvm/test/CodeGen/PowerPC/sms-recmii.ll b/llvm/test/CodeGen/PowerPC/sms-recmii.ll
new file mode 100644
index 00000000000000..82b1c9e82fcb78
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-recmii.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
+; RUN:       -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s
+
+; Test that the pipeliner doesn't overestimate the recurrence MII when evaluating circuits.
+; CHECK: MII = 16 MAX_II = 26 (rec=16, res=5)
+define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr {
+  %8 = icmp sgt i32 %3, 64
+  tail call void @llvm.assume(i1 %8)
+  %9 = and i32 %3, 1
+  %10 = icmp eq i32 %9, 0
+  tail call void @llvm.assume(i1 %10)
+  %11 = sext i32 %5 to i64
+  %12 = sext i32 %6 to i64
+  %13 = zext nneg i32 %3 to i64
+  %14 = getelementptr i8, ptr %2, i64 %12
+  br label %16
+
+15:
+  ret void
+
+16:
+  %17 = phi i64 [ 0, %7 ], [ %24, %16 ]
+  %18 = getelementptr inbounds i8, ptr %0, i64 %17
+  %19 = load i8, ptr %18, align 1
+  %20 = sext i8 %19 to i64
+  %21 = getelementptr inbounds i8, ptr %1, i64 %20
+  store i8 2, ptr %21, align 1
+  %22 = mul nsw i64 %17, %11
+  %a1 = ashr i64 %22, 2
+  %a2 = add i64 %a1, %v1
+  %a3 = add i64 %20, %a2
+  %a4 = mul nsw i64 %a3, 5
+  %23 = getelementptr i8, ptr %14, i64 %a4
+  %a5 = load i8, ptr %23, align 1
+  %a4_truncated = trunc i64 %a4 to i8
+  %min = call i8 @llvm.smin.i8(i8 %a5, i8 %a4_truncated)
+  %res = mul i8 %min, %a5
+  store i8 %res, ptr %23, align 1
+  %24 = add nuw nsw i64 %17, 1
+  %25 = icmp eq i64 %24, %13
+  br i1 %25, label %15, label %16
+}
+
+declare void @llvm.assume(i1 noundef) #1
+declare i8 @llvm.smin.i8(i8, i8)
+
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
diff --git a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
index 08f08c41917b1a..6983c6f97cc81f 100644
--- a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
+++ b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
@@ -222,8 +222,8 @@ body:             |
   ; CHECK-NEXT:   [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
   ; CHECK-NEXT:   [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
-  ; CHECK-NEXT:   t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
   ; CHECK-NEXT:   t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
   ; CHECK-NEXT:   [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)

``````````

</details>


https://github.com/llvm/llvm-project/pull/105475