[llvm] MTM: improve operand latency when missing sched info (PR #101389)

Thu Aug 1 14:54:14 PDT 2024

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/101389

>From 1f6026b726145fb2d2f5aa2d1edf02e2b5d74eb4 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 31 Jul 2024 16:49:45 +0100
Subject: [PATCH 1/2] MTM: improve operand latency when missing sched info

TargetSchedModel::computeOperandLatency is supposed to return the exact
latency between two MIs, although it is observed that InstrSchedModel
and InstrItineraries are often unavailable in many real-world scenarios.
When these two pieces of information are not available, the function
returns an estimate that is much too conservative: the default def
latency. MachineTraceMetrics is one of the callers affected quite badly
by these conservative estimates. To improve the estimate, and let
callers of MTM generate better code, offset the default def latency by
the estimated cycles elapsed between the def MI and use MI. Since we're
trying to improve codegen in the case when no scheduling information is
available, it is impossible to determine the number of cycles elapsed
between the two MIs, and we use the distance between them as a crude
approximation. In practice, this improvement of one crude estimate by
offsetting it with another crude estimate leads to better codegen on
average, and yields huge gains on standard benchmarks.
---
 llvm/lib/CodeGen/MachineTraceMetrics.cpp | 57 ++++++++++++++++++++----
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index bf3add010574b..84bbdb008f8f3 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -761,6 +762,46 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
   }
 }
 
+/// Returns the distance between DefMI and UseMI if they're non-null and in the
+/// same BasicBlock, 0 otherwise.
+static unsigned computeDefUseDist(const MachineInstr *DefMI,
+                                  const MachineInstr *UseMI) {
+  if (!DefMI || !UseMI || DefMI == UseMI)
+    return 0;
+  const MachineBasicBlock *ParentBB = DefMI->getParent();
+  if (ParentBB != UseMI->getParent())
+    return 0;
+  auto DefIt = llvm::find_if(
+      *ParentBB, [DefMI](const MachineInstr &MI) { return DefMI == &MI; });
+  auto UseIt = llvm::find_if(
+      *ParentBB, [UseMI](const MachineInstr &MI) { return UseMI == &MI; });
+  return std::distance(DefIt, UseIt);
+}
+
+/// Wraps Sched.computeOperandLatency, accounting for the case when
+/// InstrSchedModel and InstrItineraries are not available: in this case,
+/// Sched.computeOperandLatency returns DefaultDefLatency, which is a very rough
+/// approximate; to improve this approximate, offset it by the approximate
+/// cycles elapsed from DefMI to UseMI (since the MIs could be re-ordered by the
+/// scheduler, and we don't have this information, this distance cannot be known
+/// exactly). When scheduling information is available,
+/// Sched.computeOperandLatency returns a much better estimate (especially if
+/// UseMI is non-null), so we just return that.
+static unsigned computeOperandLatency(const TargetSchedModel &Sched,
+                                      const MachineInstr *DefMI,
+                                      unsigned DefOperIdx,
+                                      const MachineInstr *UseMI,
+                                      unsigned UseOperIdx) {
+  assert(DefMI && "Non-null DefMI expected");
+  if (!Sched.hasInstrSchedModel() && !Sched.hasInstrItineraries()) {
+    unsigned DefaultDefLatency = Sched.getInstrInfo()->defaultDefLatency(
+        *Sched.getMCSchedModel(), *DefMI);
+    unsigned DefUseDist = computeDefUseDist(DefMI, UseMI);
+    return DefaultDefLatency > DefUseDist ? DefaultDefLatency - DefUseDist : 0;
+  }
+  return Sched.computeOperandLatency(DefMI, DefOperIdx, UseMI, UseOperIdx);
+}
+
 /// The length of the critical path through a trace is the maximum of two path
 /// lengths:
 ///
@@ -813,8 +854,8 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI,
     unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
     // Add latency if DefMI is a real instruction. Transients get latency 0.
     if (!Dep.DefMI->isTransient())
-      DepCycle += MTM.SchedModel
-        .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp);
+      DepCycle += computeOperandLatency(MTM.SchedModel, Dep.DefMI, Dep.DefOp,
+                                        &UseMI, Dep.UseOp);
     Cycle = std::max(Cycle, DepCycle);
   }
   // Remember the instruction depth.
@@ -929,8 +970,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
       if (!MI.isTransient()) {
         // We may not know the UseMI of this dependency, if it came from the
         // live-in list. SchedModel can handle a NULL UseMI.
-        DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(),
-                                                      I->MI, I->Op);
+        DepHeight += computeOperandLatency(SchedModel, &MI, MO.getOperandNo(),
+                                           I->MI, I->Op);
       }
       Height = std::max(Height, DepHeight);
       // This regunit is dead above MI.
@@ -965,8 +1006,8 @@ static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
                           const TargetInstrInfo *TII) {
   // Adjust height by Dep.DefMI latency.
   if (!Dep.DefMI->isTransient())
-    UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI,
-                                                  Dep.UseOp);
+    UseHeight += computeOperandLatency(SchedModel, Dep.DefMI, Dep.DefOp, &UseMI,
+                                       Dep.UseOp);
 
   // Update Heights[DefMI] to be the maximum height seen.
   MIHeightMap::iterator I;
@@ -1192,8 +1233,8 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const {
   unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth;
   // Add latency if DefMI is a real instruction. Transients get latency 0.
   if (!Dep.DefMI->isTransient())
-    DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
-                                                        &PHI, Dep.UseOp);
+    DepCycle += computeOperandLatency(TE.MTM.SchedModel, Dep.DefMI, Dep.DefOp,
+                                      &PHI, Dep.UseOp);
   return DepCycle;
 }
 

>From 691e104086a9b107c55a3503a897693338b1a554 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 1 Aug 2024 12:29:35 +0100
Subject: [PATCH 2/2] MTM: account for IssueWidth; improve patch

---
 llvm/lib/CodeGen/MachineTraceMetrics.cpp | 44 +++++++++++++++++-------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index 84bbdb008f8f3..50956ae37c4c1 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -762,20 +762,35 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
   }
 }
 
-/// Returns the distance between DefMI and UseMI if they're non-null and in the
-/// same BasicBlock, 0 otherwise.
-static unsigned computeDefUseDist(const MachineInstr *DefMI,
-                                  const MachineInstr *UseMI) {
+/// Estimates the number of cycles elapsed between DefMI and UseMI, clamped
+/// to 0 (also 0 for null or identical MIs). Returns std::nullopt when UseMI
+/// is in a different MBB than DefMI, or when it is a dangling MI.
+static std::optional<unsigned>
+estimateDefUseCycles(const TargetSchedModel &Sched, const MachineInstr *DefMI,
+                     const MachineInstr *UseMI) {
   if (!DefMI || !UseMI || DefMI == UseMI)
     return 0;
   const MachineBasicBlock *ParentBB = DefMI->getParent();
   if (ParentBB != UseMI->getParent())
-    return 0;
-  auto DefIt = llvm::find_if(
-      *ParentBB, [DefMI](const MachineInstr &MI) { return DefMI == &MI; });
-  auto UseIt = llvm::find_if(
-      *ParentBB, [UseMI](const MachineInstr &MI) { return UseMI == &MI; });
-  return std::distance(DefIt, UseIt);
+    return std::nullopt;
+
+  const auto DefIt =
+      llvm::find_if(ParentBB->instrs(),
+                    [DefMI](const MachineInstr &MI) { return DefMI == &MI; });
+  const auto UseIt =
+      llvm::find_if(ParentBB->instrs(),
+                    [UseMI](const MachineInstr &MI) { return UseMI == &MI; });
+  assert(std::distance(DefIt, UseIt) > 0 &&
+         "Def expected to appear before use");
+  unsigned NumMicroOps = 0;
+  for (auto It = DefIt; It != UseIt; ++It) {
+    // In some cases, UseMI is a dangling MI beyond the end of the MBB.
+    if (It.isEnd())
+      return std::nullopt;
+
+    NumMicroOps += Sched.getNumMicroOps(&*It);
+  }
+  return std::max(NumMicroOps / Sched.getIssueWidth(), 1u) - 1;
 }
 
 /// Wraps Sched.computeOperandLatency, accounting for the case when
@@ -783,7 +798,7 @@ static unsigned computeDefUseDist(const MachineInstr *DefMI,
 /// Sched.computeOperandLatency returns DefaultDefLatency, which is a very rough
 /// approximate; to improve this approximate, offset it by the approximate
 /// cycles elapsed from DefMI to UseMI (since the MIs could be re-ordered by the
-/// scheduler, and we don't have this information, this distance cannot be known
+/// scheduler, and we don't have this information, this cannot be known
 /// exactly). When scheduling information is available,
 /// Sched.computeOperandLatency returns a much better estimate (especially if
 /// UseMI is non-null), so we just return that.
@@ -796,8 +811,11 @@ static unsigned computeOperandLatency(const TargetSchedModel &Sched,
   if (!Sched.hasInstrSchedModel() && !Sched.hasInstrItineraries()) {
     unsigned DefaultDefLatency = Sched.getInstrInfo()->defaultDefLatency(
         *Sched.getMCSchedModel(), *DefMI);
-    unsigned DefUseDist = computeDefUseDist(DefMI, UseMI);
-    return DefaultDefLatency > DefUseDist ? DefaultDefLatency - DefUseDist : 0;
+    std::optional<unsigned> DefUseCycles =
+        estimateDefUseCycles(Sched, DefMI, UseMI);
+    if (!DefUseCycles || DefaultDefLatency <= DefUseCycles)
+      return 0;
+    return DefaultDefLatency - *DefUseCycles;
   }
   return Sched.computeOperandLatency(DefMI, DefOperIdx, UseMI, UseOperIdx);
 }