[llvm] [AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbability and TargetSchedmodel (PR #109818)

Thu Oct 3 08:11:17 PDT 2024

================
@@ -304,11 +299,67 @@ bool SIPreEmitPeephole::getBlockDestinations(
   return true;
 }
 
+namespace {
+class BranchWeightCostModel {
+  const SIInstrInfo &TII;
+  const TargetSchedModel &SchedModel;
+  BranchProbability BranchProb;
+  uint64_t BranchCost;
+  uint64_t ThenCyclesCost = 0;
+
+public:
+  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
+                        const MachineBasicBlock &Succ)
+      : TII(TII), SchedModel(TII.getSchedModel()) {
+    assert(SchedModel.hasInstrSchedModelOrItineraries());
+
+    const MachineBasicBlock &Head = *Branch.getParent();
+    const auto *FromIt = find(Head.successors(), &Succ);
+    assert(FromIt != Head.succ_end());
+
+    BranchProb = Head.getSuccProbability(FromIt);
+    if (BranchProb.isUnknown())
+      return;
+
+    BranchCost = SchedModel.computeInstrLatency(&Branch, false);
+  }
+
+  bool isUnknown() const { return BranchProb.isUnknown(); }
+
+  bool isProfitable(const MachineInstr &MI) {
+    assert(!isUnknown());
+
+    if (TII.isWaitcnt(MI.getOpcode()))
+      return false;
+
+    ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);
----------------
jmmartinez wrote:

I'll do some tests using `MachineTraceMetrics` (which internally uses `computeOperandLatency`).

From `MachineTraceMetrics.h`:
```cpp
// For each trace, we compute the critical path length, which is the number of
// cycles required to execute the trace when execution is limited by data
// dependencies only. We also compute the resource height, which is the number
// of cycles required to execute all instructions in the trace when ignoring
// data dependencies.
```

This would make the transformation more aggressive, since `ThenCyclesCost` would be smaller.

https://github.com/llvm/llvm-project/pull/109818