[llvm] [llvm][CodeGen] Add a new software pipeliner 'Window Scheduler' (PR #84443)

Sun Apr 7 00:50:37 PDT 2024

================
@@ -0,0 +1,692 @@
+//======----------- WindowScheduler.cpp - window scheduler -------------======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Window Scheduling software pipelining algorithm.
+//
+// The fundamental concept of the window scheduling algorithm involves folding
+// the original MBB at a specific position, followed by list scheduling on the
+// folded MIs. The optimal scheduling result is then chosen from various folding
+// positions as the final scheduling outcome.
+//
+// The primary challenge in this algorithm lies in generating the folded MIs and
+// establishing their dependencies. We have innovatively employed a new MBB,
+// created by copying the original MBB three times, known as TripleMBB. This
+// TripleMBB enables the convenient implementation of MI folding and dependency
+// establishment. To facilitate the algorithm's implementation, we have also
+// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
+//
+// Another challenge in the algorithm is the scheduling of phis. Semantically,
+// it is difficult to place the phis in the window and perform list scheduling.
+// Therefore, we schedule these phis separately after each list scheduling.
+//
+// The provided implementation is designed for use before the Register Allocator
+// (RA). If the target requires implementation after RA, it is recommended to
+// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
+// target-specific logic can be added in initialize(), preProcess(), and
+// postProcess().
+//
+// Lastly, it is worth mentioning that getSearchIndexes() is an important
+// function. We have experimented with more complex heuristics on a downstream
+// target and achieved favorable results.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/WindowScheduler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePipeliner.h"
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pipeliner"
+
+namespace {
+// Counters for the window scheduler, all bumped in run():
+//  - NumTryWindowSchedule: once per loop, after initialize() has succeeded.
+//  - NumTryWindowSearch:   once per candidate search index.
+//  - NumWindowSchedule:    once per loop, after expand() has been called.
+//  - NumFailAnalyseII:     each time analyseII() returns WindowIILimit.
+STATISTIC(NumTryWindowSchedule,
+          "Number of loops that we attempt to use window scheduling");
+STATISTIC(NumTryWindowSearch,
+          "Number of times that we run list schedule in the window scheduling");
+STATISTIC(NumWindowSchedule,
+          "Number of loops that we successfully use window scheduling");
+STATISTIC(NumFailAnalyseII,
+          "Window scheduling abort due to the failure of the II analysis");
+
+// Search budget, expressed as an absolute count. run() forwards this value,
+// together with WindowSearchRatio, to getSearchIndexes().
+cl::opt<unsigned>
+    WindowSearchNum("window-search-num",
+                    cl::desc("The number of searches per loop in the window "
+                             "algorithm. 0 means no search number limit."),
+                    cl::Hidden, cl::init(6));
+
+// Search budget, expressed as a percentage of the positions in the loop.
+// run() forwards this value, together with WindowSearchNum, to
+// getSearchIndexes().
+cl::opt<unsigned> WindowSearchRatio(
+    "window-search-ratio",
+    cl::desc("The ratio of searches per loop in the window algorithm. 100 "
+             "means search all positions in the loop, while 0 means not "
+             "performing any search."),
+    cl::Hidden, cl::init(40));
+
+// Coefficient used when the initial II is computed (see the option
+// description below).
+cl::opt<unsigned> WindowIICoeff(
+    "window-ii-coeff",
+    cl::desc(
+        "The coefficient used when initializing II in the window algorithm."),
+    cl::Hidden, cl::init(5));
+
+// Minimum loop size: initialize() rejects a loop whose number of non-phi,
+// non-debug, non-terminator instructions (SchedInstrNum) is less than or
+// equal to this value.
+cl::opt<unsigned> WindowRegionLimit(
+    "window-region-limit",
+    cl::desc(
+        "The lower limit of the scheduling region in the window algorithm."),
+    cl::Hidden, cl::init(3));
+
+// Minimum required gain between the best II found and the base II (see the
+// option description below).
+cl::opt<unsigned> WindowDiffLimit(
+    "window-diff-limit",
+    cl::desc("The lower limit of the difference between best II and base II in "
+             "the window algorithm. If the difference is smaller than "
+             "this lower limit, window scheduling will not be performed."),
+    cl::Hidden, cl::init(2));
+} // namespace
+
+// WindowIILimit serves as an indicator of abnormal scheduling results: run()
+// treats an II equal to this value as "no valid II found" and discards that
+// search position. It is defined outside the anonymous namespace above because
+// it could potentially be referenced by the derived target window scheduler.
+cl::opt<unsigned>
+    WindowIILimit("window-ii-limit",
+                  cl::desc("The upper limit of II in the window algorithm."),
+                  cl::Hidden, cl::init(1000));
+
+// Binds the scheduler to the scheduling context C and to the header block of
+// loop ML, caches the function-level objects used throughout the algorithm,
+// and creates the graph-only scheduler that is kept in TripleDAG.
+WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
+    : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML) {
+  // Register information of the function.
+  MRI = &MF->getRegInfo();
+  // Subtarget and the target hooks obtained from it.
+  Subtarget = &MF->getSubtarget();
+  TRI = Subtarget->getRegisterInfo();
+  TII = Subtarget->getInstrInfo();
+  // This scheduler instance is created with OnlyBuildGraph set.
+  TripleDAG.reset(createMachineScheduler(/*OnlyBuildGraph=*/true));
+}
+
+// Entry point of the window scheduler. Returns true when a schedule has been
+// found and expanded, and false when initialization fails or when
+// isScheduleValid() rejects the outcome of the search.
+bool WindowScheduler::run() {
+  if (!initialize()) {
+    LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
+    return false;
+  }
+  // The search is expensive in compile time, so it is recorded in the time
+  // trace.
+  TimeTraceScope Scope("WindowSearch");
+  ++NumTryWindowSchedule;
+  // Work that has to happen before any window is tried.
+  preProcess();
+  // Try each candidate window position in turn.
+  std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
+  for (unsigned Idx : getSearchIndexes(WindowSearchNum, WindowSearchRatio)) {
+    OriToCycle.clear();
+    ++NumTryWindowSearch;
+    // Scheduling starts at a non-phi instruction, hence the phi count is
+    // added to the search index.
+    unsigned Offset = Idx + SchedPhiNum;
+    auto Range = getScheduleRange(Offset, SchedInstrNum);
+    SchedDAG->startBlock(MBB);
+    SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
+    SchedDAG->schedule();
+    LLVM_DEBUG(SchedDAG->dump());
+    unsigned II = analyseII(*SchedDAG, Offset);
+    // An II equal to WindowIILimit marks a failed II analysis.
+    bool FoundII = II != WindowIILimit;
+    if (FoundII) {
+      schedulePhi(Offset, II);
+      updateScheduleResult(Offset, II);
+    }
+    // The TripleMBB is restored after every attempt, successful or not.
+    restoreTripleMBB();
+    if (!FoundII) {
+      LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
+      ++NumFailAnalyseII;
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
+                      << II << ".\n");
+  }
+  // Work that has to happen once the search is over.
+  postProcess();
+  // Give up when the search did not produce a usable result.
+  if (!isScheduleValid()) {
+    LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
+                    << " and Best II is " << BestII << ".\n");
+  // Turn the best result into prologue, kernel, and epilogue.
+  expand();
+  ++NumWindowSchedule;
+  return true;
+}
+
+// Creates the scheduler object used by the algorithm. When OnlyBuildGraph is
+// false, the scheduler comes from the pass configuration; otherwise a
+// ScheduleDAGMI driven by a PostGenericScheduler is allocated. The returned
+// raw pointer is wrapped in a std::unique_ptr by the callers in this file.
+ScheduleDAGInstrs *
+WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
+  if (!OnlyBuildGraph)
+    return Context->PassConfig->createMachineScheduler(Context);
+  return new ScheduleDAGMI(
+      Context, std::make_unique<PostGenericScheduler>(Context), true);
+}
+
+// Resets the per-loop state of the algorithm and checks whether the loop block
+// MBB can be handled by window scheduling.
+//
+// Returns false (after emitting a debug message) when:
+//  - the subtarget does not enable the window scheduler,
+//  - no LiveIntervals analysis is available,
+//  - a phi reads a register defined by an earlier phi of the same block,
+//  - an instruction is a scheduling boundary,
+//  - the target asks to ignore an instruction for pipelining,
+//  - an instruction defines a physical register, or
+//  - the block holds too few schedulable instructions.
+// Returns true otherwise; SchedPhiNum, SchedInstrNum and BestOffset are then
+// set from the contents of MBB.
+bool WindowScheduler::initialize() {
+  if (!Subtarget->enableWindowScheduler()) {
+    LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
+    return false;
+  }
+  // Initialize the member variables used by the window algorithm.
+  OriMIs.clear();
+  TriMIs.clear();
+  TriToOri.clear();
+  OriToCycle.clear();
+  SchedResult.clear();
+  SchedPhiNum = 0;
+  SchedInstrNum = 0;
+  BestII = UINT_MAX;
+  BestOffset = 0;
+  BaseII = 0;
+  // List scheduling used in the window algorithm depends on LiveIntervals.
+  if (!Context->LIS) {
+    LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
+    return false;
+  }
+  // Check each MI in MBB.
+  SmallVector<Register, 8> PhiDefs;
+  // The target may not provide any loop information, in which case PLI is
+  // null and the target-specific check below has to be skipped.
+  auto PLI = TII->analyzeLoopForPipelining(MBB);
+  for (auto &MI : *MBB) {
+    if (MI.isDebugInstr() || MI.isTerminator())
+      continue;
+    if (MI.isPHI()) {
+      // A phi must not read a register defined by a preceding phi.
+      for (auto Def : PhiDefs)
+        if (MI.readsRegister(Def, TRI)) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Consecutive phis are not allowed in window scheduling!\n");
+          return false;
+        }
+      for (const auto &Def : MI.defs())
+        if (Def.isReg())
+          PhiDefs.push_back(Def.getReg());
+      // Phis are counted separately; BestOffset starts after all of them.
+      ++SchedPhiNum;
+      ++BestOffset;
+    } else
+      ++SchedInstrNum;
+    if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
+      LLVM_DEBUG(
+          dbgs() << "Boundary MI is not allowed in window scheduling!\n");
+      return false;
+    }
+    if (PLI && PLI->shouldIgnoreForPipelining(&MI)) {
+      LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
+                           "window scheduling!\n");
+      return false;
+    }
+    for (const auto &Def : MI.defs())
+      if (Def.isReg() && Def.getReg().isPhysical()) {
+        LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
+                             "window scheduling!\n");
+        return false;
+      }
+  }
+  if (SchedInstrNum <= WindowRegionLimit) {
+    LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
+    return false;
+  }
+  return true;
+}
+
+// Prepares the block for the search: the original instructions are backed up,
+// the TripleMBB is generated in their place, and the dependence graph of
+// TripleDAG is built over every instruction preceding the first terminator.
+void WindowScheduler::preProcess() {
+  backupMBB();
+  generateTripleMBB();
+  TripleDAG->startBlock(MBB);
+  // The region spans from the start of the block to its first terminator.
+  auto RegionBegin = MBB->begin();
+  auto RegionEnd = MBB->getFirstTerminator();
+  TripleDAG->enterRegion(MBB, RegionBegin, RegionEnd,
+                         std::distance(RegionBegin, RegionEnd));
+  TripleDAG->buildSchedGraph(Context->AA);
+}
+
+// Counterpart of preProcess(), called by run() once the search loop is over.
+void WindowScheduler::postProcess() {
+  // After window scheduling, it's necessary to clear the TripleDAG and restore
+  // to the original MBB.
+  // Leave the region and the block that preProcess() entered on TripleDAG.
+  TripleDAG->exitRegion();
+  TripleDAG->finishBlock();
+  // Erase the instructions currently in MBB and put back the ones recorded in
+  // OriMIs.
+  restoreMBB();
+}
+
+// Records every instruction of MBB in OriMIs and then empties the block. The
+// instructions are only unlinked, not erased, so that restoreMBB() can put
+// them back later.
+void WindowScheduler::backupMBB() {
+  for (auto &MI : MBB->instrs())
+    OriMIs.push_back(&MI);
+  // Drop each MI from the slot index maps before unlinking it from the block.
+  auto *Indexes = Context->LIS->getSlotIndexes();
+  for (auto &MI : make_early_inc_range(*MBB)) {
+    Indexes->removeMachineInstrFromMaps(MI, true);
+    MBB->remove(&MI);
+  }
+}
+
+// Discards whatever MBB currently contains and re-inserts the instructions
+// recorded in OriMIs, then refreshes the live intervals.
+void WindowScheduler::restoreMBB() {
+  // Drop each MI from the slot index maps and erase it.
+  auto *Indexes = Context->LIS->getSlotIndexes();
+  for (auto &MI : make_early_inc_range(*MBB)) {
+    Indexes->removeMachineInstrFromMaps(MI, true);
+    MI.eraseFromParent();
+  }
+  // Bring MBB back to its state before window scheduling.
+  for (auto *OriMI : OriMIs)
+    MBB->push_back(OriMI);
+  updateLiveIntervals();
+}
+
+void WindowScheduler::generateTripleMBB() {
+  const unsigned DuplicateNum = 3;
+  TriMIs.clear();
+  TriToOri.clear();
+  assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
+  // Step 1: Performing the first copy of MBB instructions, excluding
+  // terminators. At the same time, we back up the anti-register of phis.
+  // DefPairs hold the old and new define register pairs.
+  std::map<Register, Register> DefPairs;
+  for (auto *MI : OriMIs) {
+    if (MI->isDebugInstr() || MI->isTerminator())
+      continue;
+    if (MI->isPHI())
+      if (Register AntiReg = getAntiRegister(MI))
+        DefPairs[MI->getOperand(0).getReg()] = AntiReg;
+    auto *NewMI = MF->CloneMachineInstr(MI);
+    MBB->push_back(NewMI);
+    TriMIs.push_back(NewMI);
+    TriToOri[NewMI] = MI;
+  }
+  // Step 2: Performing the remaining two copies of MBB instructions excluding
+  // phis, and the last one contains terminators. At the same time, registers
+  // are updated accordingly.
+  for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
+    for (auto *MI : OriMIs) {
+      if (MI->isPHI() || MI->isDebugInstr() ||
+          (MI->isTerminator() && Cnt < DuplicateNum - 1))
+        continue;
+      auto *NewMI = MF->CloneMachineInstr(MI);
+      std::map<Register, Register> NewDefs;
+      // New defines are updated.
+      for (auto MO : NewMI->defs())
+        if (MO.isReg() && MO.getReg().isVirtual()) {
+          Register NewDef =
+              MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
+          NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
+          NewDefs[MO.getReg()] = NewDef;
+        }
+      // New uses are updated.
+      for (auto DefRegPair : DefPairs)
+        if (NewMI->readsRegister(DefRegPair.first, TRI)) {
+          Register NewUse = DefRegPair.second;
+          // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
+          //
+          // BB.3:                                  DefPairs
+          // ==================================
+          // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]  (%1,%7)
+          // ...
+          // ==================================
+          // ...
+          // %4 = sub i32 %1, %3
+          // ...
+          // %7 = add i32 %5, %6
+          // ...
+          // ----------------------------------
+          // ...
+          // %8 = sub i32 %7, %3                    (%1,%7),(%4,%8)
+          // ...
+          // %9 = add i32 %5, %6                    (%1,%7),(%4,%8),(%7,%9)
+          // ...
+          // ----------------------------------
+          // ...
+          // %10 = sub i32 %9, %3                   (%1,%7),(%4,%10),(%7,%9)
+          // ...            ^
+          // %11 = add i32 %5, %6                   (%1,%7),(%4,%10),(%7,%11)
+          // ...
+          // ==================================
+          //          < Terminators >
+          // ==================================
+          if (DefPairs.count(NewUse))
+            NewUse = DefPairs[NewUse];
+          NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
+        }
+      // DefPairs is updated at last.
+      for (auto &NewDef : NewDefs)
+        DefPairs[NewDef.first] = NewDef.second;
+      MBB->push_back(NewMI);
+      TriMIs.push_back(NewMI);
+      TriToOri[NewMI] = MI;
+    }
+  }
+  // Step 3: The registers used by phis are updated, and they are generated in
+  // the third copy of MBB.
+  // In the previous example, the old phi is:
+  // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
+  // The new phi is:
+  // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
+  for (auto &Phi : MBB->phis())
+    for (auto DefRegPair : DefPairs)
+      if (Phi.readsRegister(DefRegPair.first, TRI))
+        Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
----------------
huaatian wrote:

I'm not sure I have understood your comment correctly, so let me elaborate on this part of the code.
In this section of the algorithm, DefPairs also contains the substitute registers for the phi-defined registers. Therefore, the replacement must be restricted to the registers that are used by the phis.

https://github.com/llvm/llvm-project/pull/84443