[llvm] [llvm][CodeGen] Add a new software pipeliner 'Window Scheduler' (PR #84443)
Hua Tian via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 7 01:02:45 PDT 2024
================
@@ -0,0 +1,692 @@
+//======----------- WindowScheduler.cpp - window scheduler -------------======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Window Scheduling software pipelining algorithm.
+//
+// The fundamental concept of the window scheduling algorithm involves folding
+// the original MBB at a specific position, followed by list scheduling on the
+// folded MIs. The optimal scheduling result is then chosen from various folding
+// positions as the final scheduling outcome.
+//
+// The primary challenge in this algorithm lies in generating the folded MIs and
+// establishing their dependencies. We have innovatively employed a new MBB,
+// created by copying the original MBB three times, known as TripleMBB. This
+// TripleMBB enables the convenient implementation of MI folding and dependency
+// establishment. To facilitate the algorithm's implementation, we have also
+// devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
+//
+// Another challenge in the algorithm is the scheduling of phis. Semantically,
+// it is difficult to place the phis in the window and perform list scheduling.
+// Therefore, we schedule these phis separately after each list scheduling.
+//
+// The provided implementation is designed for use before the Register Allocator
+// (RA). If the target requires implementation after RA, it is recommended to
+// reimplement analyseII(), schedulePhi(), and expand(). Additionally,
+// target-specific logic can be added in initialize(), preProcess(), and
+// postProcess().
+//
+// Lastly, it is worth mentioning that getSearchIndexes() is an important
+// function. We have experimented with more complex heuristics on downstream
+// target and achieved favorable results.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/WindowScheduler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePipeliner.h"
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pipeliner"
+
+namespace {
+STATISTIC(NumTryWindowSchedule,
+ "Number of loops that we attempt to use window scheduling");
+STATISTIC(NumTryWindowSearch,
+ "Number of times that we run list schedule in the window scheduling");
+STATISTIC(NumWindowSchedule,
+ "Number of loops that we successfully use window scheduling");
+STATISTIC(NumFailAnalyseII,
+ "Window scheduling abort due to the failure of the II analysis");
+
+cl::opt<unsigned>
+ WindowSearchNum("window-search-num",
+ cl::desc("The number of searches per loop in the window "
+ "algorithm. 0 means no search number limit."),
+ cl::Hidden, cl::init(6));
+
+cl::opt<unsigned> WindowSearchRatio(
+ "window-search-ratio",
+ cl::desc("The ratio of searches per loop in the window algorithm. 100 "
+ "means search all positions in the loop, while 0 means not "
+ "performing any search."),
+ cl::Hidden, cl::init(40));
+
+cl::opt<unsigned> WindowIICoeff(
+ "window-ii-coeff",
+ cl::desc(
+ "The coefficient used when initializing II in the window algorithm."),
+ cl::Hidden, cl::init(5));
+
+cl::opt<unsigned> WindowRegionLimit(
+ "window-region-limit",
+ cl::desc(
+ "The lower limit of the scheduling region in the window algorithm."),
+ cl::Hidden, cl::init(3));
+
+cl::opt<unsigned> WindowDiffLimit(
+ "window-diff-limit",
+ cl::desc("The lower limit of the difference between best II and base II in "
+ "the window algorithm. If the difference is smaller than "
+ "this lower limit, window scheduling will not be performed."),
+ cl::Hidden, cl::init(2));
+} // namespace
+
+// WindowIILimit serves as an indicator of abnormal scheduling results and could
+// potentially be referenced by the derived target window scheduler.
+cl::opt<unsigned>
+ WindowIILimit("window-ii-limit",
+ cl::desc("The upper limit of II in the window algorithm."),
+ cl::Hidden, cl::init(1000));
+
+WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
+ : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML) {
+ Subtarget = &(MF->getSubtarget());
+ TII = Subtarget->getInstrInfo();
+ TRI = Subtarget->getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
+ createMachineScheduler(/*OnlyBuildGraph=*/true));
+}
+
+bool WindowScheduler::run() {
+ if (!initialize()) {
+ LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
+ return false;
+ }
+ // The window algorithm is time-consuming, and its compilation time should be
+ // taken into consideration.
+ TimeTraceScope Scope("WindowSearch");
+ ++NumTryWindowSchedule;
+ // Performing the relevant processing before window scheduling.
+ preProcess();
+ // The main window scheduling begins.
+ std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
+ auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
+ for (unsigned Idx : SearchIndexes) {
+ OriToCycle.clear();
+ ++NumTryWindowSearch;
+ // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
+ // be added to Idx.
+ unsigned Offset = Idx + SchedPhiNum;
+ auto Range = getScheduleRange(Offset, SchedInstrNum);
+ SchedDAG->startBlock(MBB);
+ SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
+ SchedDAG->schedule();
+ LLVM_DEBUG(SchedDAG->dump());
+ unsigned II = analyseII(*SchedDAG, Offset);
+ if (II == WindowIILimit) {
+ restoreTripleMBB();
+ LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
+ ++NumFailAnalyseII;
+ continue;
+ }
+ schedulePhi(Offset, II);
+ updateScheduleResult(Offset, II);
+ restoreTripleMBB();
+ LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
+ << II << ".\n");
+ }
+ // Performing the relevant processing after window scheduling.
+ postProcess();
+ // Check whether the scheduling result is valid.
+ if (!isScheduleValid()) {
+ LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
+ << " and Best II is " << BestII << ".\n");
+ // Expand the scheduling result to prologue, kernel, and epilogue.
+ expand();
+ ++NumWindowSchedule;
+ return true;
+}
+
+ScheduleDAGInstrs *
+WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
+ return OnlyBuildGraph
+ ? new ScheduleDAGMI(
+ Context, std::make_unique<PostGenericScheduler>(Context),
+ true)
+ : Context->PassConfig->createMachineScheduler(Context);
+}
+
+bool WindowScheduler::initialize() {
+ if (!Subtarget->enableWindowScheduler()) {
+ LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
+ return false;
+ }
+ // Initialized the member variables used by window algorithm.
+ OriMIs.clear();
+ TriMIs.clear();
+ TriToOri.clear();
+ OriToCycle.clear();
+ SchedResult.clear();
+ SchedPhiNum = 0;
+ SchedInstrNum = 0;
+ BestII = UINT_MAX;
+ BestOffset = 0;
+ BaseII = 0;
+ // List scheduling used in the window algorithm depends on LiveIntervals.
+ if (!Context->LIS) {
+ LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
+ return false;
+ }
+ // Check each MI in MBB.
+ SmallVector<Register, 8> PhiDefs;
+ auto PLI = TII->analyzeLoopForPipelining(MBB);
+ for (auto &MI : *MBB) {
+ if (MI.isDebugInstr() || MI.isTerminator())
+ continue;
+ if (MI.isPHI()) {
+ for (auto Def : PhiDefs)
+ if (MI.readsRegister(Def, TRI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Consecutive phis are not allowed in window scheduling!\n");
+ return false;
+ }
+ for (auto Def : MI.defs())
+ if (Def.isReg())
+ PhiDefs.push_back(Def.getReg());
+ ++SchedPhiNum;
+ ++BestOffset;
+ } else
+ ++SchedInstrNum;
+ if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
+ LLVM_DEBUG(
+ dbgs() << "Boundary MI is not allowed in window scheduling!\n");
+ return false;
+ }
+ if (PLI->shouldIgnoreForPipelining(&MI)) {
+ LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
+ "window scheduling!\n");
+ return false;
+ }
+ for (auto &Def : MI.defs())
+ if (Def.isReg() && Def.getReg().isPhysical())
+ return false;
+ }
+ if (SchedInstrNum <= WindowRegionLimit) {
+ LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
+ return false;
+ }
+ return true;
+}
+
+void WindowScheduler::preProcess() {
+ // Prior to window scheduling, it's necessary to backup the original MBB,
+ // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
+ backupMBB();
+ generateTripleMBB();
+ TripleDAG->startBlock(MBB);
+ TripleDAG->enterRegion(
+ MBB, MBB->begin(), MBB->getFirstTerminator(),
+ std::distance(MBB->begin(), MBB->getFirstTerminator()));
+ TripleDAG->buildSchedGraph(Context->AA);
+}
+
+void WindowScheduler::postProcess() {
+ // After window scheduling, it's necessary to clear the TripleDAG and restore
+ // to the original MBB.
+ TripleDAG->exitRegion();
+ TripleDAG->finishBlock();
+ restoreMBB();
+}
+
+void WindowScheduler::backupMBB() {
+ for (auto &MI : MBB->instrs())
+ OriMIs.push_back(&MI);
+ // Remove MIs and the corresponding live intervals.
+ for (auto &MI : make_early_inc_range(*MBB)) {
+ Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
+ MBB->remove(&MI);
+ }
+}
+
+void WindowScheduler::restoreMBB() {
+ // Erase MIs and the corresponding live intervals.
+ for (auto &MI : make_early_inc_range(*MBB)) {
+ Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
+ MI.eraseFromParent();
+ }
+ // Restore MBB to the state before window scheduling.
+ for (auto *MI : OriMIs)
+ MBB->push_back(MI);
+ updateLiveIntervals();
+}
+
+void WindowScheduler::generateTripleMBB() {
+ const unsigned DuplicateNum = 3;
+ TriMIs.clear();
+ TriToOri.clear();
+ assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
+ // Step 1: Performing the first copy of MBB instructions, excluding
+ // terminators. At the same time, we back up the anti-register of phis.
+ // DefPairs hold the old and new define register pairs.
+ std::map<Register, Register> DefPairs;
+ for (auto *MI : OriMIs) {
+ if (MI->isDebugInstr() || MI->isTerminator())
+ continue;
+ if (MI->isPHI())
+ if (Register AntiReg = getAntiRegister(MI))
+ DefPairs[MI->getOperand(0).getReg()] = AntiReg;
+ auto *NewMI = MF->CloneMachineInstr(MI);
+ MBB->push_back(NewMI);
+ TriMIs.push_back(NewMI);
+ TriToOri[NewMI] = MI;
+ }
+ // Step 2: Performing the remaining two copies of MBB instructions excluding
+ // phis, and the last one contains terminators. At the same time, registers
+ // are updated accordingly.
+ for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
+ for (auto *MI : OriMIs) {
+ if (MI->isPHI() || MI->isDebugInstr() ||
+ (MI->isTerminator() && Cnt < DuplicateNum - 1))
+ continue;
+ auto *NewMI = MF->CloneMachineInstr(MI);
+ std::map<Register, Register> NewDefs;
+ // New defines are updated.
+ for (auto MO : NewMI->defs())
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ Register NewDef =
+ MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
+ NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
+ NewDefs[MO.getReg()] = NewDef;
+ }
+ // New uses are updated.
+ for (auto DefRegPair : DefPairs)
+ if (NewMI->readsRegister(DefRegPair.first, TRI)) {
+ Register NewUse = DefRegPair.second;
+ // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
+ //
+ // BB.3: DefPairs
+ // ==================================
+ // %1 = phi i32 [%2, %BB.1], [%7, %BB.3] (%1,%7)
+ // ...
+ // ==================================
+ // ...
+ // %4 = sub i32 %1, %3
+ // ...
+ // %7 = add i32 %5, %6
+ // ...
+ // ----------------------------------
+ // ...
+ // %8 = sub i32 %7, %3 (%1,%7),(%4,%8)
+ // ...
+ // %9 = add i32 %5, %6 (%1,%7),(%4,%8),(%7,%9)
+ // ...
+ // ----------------------------------
+ // ...
+ // %10 = sub i32 %9, %3 (%1,%7),(%4,%10),(%7,%9)
+ // ... ^
+ // %11 = add i32 %5, %6 (%1,%7),(%4,%10),(%7,%11)
+ // ...
+ // ==================================
+ // < Terminators >
+ // ==================================
+ if (DefPairs.count(NewUse))
+ NewUse = DefPairs[NewUse];
+ NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
+ }
+ // DefPairs is updated at last.
+ for (auto &NewDef : NewDefs)
+ DefPairs[NewDef.first] = NewDef.second;
+ MBB->push_back(NewMI);
+ TriMIs.push_back(NewMI);
+ TriToOri[NewMI] = MI;
+ }
+ }
+ // Step 3: The registers used by phis are updated, and they are generated in
+ // the third copy of MBB.
+ // In the privious example, the old phi is:
+ // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
+ // The new phi is:
+ // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
+ for (auto &Phi : MBB->phis())
+ for (auto DefRegPair : DefPairs)
+ if (Phi.readsRegister(DefRegPair.first, TRI))
+ Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
+ updateLiveIntervals();
+}
+
+void WindowScheduler::restoreTripleMBB() {
+ // After list scheduling, the MBB is restored in one traversal.
+ for (size_t I = 0; I < TriMIs.size(); ++I) {
+ auto *MI = TriMIs[I];
+ auto OldPos = MBB->begin();
+ std::advance(OldPos, I);
+ auto CurPos = MI->getIterator();
+ if (CurPos != OldPos) {
+ MBB->splice(OldPos, MBB, CurPos);
+ Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
+ }
+ }
+}
+
+SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
+ unsigned SearchRatio) {
+ // We use SearchRatio to get the index range, and then evenly get the indexes
+ // according to the SearchNum. This is a simple huristic. Depending on the
+ // characteristics of the target, more complex algorithms can be used for both
+ // performance and compilation time.
+ assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
+ unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
+ unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
+ SmallVector<unsigned> SearchIndexes;
+ for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
+ SearchIndexes.push_back(Idx);
+ return SearchIndexes;
+}
+
+int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
+ // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
+ unsigned MaxDepth = 1;
+ for (auto &SU : DAG.SUnits)
+ MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
+ return MaxDepth * WindowIICoeff;
+}
+
+int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
+ unsigned Offset) {
+ int InitII = getEstimatedII(DAG);
+ ResourceManager RM(Subtarget, &DAG);
+ RM.init(InitII);
+ // ResourceManager and DAG are used to calculate the maximum cycle for the
+ // scheduled MIs. Since MIs in the Region have already been scheduled, the
+ // emit cycles can be estimated in order here.
+ int CurCycle = 0;
+ auto Range = getScheduleRange(Offset, SchedInstrNum);
+ for (auto &MI : Range) {
+ auto *SU = DAG.getSUnit(&MI);
+ int ExpectCycle = CurCycle;
+ // The predecessors of current MI determine its earliest issue cycle.
+ for (auto &Pred : SU->Preds) {
+ auto *PredMI = Pred.getSUnit()->getInstr();
+ int PredCycle = getOriCycle(PredMI);
+ ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
+ }
+ // ResourceManager can be used to detect resource conflicts between the
+ // current MI and the previously inserted MIs.
+ while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
+ ++CurCycle;
+ if (CurCycle == (int)WindowIILimit)
+ return CurCycle;
+ }
+ RM.reserveResources(*SU, CurCycle);
+ OriToCycle[getOriMI(&MI)] = CurCycle;
+ LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
+ << getOriStage(getOriMI(&MI), Offset) << "]: ");
+ LLVM_DEBUG(MI.dump());
+ }
+ LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
+ return CurCycle;
+}
+
+// By utilizing TripleDAG, we can easily establish dependencies between A and B.
+// Based on the MaxCycle and the issue cycle of A and B, we can determine
+// whether it is necessary to add a stall cycle. This is because, without
+// inserting the stall cycle, the latency constraint between A and B cannot be
+// satisfied. The details are as follows:
+//
+// New MBB:
+// ========================================
+// < Phis >
+// ======================================== (sliding direction)
+// MBB copy 1 |
+// V
+//
+// ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~ ----schedule window-----
+// | |
+// ===================V==================== |
+// MBB copy 2 < MI B > |
+// |
+// < MI A > V
+// ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~ ------------------------
+// :
+// ===================V====================
+// MBB copy 3 < MI B'>
+//
+//
+//
+//
+// ========================================
+// < Terminators >
+// ========================================
+int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
+ int MaxStallCycle = 0;
+ auto Range = getScheduleRange(Offset, SchedInstrNum);
+ for (auto &MI : Range) {
+ auto *SU = TripleDAG->getSUnit(&MI);
+ int DefCycle = getOriCycle(&MI);
+ for (auto &Succ : SU->Succs) {
+ if (Succ.getSUnit() == &TripleDAG->ExitSU)
+ continue;
+ // If the expected cycle does not exceed MaxCycle, no check is needed.
+ if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+ continue;
+ // If the cycle of the scheduled MI A is less than that of the scheduled
+ // MI B, the scheduling will fail because the lifetime of the
+ // corresponding register exceeds II.
+ auto *SuccMI = Succ.getSUnit()->getInstr();
+ int UseCycle = getOriCycle(SuccMI);
+ if (DefCycle < UseCycle)
+ return WindowIILimit;
+ // Get the stall cycle introduced by the register between two trips.
+ int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+ MaxStallCycle = std::max(MaxStallCycle, StallCycle);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
+ return MaxStallCycle;
+}
+
+unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
+ LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
+ int MaxCycle = calculateMaxCycle(DAG, Offset);
+ if (MaxCycle == (int)WindowIILimit)
+ return MaxCycle;
+ int StallCycle = calculateStallCycle(Offset, MaxCycle);
+ if (StallCycle == (int)WindowIILimit)
+ return StallCycle;
+ // The value of II is equal to the maximum execution cycle plus 1.
+ return MaxCycle + StallCycle + 1;
+}
+
+void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
+ LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
+ for (auto &Phi : MBB->phis()) {
+ int LateCycle = INT_MAX;
+ auto *SU = TripleDAG->getSUnit(&Phi);
+ for (auto &Succ : SU->Succs) {
+ // Phi doesn't have any Anti successors.
+ if (Succ.getKind() != SDep::Data)
+ continue;
+ // Phi is scheduled before the successor of stage 0. The issue cycle of
+ // phi is the latest cycle in this interval.
+ auto *SuccMI = Succ.getSUnit()->getInstr();
+ int Cycle = getOriCycle(SuccMI);
+ if (getOriStage(getOriMI(SuccMI), Offset) == 0)
+ LateCycle = std::min(LateCycle, Cycle);
+ }
+ // The anti-dependency of phi need to be handled separately in the same way.
+ if (Register AntiReg = getAntiRegister(&Phi)) {
+ auto *AntiMI = MRI->getVRegDef(AntiReg);
+ auto AntiCycle = getOriCycle(AntiMI);
+ if (getOriStage(getOriMI(AntiMI), Offset) == 0)
+ LateCycle = std::min(LateCycle, AntiCycle);
+ }
+ // If there is no limit to the late cycle, a default value is given.
+ if (LateCycle == INT_MAX)
+ LateCycle = (int)(II - 1);
+ LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] ");
+ LLVM_DEBUG(Phi.dump());
+ // The issue cycle of phi is set to the latest cycle in the interval.
+ auto *OriPhi = getOriMI(&Phi);
+ OriToCycle[OriPhi] = LateCycle;
+ }
+}
+
+DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
+ unsigned II) {
+ // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
+ // way is to put phi at the beginning of the current cycle.
+ DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
+ auto Range = getScheduleRange(Offset, SchedInstrNum);
+ for (auto &Phi : MBB->phis())
+ CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
+ for (auto &MI : Range)
+ CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
+ // Each MI is assigned a separate ordered Id, which is used as a sort marker
+ // in the following expand process.
+ DenseMap<MachineInstr *, int> IssueOrder;
+ int Id = 0;
+ for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
+ if (!CycleToMIs.count(Cycle))
+ continue;
+ for (auto *MI : CycleToMIs[Cycle])
+ IssueOrder[MI] = Id++;
+ }
+ return IssueOrder;
+}
+
+void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
+ // At the first update, Offset is equal to SchedPhiNum. At this time, only
+ // BestII, BestOffset, and BaseII need to be updated.
+ if (Offset == SchedPhiNum) {
+ BestII = II;
+ BestOffset = SchedPhiNum;
+ BaseII = II;
+ return;
+ }
+ // The update will only continue if the II is smaller than BestII and the II
+ // is sufficiently small.
+ if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
+ return;
+ BestII = II;
+ BestOffset = Offset;
+ // Record the result of the current list scheduling, noting that each MI is
+ // stored unordered in SchedResult.
+ SchedResult.clear();
+ auto IssueOrder = getIssueOrder(Offset, II);
+ for (auto &Pair : OriToCycle) {
+ assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
+ SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
+ getOriStage(Pair.first, Offset),
+ IssueOrder[Pair.first]));
+ }
+}
+
+void WindowScheduler::expand() {
+ // The MIs in the SchedResult are sorted by the issue order ID.
+ llvm::stable_sort(SchedResult,
+ [](const std::tuple<MachineInstr *, int, int, int> &A,
+ const std::tuple<MachineInstr *, int, int, int> &B) {
+ return std::get<3>(A) < std::get<3>(B);
+ });
+ // Use the scheduling infrastructure for expansion, noting that InstrChanges
+ // is not supported here.
+ DenseMap<MachineInstr *, int> Cycles, Stages;
+ std::vector<MachineInstr *> OrderedInsts;
+ for (auto &Info : SchedResult) {
+ auto *MI = std::get<0>(Info);
+ OrderedInsts.push_back(MI);
+ Cycles[MI] = std::get<1>(Info);
+ Stages[MI] = std::get<2>(Info);
+ LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
+ << "]: ");
+ LLVM_DEBUG(MI->dump());
----------------
huaatian wrote:
Updated
https://github.com/llvm/llvm-project/pull/84443
More information about the llvm-commits
mailing list