[llvm] [ModuloSchedule] Implement modulo variable expansion for pipelining (PR #65609)
Yuta Mukai via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 5 05:55:06 PST 2024
https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/65609
>From b9d9cbf54c347e7dcb7e31154f950ccb504809b7 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 5 Sep 2023 11:43:06 +0000
Subject: [PATCH 1/5] [ModuloSchedule] Implement modulo variable expansion for
pipelining
Modulo variable expansion is a technique that resolves overlap of
variable lifetimes by unrolling. The existing implementation solves
it by making a copy by move instruction for processors with ordinary
registers such as Arm and x86. This method may result in a very large
number of move instructions, which can cause performance problems.
Modulo variable expansion is enabled by specifying -pipeliner-mve-cg.
A backend must implement some newly defined interfaces in
PipelinerLoopInfo.
---
llvm/include/llvm/CodeGen/ModuloSchedule.h | 65 +++
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 24 +
llvm/lib/CodeGen/MachinePipeliner.cpp | 9 +
llvm/lib/CodeGen/ModuloSchedule.cpp | 615 ++++++++++++++++++++
4 files changed, 713 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index fd424163f0d19b..1a555728a09857 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -370,6 +370,71 @@ class PeelingModuloScheduleExpander {
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
};
+/// Expand the kernel using modulo variable expansion algorithm (MVE).
+/// It unrolls the kernel enough to avoid overlap of register lifetime.
+class ModuloScheduleExpanderMVE {
+private:
+ using ValueMapTy = DenseMap<unsigned, unsigned>; // original register# -> new register#
+ using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; // NOTE(review): not referenced in the visible part of this patch
+ using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; // NOTE(review): not referenced in the visible part of this patch
+
+ ModuloSchedule &Schedule;
+ MachineFunction &MF;
+ const TargetSubtargetInfo &ST;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII = nullptr;
+ LiveIntervals &LIS;
+
+ MachineBasicBlock *OrigKernel = nullptr; // the original loop block
+ MachineBasicBlock *OrigPreheader = nullptr; // originally the loop preheader
+ MachineBasicBlock *OrigExit = nullptr; // originally the loop exit
+ MachineBasicBlock *Check = nullptr; // tests whether the trip count allows pipelining
+ MachineBasicBlock *Prolog = nullptr; // all prolog stages
+ MachineBasicBlock *NewKernel = nullptr; // NumUnroll copies of the kernel
+ MachineBasicBlock *Epilog = nullptr; // all epilog stages
+ MachineBasicBlock *NewPreheader = nullptr; // new preheader of the original loop
+ MachineBasicBlock *NewExit = nullptr; // dedicated exit of the original loop (may be OrigExit itself)
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; // set in generatePipelinedLoop()
+
+ /// The number of unroll required to avoid overlap of live ranges.
+ /// NumUnroll = 1 means no unrolling. Assigned by calcNumUnroll().
+ int NumUnroll;
+
+ void calcNumUnroll();
+ void generatePipelinedLoop();
+ void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
+ void generatePhi(MachineInstr *OrigMI, int UnrollNum,
+ SmallVectorImpl<ValueMapTy> &PrologVRMap,
+ SmallVectorImpl<ValueMapTy> &KernelVRMap,
+ SmallVectorImpl<ValueMapTy> &PhiVRMap);
+ void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
+ SmallVectorImpl<ValueMapTy> &KernelVRMap);
+ void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
+ SmallVectorImpl<ValueMapTy> &EpilogVRMap);
+ void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
+
+ MachineInstr *cloneInstr(MachineInstr *OldMI);
+
+ void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef);
+
+ void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
+ unsigned UnrollNum,
+ SmallVectorImpl<ValueMapTy> &VRMapProlog,
+ SmallVectorImpl<ValueMapTy> &VRMapPhi); // NOTE(review): no definition appears in the visible part of this patch
+ void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum,
+ SmallVectorImpl<ValueMapTy> &CurVRMap,
+ SmallVectorImpl<ValueMapTy> *PrevVRMap);
+
+public:
+ ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
+ LiveIntervals &LIS)
+ : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
+ TII(ST.getInstrInfo()), LIS(LIS) {}
+
+ void expand(); // entry point: records the loop blocks and calls generatePipelinedLoop()
+ static bool canApply(MachineLoop &L); // true if L meets this expander's exit/phi constraints
+};
+
/// Expander that simply annotates each scheduled instruction with a post-instr
/// symbol that can be consumed by the ModuloScheduleTest pass.
///
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index e7787aafb98e2d..3aba6e1c4a2187 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -765,6 +765,20 @@ class TargetInstrInfo : public MCInstrInfo {
createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond) = 0;
+ /// Create a condtion to determine if the remaining trip count represented
+ /// by the loop counter CounterReg is greater than TC. Some instructions
+ /// such as comparisons may be inserted at the bottom of MBB. CounterReg
+ /// must be accessible there.
+ ///
+ /// The definition of the return value is the same as for the variant above.
+ virtual std::optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ Register CounterReg) {
+ llvm_unreachable(
+ "Target didn't implement createTripCountGreaterCondition");
+ }
+
/// Modify the loop such that the trip count is
/// OriginalTC + TripCountAdjust.
virtual void adjustTripCount(int TripCountAdjust) = 0;
@@ -778,6 +792,16 @@ class TargetInstrInfo : public MCInstrInfo {
/// Once this function is called, no other functions on this object are
/// valid; the loop has been removed.
virtual void disposed() = 0;
+
+ /// Return the initial value of the loop counter.
+ virtual Register getCounterInitReg() {
+ llvm_unreachable("Target didn't implement getCounterInitReg");
+ }
+
+ /// Return the updated value of the loop counter in the original loop.
+ virtual Register getCounterUpdatedReg() {
+ llvm_unreachable("Target didn't implement getCounterUpdatedReg");
+ }
};
/// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d8cb6816883394..376384dd3e36a1 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -192,6 +192,10 @@ static cl::opt<int>
cl::desc("Margin representing the unused percentage of "
"the register pressure limit"));
+static cl::opt<bool>
+ MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
+ cl::desc("Use the MVE code generator for software pipelining"));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -677,6 +681,11 @@ void SwingSchedulerDAG::schedule() {
if (ExperimentalCodeGen && NewInstrChanges.empty()) {
PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
MSE.expand();
+ }
+ if (MVECodeGen && NewInstrChanges.empty() &&
+ ModuloScheduleExpanderMVE::canApply(Loop)) {
+ ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
+ MSE.expand();
} else {
ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
MSE.expand();
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0bef513342ff12..e09fbd02df14e9 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2096,6 +2096,621 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
MSE.cleanup();
}
+MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) { // returns a copy of OldMI that is not yet inserted in any block
+ MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+
+ // TODO: Offset information needs to be corrected.
+ NewMI->dropMemRefs(MF); // the clone carries no memory operands until the TODO above is addressed
+
+ return NewMI;
+}
+
+/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
+/// If it is already dedicated exit, return it. Otherwise, insert a new
+/// block between them and return the new block.
+static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
+ MachineBasicBlock *Exit) {
+ if (Exit->pred_size() == 1) // a single predecessor is taken to mean Exit is already dedicated
+ return Exit;
+
+ MachineFunction *MF = Loop->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ MachineBasicBlock *NewExit =
+ MF->CreateMachineBasicBlock(Loop->getBasicBlock());
+ MF->insert(Loop->getIterator(), NewExit); // NewExit is placed before Loop in the function's block list
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ TII->analyzeBranch(*Loop, TBB, FBB, Cond); // NOTE(review): the return value is not checked here
+ if (TBB == Loop) // back edge is the taken target, so the other target becomes NewExit
+ FBB = NewExit;
+ else if (FBB == Loop) // back edge is the false target, so the taken target becomes NewExit
+ TBB = NewExit;
+ else
+ llvm_unreachable("unexpected loop structure");
+ TII->removeBranch(*Loop);
+ TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
+ Loop->removeSuccessor(Exit);
+ Loop->addSuccessor(NewExit);
+ TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
+ NewExit->addSuccessor(Exit);
+
+ for (MachineInstr &Phi : Exit->phis()) { // phi inputs that named Loop as predecessor now name NewExit
+ for (MachineOperand &MO : Phi.operands())
+ if (MO.isMBB() && MO.getMBB() == Loop)
+ MO.setMBB(NewExit);
+ }
+
+ return NewExit;
+}
+
+/// Generate a pipelined loop that is unrolled by using MVE algorithm and any
+/// other necessary blocks. The control flow is modified to execute the
+/// pipelined loop if the trip count satisfies the condition, otherwise the
+/// original loop. The original loop is also used to execute the remainder
+/// iterations which occur due to unrolling.
+void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
+ // The control flow for pipelining with MVE:
+ //
+ // OrigPreheader:
+ // // The block that is originally the loop preheader
+ // goto Check
+ //
+ // Check:
+ // // Check whether the trip count satisfies the requirements to pipeline.
+ // if (LoopCounter > NumStages + NumUnroll - 2)
+ // // The minimum number of iterations to pipeline =
+ // // iterations executed in prolog/epilog (NumStages-1) +
+ // // iterations executed in one kernel run (NumUnroll)
+ // goto Prolog
+ // // fallback to the original loop
+ // goto NewPreheader
+ //
+ // Prolog:
+ // // All prolog stages. There are no direct branches to the epilogue.
+ // goto NewKernel
+ //
+ // NewKernel:
+ // // NumUnroll copies of the kernel
+ // if (LoopCounter > MVE-1)
+ // goto NewKernel
+ // goto Epilog
+ //
+ // Epilog:
+ // // All epilog stages.
+ // if (LoopCounter > 0)
+ // // The remainder is executed in the original loop
+ // goto NewPreheader
+ // goto NewExit
+ //
+ // NewPreheader:
+ // // Newly created preheader for the original loop.
+ // // The initial values of the phis in the loop are merged from two paths.
+ // NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
+ // goto OrigKernel
+ //
+ // OrigKernel:
+ // // The original loop block.
+ // if (LoopCounter != 0)
+ // goto OrigKernel
+ // goto NewExit
+ //
+ // NewExit:
+ // // Newly created dedicated exit for the original loop.
+ // // Merge values which are referenced after the loop
+ // Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
+ // goto OrigExit
+ //
+ // OrigExit:
+ // // The block that is originally the loop exit.
+ // // If it is already dedicated exit, NewExit is not created.
+
+ // An example of where each stage is executed:
+ // Assume #Stages 3, #MVE 4, #Iterations 12
+ // Iter 0 1 2 3 4 5 6 7 8 9 10-11
+ // -------------------------------------------------
+ // Stage 0 Prolog#0
+ // Stage 1 0 Prolog#1
+ // Stage 2 1 0 Kernel Unroll#0 Iter#0
+ // Stage 2 1 0 Kernel Unroll#1 Iter#0
+ // Stage 2 1 0 Kernel Unroll#2 Iter#0
+ // Stage 2 1 0 Kernel Unroll#3 Iter#0
+ // Stage 2 1 0 Kernel Unroll#0 Iter#1
+ // Stage 2 1 0 Kernel Unroll#1 Iter#1
+ // Stage 2 1 0 Kernel Unroll#2 Iter#1
+ // Stage 2 1 0 Kernel Unroll#3 Iter#1
+ // Stage 2 1 Epilog#0
+ // Stage 2 Epilog#1
+ // Stage 0-2 OrigKernel
+
+ LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
+ assert(LoopInfo && "Must be able to analyze loop!");
+
+ calcNumUnroll(); // sets NumUnroll, which the Check threshold and the kernel generation below rely on
+
+ Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+ Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+ NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+ Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+ NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
+
+ MF.insert(OrigKernel->getIterator(), Check); // the five new blocks are laid out, in this order, right before OrigKernel
+ MF.insert(OrigKernel->getIterator(), Prolog);
+ MF.insert(OrigKernel->getIterator(), NewKernel);
+ MF.insert(OrigKernel->getIterator(), Epilog);
+ MF.insert(OrigKernel->getIterator(), NewPreheader);
+
+ NewExit = createDedicatedExit(OrigKernel, OrigExit); // may simply be OrigExit
+
+ NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader); // NewPreheader takes over OrigPreheader's successors
+ TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
+
+ OrigPreheader->addSuccessor(Check); // OrigPreheader now only leads to Check
+ TII->removeBranch(*OrigPreheader);
+ TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
+
+ Check->addSuccessor(Prolog);
+ Check->addSuccessor(NewPreheader);
+
+ Prolog->addSuccessor(NewKernel);
+
+ NewKernel->addSuccessor(NewKernel);
+ NewKernel->addSuccessor(Epilog);
+
+ Epilog->addSuccessor(NewPreheader);
+ Epilog->addSuccessor(NewExit);
+
+ SmallVector<MachineOperand, 4> Cond;
+ LoopInfo->createTripCountGreaterCondition(
+ Schedule.getNumStages() + NumUnroll - 2, *Check, Cond,
+ LoopInfo->getCounterInitReg());
+ TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc()); // condition true -> Prolog, otherwise fall back to the original loop
+
+ // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
+ // register#
+ SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
+ generateProlog(PrologVRMap);
+ generateKernel(PrologVRMap, KernelVRMap);
+ generateEpilog(KernelVRMap, EpilogVRMap);
+}
+
+/// Replace MI's use operands according to the maps. StageNum is MI's stage in the schedule; PhaseNum is the prolog/unroll/epilog index MI was cloned for.
+void ModuloScheduleExpanderMVE::updateInstrUse(
+ MachineInstr *MI, int StageNum, int PhaseNum,
+ SmallVectorImpl<ValueMapTy> &CurVRMap,
+ SmallVectorImpl<ValueMapTy> *PrevVRMap) {
+ // If MI is in the prolog/kernel/epilog block, CurVRMap is
+ // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
+ // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
+ // Refer to the appropriate map according to the stage difference between
+ // MI and the definition of an operand.
+
+ for (MachineOperand &UseMO : MI->uses()) {
+ if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
+ continue;
+ int DiffStage = 0;
+ Register OrigReg = UseMO.getReg();
+ MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
+ if (!DefInst || DefInst->getParent() != OrigKernel) // operands defined outside the original loop are left as they are
+ continue;
+ unsigned InitReg = 0; // stays 0 unless the operand is defined by a phi
+ unsigned DefReg = OrigReg;
+ if (DefInst->isPHI()) {
+ ++DiffStage; // going through a phi counts as one extra stage of distance
+ unsigned LoopReg;
+ getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
+ // LoopReg is guaranteed to be defined within the loop by canApply()
+ DefReg = LoopReg;
+ DefInst = MRI.getVRegDef(LoopReg);
+ }
+ unsigned DefStageNum = Schedule.getStage(DefInst);
+ DiffStage += StageNum - DefStageNum;
+ Register NewReg;
+ if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
+ // NewReg is defined in a previous phase of the same block
+ NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
+ else if (!PrevVRMap)
+ // Since this is the first iteration, refer the initial register of the
+ // loop
+ NewReg = InitReg;
+ else
+ // Cases where DiffStage is larger than PhaseNum.
+ // If MI is in the kernel block, the value is defined by the previous
+ // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
+ // value is defined in the kernel block and KernelVRMap is referenced.
+ NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg]; // NOTE(review): assumes DiffStage - PhaseNum <= PrevVRMap->size(); confirm
+
+ const TargetRegisterClass *NRC =
+ MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
+ if (NRC)
+ UseMO.setReg(NewReg);
+ else {
+ Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg)); // classes could not be reconciled: go through a COPY into OrigReg's class
+ BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), // NOTE(review): built into *OrigKernel although callers pass MIs from Prolog/NewKernel/Epilog; confirm the insertion block
+ SplitReg)
+ .addReg(NewReg);
+ UseMO.setReg(SplitReg);
+ }
+ }
+}
+
+/// Return the phi in Loop whose loop-carried source value is Reg, or nullptr.
+/// canApply() guarantees that at most only one such phi exists.
+static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
+ for (MachineInstr &Phi : Loop->phis()) {
+ unsigned InitVal, LoopVal; // InitVal is fetched by getPhiRegs but only LoopVal is compared
+ getPhiRegs(Phi, Loop, InitVal, LoopVal);
+ if (LoopVal == Reg)
+ return &Phi; // restored: the archive had mis-decoded the "&Phi;" entity into a Greek letter
+ }
+ return nullptr;
+}
+
+/// Generate phis for registers defined by OrigMI. The phis go at the top of NewKernel and are recorded in PhiVRMap[UnrollNum].
+void ModuloScheduleExpanderMVE::generatePhi(
+ MachineInstr *OrigMI, int UnrollNum,
+ SmallVectorImpl<ValueMapTy> &PrologVRMap,
+ SmallVectorImpl<ValueMapTy> &KernelVRMap,
+ SmallVectorImpl<ValueMapTy> &PhiVRMap) {
+ int StageNum = Schedule.getStage(OrigMI);
+ bool UsePrologReg;
+ if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
+ UsePrologReg = true; // a matching definition exists in the prolog ("a/b" in the tables below)
+ else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
+ UsePrologReg = false; // merge with the initial value of the loop phi ("+" below)
+ else
+ return; // no phi required ("*" below)
+
+ // Examples that show which stages are merged by phi.
+ // Meaning of the symbol following the stage number:
+ // a/b: Stages with the same letter are merged (UsePrologReg == true)
+ // +: Merged with the initial value (UsePrologReg == false)
+ // *: No phis required
+ //
+ // #Stages 3, #MVE 4
+ // Iter 0 1 2 3 4 5 6 7 8
+ // -----------------------------------------
+ // Stage 0a Prolog#0
+ // Stage 1a 0b Prolog#1
+ // Stage 2* 1* 0* Kernel Unroll#0
+ // Stage 2* 1* 0+ Kernel Unroll#1
+ // Stage 2* 1+ 0a Kernel Unroll#2
+ // Stage 2+ 1a 0b Kernel Unroll#3
+ //
+ // #Stages 3, #MVE 2
+ // Iter 0 1 2 3 4 5 6 7 8
+ // -----------------------------------------
+ // Stage 0a Prolog#0
+ // Stage 1a 0b Prolog#1
+ // Stage 2* 1+ 0a Kernel Unroll#0
+ // Stage 2+ 1a 0b Kernel Unroll#1
+ //
+ // #Stages 3, #MVE 1
+ // Iter 0 1 2 3 4 5 6 7 8
+ // -----------------------------------------
+ // Stage 0* Prolog#0
+ // Stage 1a 0b Prolog#1
+ // Stage 2+ 1a 0b Kernel Unroll#0
+
+ for (MachineOperand &DefMO : OrigMI->defs()) {
+ if (!DefMO.isReg())
+ continue;
+ Register OrigReg = DefMO.getReg();
+ auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
+ if (NewReg == KernelVRMap[UnrollNum].end()) // defs that updateInstrDef did not remap for this unroll copy get no phi
+ continue;
+ Register CorrespondReg;
+ if (UsePrologReg) {
+ int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
+ CorrespondReg = PrologVRMap[PrologNum][OrigReg];
+ } else {
+ MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
+ if (!Phi) // OrigReg is not loop-carried, so there is no initial value to merge with
+ continue;
+ CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
+ }
+
+ assert(CorrespondReg.isValid());
+ Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+ BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(NewReg->second) // value arriving over the NewKernel back edge
+ .addMBB(NewKernel)
+ .addReg(CorrespondReg) // value arriving from Prolog
+ .addMBB(Prolog);
+ PhiVRMap[UnrollNum][OrigReg] = PhiReg;
+ }
+}
+
+static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
+ MachineBasicBlock *NewMBB) { // rewrites the first incoming (register, block) pair of Phi whose register is OrigReg
+ for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) { // starts at operand 1 and steps over (register, block) pairs
+ if (Phi.getOperand(Idx).getReg() == OrigReg) {
+ Phi.getOperand(Idx).setReg(NewReg);
+ Phi.getOperand(Idx + 1).setMBB(NewMBB);
+ return; // only the first match is rewritten
+ }
+ }
+}
+
+/// Generate phis that merge values from multiple routes. OrigReg is a register of the original loop; NewReg is its counterpart produced by the pipelined code.
+void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
+ Register NewReg) {
+ SmallVector<MachineOperand *> UsesAfterLoop; // uses outside OrigKernel/Prolog/NewKernel/Epilog
+ SmallVector<MachineInstr *> LoopPhis; // phis in OrigKernel that use OrigReg
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
+ E = MRI.use_end();
+ I != E; ++I) { // uses are only collected here; they are rewritten after the walk
+ MachineOperand &O = *I;
+ if (O.getParent()->getParent() != OrigKernel &&
+ O.getParent()->getParent() != Prolog &&
+ O.getParent()->getParent() != NewKernel &&
+ O.getParent()->getParent() != Epilog)
+ UsesAfterLoop.push_back(&O);
+ if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
+ LoopPhis.push_back(O.getParent());
+ }
+
+ // Merge the route that only execute the pipelined loop (when there are no
+ // remaining iterations) with the route that execute the original loop.
+ if (!UsesAfterLoop.empty()) {
+ Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+ BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(OrigReg)
+ .addMBB(OrigKernel)
+ .addReg(NewReg)
+ .addMBB(Epilog);
+
+ for (MachineOperand *MO : UsesAfterLoop)
+ MO->setReg(PhiReg);
+
+ if (!LIS.hasInterval(PhiReg))
+ LIS.createEmptyInterval(PhiReg);
+ }
+
+ // Merge routes from the pipelined loop and the bypassed route before the
+ // original loop
+ if (!LoopPhis.empty()) {
+ for (MachineInstr *Phi : LoopPhis) {
+ unsigned InitReg, LoopReg;
+ getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
+ Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg)); // NOTE(review): unlike PhiReg above, no LiveInterval is created for NewInit; confirm intended
+ BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewInit)
+ .addReg(InitReg)
+ .addMBB(Check)
+ .addReg(NewReg)
+ .addMBB(Epilog);
+ replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader); // the original loop phi now takes its initial value from NewPreheader
+ }
+ }
+}
+
+void ModuloScheduleExpanderMVE::generateProlog(
+ SmallVectorImpl<ValueMapTy> &PrologVRMap) { // fills Prolog with NumStages-1 phases; PrologVRMap[P] receives phase P's register renames
+ PrologVRMap.clear();
+ PrologVRMap.resize(Schedule.getNumStages() - 1);
+ DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; // clone -> {PrologNum, StageNum}
+ for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
+ ++PrologNum) {
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ int StageNum = Schedule.getStage(MI);
+ if (StageNum > PrologNum) // prolog phase P only contains stages 0..P
+ continue;
+ MachineInstr *NewMI = cloneInstr(MI);
+ updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
+ NewMIMap[NewMI] = {PrologNum, StageNum};
+ Prolog->push_back(NewMI);
+ }
+ }
+
+ for (auto I : NewMIMap) { // uses are rewritten in a second pass, once every phase's defs are in PrologVRMap. NOTE(review): DenseMap order over pointer keys is not stable and updateInstrUse may create registers; confirm this cannot affect output
+ MachineInstr *MI = I.first;
+ int PrologNum = I.second.first;
+ int StageNum = I.second.second;
+ updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "prolog:\n";
+ Prolog->dump();
+ });
+}
+
+void ModuloScheduleExpanderMVE::generateKernel(
+ SmallVectorImpl<ValueMapTy> &PrologVRMap,
+ SmallVectorImpl<ValueMapTy> &KernelVRMap) { // fills NewKernel with NumUnroll copies of the loop body; KernelVRMap[U] receives copy U's register renames
+ KernelVRMap.clear();
+ KernelVRMap.resize(NumUnroll);
+ SmallVector<ValueMapTy> PhiVRMap; // per unroll copy: original register# -> register defined by a NewKernel phi
+ PhiVRMap.resize(NumUnroll);
+ DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; // clone -> {UnrollNum, StageNum}
+ for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ int StageNum = Schedule.getStage(MI);
+ MachineInstr *NewMI = cloneInstr(MI);
+ updateInstrDef(NewMI, KernelVRMap[UnrollNum],
+ (UnrollNum == NumUnroll - 1 && StageNum == 0)); // LastDef only for stage-0 instructions of the last unroll copy
+ generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap); // runs after updateInstrDef so KernelVRMap[UnrollNum] already holds MI's defs
+ NewMIMap[NewMI] = {UnrollNum, StageNum};
+ NewKernel->push_back(NewMI);
+ }
+ }
+
+ for (auto I : NewMIMap) { // uses are rewritten in a second pass, after all defs and phis exist. NOTE(review): DenseMap order over pointer keys is not stable and updateInstrUse may create registers; confirm this cannot affect output
+ MachineInstr *MI = I.first;
+ int UnrollNum = I.second.first;
+ int StageNum = I.second.second;
+ updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
+ }
+
+ // If remaining trip count is greater than NumUnroll-1, loop continues
+ SmallVector<MachineOperand, 4> Cond;
+ LoopInfo->createTripCountGreaterCondition(
+ NumUnroll - 1, *NewKernel, Cond,
+ KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+ TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
+
+ LLVM_DEBUG({
+ dbgs() << "kernel:\n";
+ NewKernel->dump();
+ });
+}
+
+void ModuloScheduleExpanderMVE::generateEpilog(
+ SmallVectorImpl<ValueMapTy> &KernelVRMap,
+ SmallVectorImpl<ValueMapTy> &EpilogVRMap) { // fills Epilog with NumStages-1 phases; EpilogVRMap[E] receives phase E's register renames
+ EpilogVRMap.clear();
+ EpilogVRMap.resize(Schedule.getNumStages() - 1);
+ DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap; // clone -> {EpilogNum, StageNum}
+ for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
+ ++EpilogNum) {
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ int StageNum = Schedule.getStage(MI);
+ if (StageNum <= EpilogNum) // epilog phase E only contains stages above E
+ continue;
+ MachineInstr *NewMI = cloneInstr(MI);
+ updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum); // LastDef when StageNum - 1 == EpilogNum
+ NewMIMap[NewMI] = {EpilogNum, StageNum};
+ Epilog->push_back(NewMI);
+ }
+ }
+
+ for (auto I : NewMIMap) { // uses are rewritten in a second pass; values from earlier iterations come from KernelVRMap. NOTE(review): DenseMap order over pointer keys is not stable and updateInstrUse may create registers; confirm this cannot affect output
+ MachineInstr *MI = I.first;
+ int EpilogNum = I.second.first;
+ int StageNum = I.second.second;
+ updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
+ }
+
+ // If there are remaining iterations, they are executed in the original loop
+ SmallVector<MachineOperand, 4> Cond;
+ LoopInfo->createTripCountGreaterCondition(
+ 0, *Epilog, Cond,
+ KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+ TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
+
+ LLVM_DEBUG({
+ dbgs() << "epilog:\n";
+ Epilog->dump();
+ });
+}
+
+/// Calculate the number of unroll required and set it to NumUnroll (the maximum, over all in-loop def/use pairs, of the per-pair requirement; at least 1)
+void ModuloScheduleExpanderMVE::calcNumUnroll() {
+ DenseMap<MachineInstr *, unsigned> Inst2Idx; // instruction -> its position in Schedule.getInstructions()
+ NumUnroll = 1;
+ for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
+ Inst2Idx[Schedule.getInstructions()[I]] = I;
+
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ int StageNum = Schedule.getStage(MI);
+ for (const MachineOperand &MO : MI->uses()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ MachineInstr *DefMI = MRI.getVRegDef(MO.getReg()); // NOTE(review): not null-checked here, unlike the same lookup in updateInstrUse
+ if (DefMI->getParent() != OrigKernel)
+ continue;
+
+ int NumUnrollLocal = 1;
+ if (DefMI->isPHI()) {
+ ++NumUnrollLocal; // a value read through a phi comes from the previous iteration
+ // canApply() guarantees that the phi's loop-carried source is defined
+ // by a non-phi instruction in the loop
+ DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
+ }
+ NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
+ if (Inst2Idx[MI] <= Inst2Idx[DefMI]) // the use does not come after the def in schedule order: one copy fewer is needed
+ --NumUnrollLocal;
+ NumUnroll = std::max(NumUnroll, NumUnrollLocal);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
+}
+
+/// Create new virtual registers for definitions of NewMI and update NewMI.
+/// If the definitions are referenced after the pipelined loop, phis are
+/// created to merge with other routes (done only when LastDef is true).
+void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
+ ValueMapTy &VRMap,
+ bool LastDef) {
+ for (MachineOperand &MO : NewMI->operands()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef()) // only virtual-register defs are renamed
+ continue;
+ Register Reg = MO.getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ Register NewReg = MRI.createVirtualRegister(RC); // same register class as the original
+ MO.setReg(NewReg);
+ VRMap[Reg] = NewReg; // record original -> new for later use rewriting
+ if (LastDef)
+ mergeRegUsesAfterPipeline(Reg, NewReg);
+ }
+}
+
+void ModuloScheduleExpanderMVE::expand() { // entry point: records the loop's blocks, then builds the pipelined loop
+ OrigKernel = Schedule.getLoop()->getTopBlock();
+ OrigPreheader = Schedule.getLoop()->getLoopPreheader();
+ OrigExit = Schedule.getLoop()->getExitBlock(); // canApply() rejects loops for which this is null
+
+ LLVM_DEBUG(Schedule.dump());
+
+ generatePipelinedLoop();
+}
+
+/// Check if ModuloScheduleExpanderMVE can be applied to L
+bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
+ if (!L.getExitBlock()) {
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ return false;
+ }
+
+ MachineBasicBlock *BB = L.getTopBlock(); // the phis of this block are the ones checked below
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+ // Put some constraints on the operands of the phis to simplify the
+ // transformation
+ DenseSet<unsigned> UsedByPhi; // loop-carried values already seen as a phi source
+ for (MachineInstr &MI : BB->phis()) {
+ // Registers defined by phis must be used only inside the loop and must
+ // never be used by phis.
+ for (MachineOperand &MO : MI.defs())
+ if (MO.isReg())
+ for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
+ if (Ref.getParent() != BB || Ref.isPHI()) {
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ return false;
+ }
+
+ // A source register from the loop block must be defined inside the loop.
+ // A register defined inside the loop must be referenced by only one phi at
+ // most. NOTE(review): MRI.getVRegDef(LoopVal) below is dereferenced unchecked.
+ unsigned InitVal, LoopVal;
+ getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
+ if (!Register(LoopVal).isVirtual() ||
+ MRI.getVRegDef(LoopVal)->getParent() != BB) {
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ return false;
+ }
+ if (UsedByPhi.count(LoopVal)) {
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ return false;
+ }
+ UsedByPhi.insert(LoopVal);
+ }
+
+ return true; // every phi satisfied the constraints above
+}
+
//===----------------------------------------------------------------------===//
// ModuloScheduleTestPass implementation
//===----------------------------------------------------------------------===//
>From 545b8947f754f0e6bfcc140a10f3f77b1f6fd801 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 05:19:58 +0000
Subject: [PATCH 2/5] fixup! [ModuloSchedule] Implement modulo variable
expansion for pipelining
---
llvm/lib/CodeGen/ModuloSchedule.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index e09fbd02df14e9..2ff71945f07188 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2670,7 +2670,7 @@ void ModuloScheduleExpanderMVE::expand() {
/// Check if ModuloScheduleExpanderMVE can be applied to L
bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
if (!L.getExitBlock()) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block\n";);
return false;
}
@@ -2687,7 +2687,8 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
if (MO.isReg())
for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
if (Ref.getParent() != BB || Ref.isPHI()) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
+ "referenced outside of the loop or by phi.";);
return false;
}
@@ -2698,11 +2699,14 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
if (!Register(LoopVal).isVirtual() ||
MRI.getVRegDef(LoopVal)->getParent() != BB) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ LLVM_DEBUG(
+ dbgs() << "Can not apply MVE expander: A phi source value coming "
+ "from the loop is not defined in the loop.\n";);
return false;
}
if (UsedByPhi.count(LoopVal)) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander\n";);
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
+ "loop is referenced by two or more phis.\n";);
return false;
}
UsedByPhi.insert(LoopVal);
>From b9d203d55e0a2762441a71224d0a957ef7b46962 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 13:56:28 +0000
Subject: [PATCH 3/5] fixup! [ModuloSchedule] Implement modulo variable
expansion for pipelining
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 35 ++++++++------------
llvm/lib/CodeGen/MachinePipeliner.cpp | 6 ++--
llvm/lib/CodeGen/ModuloSchedule.cpp | 20 +++++------
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 9 +++++
llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 9 +++++
llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 9 +++++
6 files changed, 54 insertions(+), 34 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 3aba6e1c4a2187..45854f214abcc3 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -765,19 +765,18 @@ class TargetInstrInfo : public MCInstrInfo {
createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond) = 0;
- /// Create a condtion to determine if the remaining trip count represented
- /// by the loop counter CounterReg is greater than TC. Some instructions
- /// such as comparisons may be inserted at the bottom of MBB. CounterReg
- /// must be accessible there.
+ /// Create a condition to determine if the remaining trip count for a phase
+ /// is greater than TC. Some instructions such as comparisons may be
+ /// inserted at the bottom of MBB. All the instructions expanded for the
+ /// phase must be inserted in MBB before calling this function. RegMap is
+ /// the map from the original registers to the expanded registers for the
+ /// phase.
///
- /// The definition of the return value is the same as for the variant above.
- virtual std::optional<bool>
- createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
- SmallVectorImpl<MachineOperand> &Cond,
- Register CounterReg) {
- llvm_unreachable(
- "Target didn't implement createTripCountGreaterCondition");
- }
+ /// MBB can also be a predecessor of the prologue block. Then RegMap must be
+ /// empty and the compared value is the initial value of the trip count.
+ virtual void createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<unsigned, unsigned> RegMap) = 0;
/// Modify the loop such that the trip count is
/// OriginalTC + TripCountAdjust.
@@ -793,15 +792,9 @@ class TargetInstrInfo : public MCInstrInfo {
/// valid; the loop has been removed.
virtual void disposed() = 0;
- /// Return the initial value of the loop counter.
- virtual Register getCounterInitReg() {
- llvm_unreachable("Target didn't implement getCounterInitReg");
- }
-
- /// Return the updated value of the loop counter in the original loop.
- virtual Register getCounterUpdatedReg() {
- llvm_unreachable("Target didn't implement getCounterUpdatedReg");
- }
+ /// Return true if the target can expand pipelined schedule with modulo
+ /// variable expansion.
+ virtual bool isMVEExpanderSupported() = 0;
};
/// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 376384dd3e36a1..cf205b277c92b3 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -681,9 +681,9 @@ void SwingSchedulerDAG::schedule() {
if (ExperimentalCodeGen && NewInstrChanges.empty()) {
PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
MSE.expand();
- }
- if (MVECodeGen && NewInstrChanges.empty() &&
- ModuloScheduleExpanderMVE::canApply(Loop)) {
+ } else if (MVECodeGen && NewInstrChanges.empty() &&
+ LoopPipelinerInfo->isMVEExpanderSupported() &&
+ ModuloScheduleExpanderMVE::canApply(Loop)) {
ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
MSE.expand();
} else {
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 2ff71945f07188..6a2d141e8f0f7c 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2262,9 +2262,8 @@ void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
Epilog->addSuccessor(NewExit);
SmallVector<MachineOperand, 4> Cond;
- LoopInfo->createTripCountGreaterCondition(
- Schedule.getNumStages() + NumUnroll - 2, *Check, Cond,
- LoopInfo->getCounterInitReg());
+ LoopInfo->createRemainingIterationsGreaterCondition(
+ Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, ValueMapTy());
TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
// VRMaps map (prolog/kernel/epilog phase#, original register#) to new
@@ -2552,9 +2551,8 @@ void ModuloScheduleExpanderMVE::generateKernel(
// If remaining trip count is greater than NumUnroll-1, loop continues
SmallVector<MachineOperand, 4> Cond;
- LoopInfo->createTripCountGreaterCondition(
- NumUnroll - 1, *NewKernel, Cond,
- KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+ LoopInfo->createRemainingIterationsGreaterCondition(
+ NumUnroll - 1, *NewKernel, Cond, KernelVRMap[NumUnroll - 1]);
TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
LLVM_DEBUG({
@@ -2591,11 +2589,13 @@ void ModuloScheduleExpanderMVE::generateEpilog(
updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
}
- // If there are remaining iterations, they are executed in the original loop
+ // If there are remaining iterations, they are executed in the original loop.
+ // Instructions related to loop control, such as loop counter comparison,
+ // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
+ // in stage 0. Thus, the map is for the last one in the kernel.
SmallVector<MachineOperand, 4> Cond;
- LoopInfo->createTripCountGreaterCondition(
- 0, *Epilog, Cond,
- KernelVRMap[NumUnroll - 1][LoopInfo->getCounterUpdatedReg()]);
+ LoopInfo->createRemainingIterationsGreaterCondition(
+ 0, *Epilog, Cond, KernelVRMap[NumUnroll - 1]);
TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
LLVM_DEBUG({
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index dd63ca17e5b9f1..f3dd3de4dcbe58 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6823,11 +6823,20 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
llvm_unreachable("Unknown EndLoop");
}
+ void createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<unsigned, unsigned> RegMap) override {
+ llvm_unreachable(
+ "Target didn't implement createRemainingIterationsGreaterCondition");
+ }
+
void setPreheader(MachineBasicBlock *NewPreheader) override {}
void adjustTripCount(int TripCountAdjust) override {}
void disposed() override {}
+
+ bool isMVEExpanderSupported() override { return false; }
};
void ARMPipelinerLoopInfo::bumpCrossIterationPressure(RegPressureTracker &RPT,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 619c7dc69f9b27..ab2a11bb210b5b 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -770,6 +770,13 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
return TripCount > TC;
}
+ void createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<unsigned, unsigned> RegMap) override {
+ llvm_unreachable(
+ "Target didn't implement createRemainingIterationsGreaterCondition");
+ }
+
void setPreheader(MachineBasicBlock *NewPreheader) override {
NewPreheader->splice(NewPreheader->getFirstTerminator(), Loop->getParent(),
Loop);
@@ -798,6 +805,8 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
}
void disposed() override { Loop->eraseFromParent(); }
+
+ bool isMVEExpanderSupported() override { return false; }
};
} // namespace
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 1c610b269d32d8..2824ce945b00bb 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -5439,6 +5439,13 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
return TripCount > TC;
}
+ void createRemainingIterationsGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
+ DenseMap<unsigned, unsigned> RegMap) override {
+ llvm_unreachable(
+ "Target didn't implement createRemainingIterationsGreaterCondition");
+ }
+
void setPreheader(MachineBasicBlock *NewPreheader) override {
// Do nothing. We want the LOOP setup instruction to stay in the *old*
// preheader, so we can use BDZ in the prologs to adapt the loop trip count.
@@ -5463,6 +5470,8 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
// Ensure the loop setup instruction is deleted too.
LoopCount->eraseFromParent();
}
+
+ bool isMVEExpanderSupported() override { return false; }
};
} // namespace
>From cf258e90934b9ff4e641a17fc0ac75d290319ee0 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 22 Sep 2023 14:38:57 +0000
Subject: [PATCH 4/5] fixup! [ModuloSchedule] Implement modulo variable
expansion for pipelining
---
llvm/lib/CodeGen/ModuloSchedule.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 6a2d141e8f0f7c..acec68a56a6faa 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2670,7 +2670,7 @@ void ModuloScheduleExpanderMVE::expand() {
/// Check if ModuloScheduleExpanderMVE can be applied to L
bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
if (!L.getExitBlock()) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block\n";);
+ LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n";);
return false;
}
@@ -2688,7 +2688,7 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
if (Ref.getParent() != BB || Ref.isPHI()) {
LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
- "referenced outside of the loop or by phi.";);
+ "referenced outside of the loop or by phi.\n";);
return false;
}
>From 921c493f80f371a50a8281365077278d1509133d Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 5 Mar 2024 22:40:25 +0900
Subject: [PATCH 5/5] fixup! [ModuloSchedule] Implement modulo variable
expansion for pipelining
---
llvm/include/llvm/CodeGen/ModuloSchedule.h | 6 ++--
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 15 ++++++----
llvm/lib/CodeGen/ModuloSchedule.cpp | 30 ++++++++++++--------
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 2 +-
llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 2 +-
llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 2 +-
6 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 1a555728a09857..0e6fd2dabc64f1 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -408,9 +408,11 @@ class ModuloScheduleExpanderMVE {
SmallVectorImpl<ValueMapTy> &KernelVRMap,
SmallVectorImpl<ValueMapTy> &PhiVRMap);
void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
- SmallVectorImpl<ValueMapTy> &KernelVRMap);
+ SmallVectorImpl<ValueMapTy> &KernelVRMap,
+ InstrMapTy &LastStage0Insts);
void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
- SmallVectorImpl<ValueMapTy> &EpilogVRMap);
+ SmallVectorImpl<ValueMapTy> &EpilogVRMap,
+ InstrMapTy &LastStage0Insts);
void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
MachineInstr *cloneInstr(MachineInstr *OldMI);
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 45854f214abcc3..607d806fa88553 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -768,15 +768,18 @@ class TargetInstrInfo : public MCInstrInfo {
/// Create a condition to determine if the remaining trip count for a phase
/// is greater than TC. Some instructions such as comparisons may be
/// inserted at the bottom of MBB. The all instructions expanded for the
- /// phase must be inserted in MBB before calling this function. RegMap is
- /// the map from the original registers to the expanded registers for the
- /// phase.
+ /// phase must be inserted in MBB before calling this function.
+ /// LastStage0Insts is the map from the original instructions scheduled at
+ /// stage#0 to the expanded instructions for the last iteration of the
+ /// kernel. LastStage0Insts is intended to obtain the instruction that
+ /// refers to the latest loop counter value.
///
- /// MBB can also be a predecessor of the prologue block. Then RegMap must be
- /// empty and the compared value is the initial value of the trip count.
+ /// MBB can also be a predecessor of the prologue block. Then
+ /// LastStage0Insts must be empty and the compared value is the initial
+ /// value of the trip count.
virtual void createRemainingIterationsGreaterCondition(
int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
- DenseMap<unsigned, unsigned> RegMap) = 0;
+ DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) = 0;
/// Modify the loop such that the trip count is
/// OriginalTC + TripCountAdjust.
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index acec68a56a6faa..72b0eb5d1a5173 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -2263,15 +2263,16 @@ void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
SmallVector<MachineOperand, 4> Cond;
LoopInfo->createRemainingIterationsGreaterCondition(
- Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, ValueMapTy());
+ Schedule.getNumStages() + NumUnroll - 2, *Check, Cond, InstrMapTy());
TII->insertBranch(*Check, Prolog, NewPreheader, Cond, DebugLoc());
// VRMaps map (prolog/kernel/epilog phase#, original register#) to new
// register#
SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
+ InstrMapTy LastStage0Insts;
generateProlog(PrologVRMap);
- generateKernel(PrologVRMap, KernelVRMap);
- generateEpilog(KernelVRMap, EpilogVRMap);
+ generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
+ generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
}
/// Replace MI's use operands according to the maps.
@@ -2522,18 +2523,21 @@ void ModuloScheduleExpanderMVE::generateProlog(
void ModuloScheduleExpanderMVE::generateKernel(
SmallVectorImpl<ValueMapTy> &PrologVRMap,
- SmallVectorImpl<ValueMapTy> &KernelVRMap) {
+ SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
KernelVRMap.clear();
KernelVRMap.resize(NumUnroll);
SmallVector<ValueMapTy> PhiVRMap;
PhiVRMap.resize(NumUnroll);
DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
+ DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
for (MachineInstr *MI : Schedule.getInstructions()) {
if (MI->isPHI())
continue;
int StageNum = Schedule.getStage(MI);
MachineInstr *NewMI = cloneInstr(MI);
+ if (UnrollNum == NumUnroll - 1)
+ LastStage0Insts[MI] = NewMI;
updateInstrDef(NewMI, KernelVRMap[UnrollNum],
(UnrollNum == NumUnroll - 1 && StageNum == 0));
generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
@@ -2551,8 +2555,8 @@ void ModuloScheduleExpanderMVE::generateKernel(
// If remaining trip count is greater than NumUnroll-1, loop continues
SmallVector<MachineOperand, 4> Cond;
- LoopInfo->createRemainingIterationsGreaterCondition(
- NumUnroll - 1, *NewKernel, Cond, KernelVRMap[NumUnroll - 1]);
+ LoopInfo->createRemainingIterationsGreaterCondition(NumUnroll - 1, *NewKernel,
+ Cond, LastStage0Insts);
TII->insertBranch(*NewKernel, NewKernel, Epilog, Cond, DebugLoc());
LLVM_DEBUG({
@@ -2563,7 +2567,7 @@ void ModuloScheduleExpanderMVE::generateKernel(
void ModuloScheduleExpanderMVE::generateEpilog(
SmallVectorImpl<ValueMapTy> &KernelVRMap,
- SmallVectorImpl<ValueMapTy> &EpilogVRMap) {
+ SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
EpilogVRMap.clear();
EpilogVRMap.resize(Schedule.getNumStages() - 1);
DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
@@ -2594,8 +2598,8 @@ void ModuloScheduleExpanderMVE::generateEpilog(
// are indicated by shouldIgnoreForPipelining() and are assumed to be placed
// in stage 0. Thus, the map is for the last one in the kernel.
SmallVector<MachineOperand, 4> Cond;
- LoopInfo->createRemainingIterationsGreaterCondition(
- 0, *Epilog, Cond, KernelVRMap[NumUnroll - 1]);
+ LoopInfo->createRemainingIterationsGreaterCondition(0, *Epilog, Cond,
+ LastStage0Insts);
TII->insertBranch(*Epilog, NewPreheader, NewExit, Cond, DebugLoc());
LLVM_DEBUG({
@@ -2670,7 +2674,8 @@ void ModuloScheduleExpanderMVE::expand() {
/// Check if ModuloScheduleExpanderMVE can be applied to L
bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
if (!L.getExitBlock()) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n";);
+ LLVM_DEBUG(
+ dbgs() << "Can not apply MVE expander: No single exit block.\n";);
return false;
}
@@ -2687,8 +2692,9 @@ bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
if (MO.isReg())
for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
if (Ref.getParent() != BB || Ref.isPHI()) {
- LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
- "referenced outside of the loop or by phi.\n";);
+ LLVM_DEBUG(dbgs()
+ << "Can not apply MVE expander: A phi result is "
+ "referenced outside of the loop or by phi.\n";);
return false;
}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index f3dd3de4dcbe58..835c2baf90772a 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6825,7 +6825,7 @@ class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
void createRemainingIterationsGreaterCondition(
int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
- DenseMap<unsigned, unsigned> RegMap) override {
+ DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
llvm_unreachable(
"Target didn't implement createRemainingIterationsGreaterCondition");
}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index ab2a11bb210b5b..4ca40a9c99b222 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -772,7 +772,7 @@ class HexagonPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
void createRemainingIterationsGreaterCondition(
int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
- DenseMap<unsigned, unsigned> RegMap) override {
+ DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
llvm_unreachable(
"Target didn't implement createRemainingIterationsGreaterCondition");
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 2824ce945b00bb..2a0f8f5da071cb 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -5441,7 +5441,7 @@ class PPCPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
void createRemainingIterationsGreaterCondition(
int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
- DenseMap<unsigned, unsigned> RegMap) override {
+ DenseMap<MachineInstr *, MachineInstr *> LastStage0Insts) override {
llvm_unreachable(
"Target didn't implement createRemainingIterationsGreaterCondition");
}
More information about the llvm-commits
mailing list