[llvm] r328113 - [Hexagon] Eliminate subregisters from PHI nodes before pipelining
Krzysztof Parzyszek via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 21 09:45:22 PDT 2018
Obviously not [Hexagon], but [Pipeliner]... :|
-K
On 3/21/2018 11:39 AM, Krzysztof Parzyszek via llvm-commits wrote:
> Author: kparzysz
> Date: Wed Mar 21 09:39:11 2018
> New Revision: 328113
>
> URL: http://llvm.org/viewvc/llvm-project?rev=328113&view=rev
> Log:
> [Hexagon] Eliminate subregisters from PHI nodes before pipelining
>
> The pipeliner needs to remove instructions from the SlotIndexes
> structure when they are deleted. Otherwise, the SlotIndexes map
> has stale data, and an assert will occur when adding new
> instructions.
>
> This patch also changes the pipeliner to make the back-edge of
> a loop carried dependence 1 cycle. The 1 cycle latency is added
> to the anti-dependence that represents the back-edge. This
> change eliminates a couple of hacks added to the pipeliner to
> handle the latency of the back-edge. It is needed to correctly
> pipeline the test case for the sub-register elimination pass.
>
> Modified:
> llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
> llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll
>
> Modified: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachinePipeliner.cpp?rev=328113&r1=328112&r2=328113&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp Wed Mar 21 09:39:11 2018
> @@ -218,6 +218,7 @@ public:
> }
>
> private:
> + void preprocessPhiNodes(MachineBasicBlock &B);
> bool canPipelineLoop(MachineLoop &L);
> bool scheduleLoop(MachineLoop &L);
> bool swingModuloScheduler(MachineLoop &L);
> @@ -357,20 +358,6 @@ public:
>
> bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
>
> - /// The latency of the dependence.
> - unsigned getLatency(SUnit *Source, const SDep &Dep) {
> - // Anti dependences represent recurrences, so use the latency of the
> - // instruction on the back-edge.
> - if (Dep.getKind() == SDep::Anti) {
> - if (Source->getInstr()->isPHI())
> - return Dep.getSUnit()->Latency;
> - if (Dep.getSUnit()->getInstr()->isPHI())
> - return Source->Latency;
> - return Dep.getLatency();
> - }
> - return Dep.getLatency();
> - }
> -
> /// The distance function, which indicates that operation V of iteration I
> /// depends on operations U of iteration I-distance.
> unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
> @@ -484,12 +471,19 @@ class NodeSet {
> int MaxDepth = 0;
> unsigned Colocate = 0;
> SUnit *ExceedPressure = nullptr;
> + unsigned Latency = 0;
>
> public:
> using iterator = SetVector<SUnit *>::const_iterator;
>
> NodeSet() = default;
> - NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
> + NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
> + Latency = 0;
> + for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
> + for (const SDep &Succ : Nodes[i]->Succs)
> + if (Nodes.count(Succ.getSUnit()))
> + Latency += Succ.getLatency();
> + }
>
> bool insert(SUnit *SU) { return Nodes.insert(SU); }
>
> @@ -820,18 +814,41 @@ bool MachinePipeliner::canPipelineLoop(M
> if (!L.getLoopPreheader())
> return false;
>
> - // If any of the Phis contain subregs, then we can't pipeline
> - // because we don't know how to maintain subreg information in the
> - // VMap structure.
> - MachineBasicBlock *MBB = L.getHeader();
> - for (auto &PHI : MBB->phis())
> - for (unsigned i = 1; i != PHI.getNumOperands(); i += 2)
> - if (PHI.getOperand(i).getSubReg() != 0)
> - return false;
> -
> + // Remove any subregisters from inputs to phi nodes.
> + preprocessPhiNodes(*L.getHeader());
> return true;
> }
>
> +void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
> + MachineRegisterInfo &MRI = MF->getRegInfo();
> + SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
> +
> + for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
> + MachineOperand &DefOp = PI.getOperand(0);
> + assert(DefOp.getSubReg() == 0);
> + auto *RC = MRI.getRegClass(DefOp.getReg());
> +
> + for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
> + MachineOperand &RegOp = PI.getOperand(i);
> + if (RegOp.getSubReg() == 0)
> + continue;
> +
> + // If the operand uses a subregister, replace it with a new register
> + // without subregisters, and generate a copy to the new register.
> + unsigned NewReg = MRI.createVirtualRegister(RC);
> + MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
> + MachineBasicBlock::iterator At = PredB.getFirstTerminator();
> + const DebugLoc &DL = PredB.findDebugLoc(At);
> + auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
> + .addReg(RegOp.getReg(), getRegState(RegOp),
> + RegOp.getSubReg());
> + Slots.insertMachineInstrInMaps(*Copy);
> + RegOp.setReg(NewReg);
> + RegOp.setSubReg(0);
> + }
> + }
> +}
> +
> /// The SMS algorithm consists of the following main steps:
> /// 1. Computation and analysis of the dependence graph.
> /// 2. Ordering of the nodes (instructions).
> @@ -1078,31 +1095,41 @@ void SwingSchedulerDAG::addLoopCarriedDe
> int64_t Offset1, Offset2;
> if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
> !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
> - SU.addPred(SDep(Load, SDep::Barrier));
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> continue;
> }
> if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
> assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
> "What happened to the chain edge?");
> - SU.addPred(SDep(Load, SDep::Barrier));
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> continue;
> }
> // Second, the more expensive check that uses alias analysis on the
> // base registers. If they alias, and the load offset is less than
> // the store offset, the mark the dependence as loop carried.
> if (!AA) {
> - SU.addPred(SDep(Load, SDep::Barrier));
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> continue;
> }
> MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
> MachineMemOperand *MMO2 = *MI.memoperands_begin();
> if (!MMO1->getValue() || !MMO2->getValue()) {
> - SU.addPred(SDep(Load, SDep::Barrier));
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> continue;
> }
> if (MMO1->getValue() == MMO2->getValue() &&
> MMO1->getOffset() <= MMO2->getOffset()) {
> - SU.addPred(SDep(Load, SDep::Barrier));
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> continue;
> }
> AliasResult AAResult = AA->alias(
> @@ -1111,8 +1138,11 @@ void SwingSchedulerDAG::addLoopCarriedDe
> MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
> MMO2->getAAInfo()));
>
> - if (AAResult != NoAlias)
> - SU.addPred(SDep(Load, SDep::Barrier));
> + if (AAResult != NoAlias) {
> + SDep Dep(Load, SDep::Barrier);
> + Dep.setLatency(1);
> + SU.addPred(Dep);
> + }
> }
> }
> }
> @@ -1154,6 +1184,7 @@ void SwingSchedulerDAG::updatePhiDepende
> if (SU != nullptr && UseMI->isPHI()) {
> if (!MI->isPHI()) {
> SDep Dep(SU, SDep::Anti, Reg);
> + Dep.setLatency(1);
> I.addPred(Dep);
> } else {
> HasPhiDef = Reg;
> @@ -1599,12 +1630,12 @@ void SwingSchedulerDAG::computeNodeFunct
> EP = SU->Preds.end();
> IP != EP; ++IP) {
> SUnit *pred = IP->getSUnit();
> - if (getLatency(SU, *IP) == 0)
> + if (IP->getLatency() == 0)
> zeroLatencyDepth =
> std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
> if (ignoreDependence(*IP, true))
> continue;
> - asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
> + asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
> getDistance(pred, SU, *IP) * MII));
> }
> maxASAP = std::max(maxASAP, asap);
> @@ -1623,12 +1654,12 @@ void SwingSchedulerDAG::computeNodeFunct
> ES = SU->Succs.end();
> IS != ES; ++IS) {
> SUnit *succ = IS->getSUnit();
> - if (getLatency(SU, *IS) == 0)
> + if (IS->getLatency() == 0)
> zeroLatencyHeight =
> std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
> if (ignoreDependence(*IS, true))
> continue;
> - alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
> + alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
> getDistance(SU, succ, *IS) * MII));
> }
>
> @@ -2340,6 +2371,8 @@ void SwingSchedulerDAG::generatePipeline
> addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
>
> // Remove the original loop since it's no longer referenced.
> + for (auto &I : *BB)
> + LIS.RemoveMachineInstrFromMaps(I);
> BB->clear();
> BB->eraseFromParent();
>
> @@ -2916,6 +2949,7 @@ void SwingSchedulerDAG::removeDeadInstru
> used = false;
> }
> if (!used) {
> + LIS.RemoveMachineInstrFromMaps(*MI);
> MI++->eraseFromParent();
> continue;
> }
> @@ -2930,6 +2964,7 @@ void SwingSchedulerDAG::removeDeadInstru
> ++BBI;
> unsigned reg = MI->getOperand(0).getReg();
> if (MRI.use_begin(reg) == MRI.use_end()) {
> + LIS.RemoveMachineInstrFromMaps(*MI);
> MI->eraseFromParent();
> }
> }
> @@ -3636,7 +3671,7 @@ void SMSchedule::computeStart(SUnit *SU,
> const SDep &Dep = SU->Preds[i];
> if (Dep.getSUnit() == I) {
> if (!DAG->isBackedge(SU, Dep)) {
> - int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
> + int EarlyStart = cycle + Dep.getLatency() -
> DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
> *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
> if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
> @@ -3644,7 +3679,7 @@ void SMSchedule::computeStart(SUnit *SU,
> *MinEnd = std::min(*MinEnd, End);
> }
> } else {
> - int LateStart = cycle - DAG->getLatency(SU, Dep) +
> + int LateStart = cycle - Dep.getLatency() +
> DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
> *MinLateStart = std::min(*MinLateStart, LateStart);
> }
> @@ -3660,7 +3695,7 @@ void SMSchedule::computeStart(SUnit *SU,
> if (SU->Succs[i].getSUnit() == I) {
> const SDep &Dep = SU->Succs[i];
> if (!DAG->isBackedge(SU, Dep)) {
> - int LateStart = cycle - DAG->getLatency(SU, Dep) +
> + int LateStart = cycle - Dep.getLatency() +
> DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
> *MinLateStart = std::min(*MinLateStart, LateStart);
> if (DAG->isLoopCarriedOrder(SU, Dep)) {
> @@ -3668,7 +3703,7 @@ void SMSchedule::computeStart(SUnit *SU,
> *MaxStart = std::max(*MaxStart, Start);
> }
> } else {
> - int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
> + int EarlyStart = cycle + Dep.getLatency() -
> DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
> *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
> }
>
> Modified: llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll?rev=328113&r1=328112&r2=328113&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll (original)
> +++ llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll Wed Mar 21 09:39:11 2018
> @@ -9,16 +9,16 @@
> ; CHECK-NOT: vcmp.gt([[VREG]].uh,v{{[0-9]+}}.uh)
> ; CHECK: loop0
>
> -define void @f0() #0 {
> +define void @f0(<64 x i32> %a0, <32 x i32> %a1) #0 {
> b0:
> br i1 undef, label %b1, label %b5
>
> b1: ; preds = %b0
> - %v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> undef)
> + %v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a0)
> br label %b2
>
> b2: ; preds = %b4, %b1
> - %v1 = phi <32 x i32> [ undef, %b1 ], [ %v7, %b4 ]
> + %v1 = phi <32 x i32> [ %a1, %b1 ], [ %v7, %b4 ]
> br label %b3
>
> b3: ; preds = %b3, %b2
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation
More information about the llvm-commits
mailing list