[llvm] r328113 - [Hexagon] Eliminate subregisters from PHI nodes before pipelining

Wed Mar 21 09:45:22 PDT 2018

Obviously not [Hexagon], but [Pipeliner]...  :|

-K

On 3/21/2018 11:39 AM, Krzysztof Parzyszek via llvm-commits wrote:
> Author: kparzysz
> Date: Wed Mar 21 09:39:11 2018
> New Revision: 328113
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=328113&view=rev
> Log:
> [Hexagon] Eliminate subregisters from PHI nodes before pipelining
> 
> The pipeliner needs to remove instructions from the SlotIndexes
> structure when they are deleted. Otherwise, the SlotIndexes map
> has stale data, and an assert will occur when adding new
> instructions.
> 
> This patch also changes the pipeliner to make the back-edge of
> a loop carried dependence 1 cycle. The 1 cycle latency is added
> to the anti-dependence that represents the back-edge. This
> change eliminates a couple of hacks added to the pipeliner to
> handle the latency of the back-edge. It is needed to correctly
> pipeline the test case for the sub-register elimination pass.
> 
> Modified:
>      llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
>      llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll
> 
> Modified: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachinePipeliner.cpp?rev=328113&r1=328112&r2=328113&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp Wed Mar 21 09:39:11 2018
> @@ -218,6 +218,7 @@ public:
>     }
>   
>   private:
> +  void preprocessPhiNodes(MachineBasicBlock &B);
>     bool canPipelineLoop(MachineLoop &L);
>     bool scheduleLoop(MachineLoop &L);
>     bool swingModuloScheduler(MachineLoop &L);
> @@ -357,20 +358,6 @@ public:
>   
>     bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
>   
> -  /// The latency of the dependence.
> -  unsigned getLatency(SUnit *Source, const SDep &Dep) {
> -    // Anti dependences represent recurrences, so use the latency of the
> -    // instruction on the back-edge.
> -    if (Dep.getKind() == SDep::Anti) {
> -      if (Source->getInstr()->isPHI())
> -        return Dep.getSUnit()->Latency;
> -      if (Dep.getSUnit()->getInstr()->isPHI())
> -        return Source->Latency;
> -      return Dep.getLatency();
> -    }
> -    return Dep.getLatency();
> -  }
> -
>     /// The distance function, which indicates that operation V of iteration I
>     /// depends on operations U of iteration I-distance.
>     unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
> @@ -484,12 +471,19 @@ class NodeSet {
>     int MaxDepth = 0;
>     unsigned Colocate = 0;
>     SUnit *ExceedPressure = nullptr;
> +  unsigned Latency = 0;
>   
>   public:
>     using iterator = SetVector<SUnit *>::const_iterator;
>   
>     NodeSet() = default;
> -  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
> +  NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
> +    Latency = 0;
> +    for (unsigned i = 0, e = Nodes.size(); i < e; ++i)
> +      for (const SDep &Succ : Nodes[i]->Succs)
> +        if (Nodes.count(Succ.getSUnit()))
> +          Latency += Succ.getLatency();
> +  }
>   
>     bool insert(SUnit *SU) { return Nodes.insert(SU); }
>   
> @@ -820,18 +814,41 @@ bool MachinePipeliner::canPipelineLoop(M
>     if (!L.getLoopPreheader())
>       return false;
>   
> -  // If any of the Phis contain subregs, then we can't pipeline
> -  // because we don't know how to maintain subreg information in the
> -  // VMap structure.
> -  MachineBasicBlock *MBB = L.getHeader();
> -  for (auto &PHI : MBB->phis())
> -    for (unsigned i = 1; i != PHI.getNumOperands(); i += 2)
> -      if (PHI.getOperand(i).getSubReg() != 0)
> -        return false;
> -
> +  // Remove any subregisters from inputs to phi nodes.
> +  preprocessPhiNodes(*L.getHeader());
>     return true;
>   }
>   
> +void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
> +  MachineRegisterInfo &MRI = MF->getRegInfo();
> +  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
> +
> +  for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
> +    MachineOperand &DefOp = PI.getOperand(0);
> +    assert(DefOp.getSubReg() == 0);
> +    auto *RC = MRI.getRegClass(DefOp.getReg());
> +
> +    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
> +      MachineOperand &RegOp = PI.getOperand(i);
> +      if (RegOp.getSubReg() == 0)
> +        continue;
> +
> +      // If the operand uses a subregister, replace it with a new register
> +      // without subregisters, and generate a copy to the new register.
> +      unsigned NewReg = MRI.createVirtualRegister(RC);
> +      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
> +      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
> +      const DebugLoc &DL = PredB.findDebugLoc(At);
> +      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
> +                    .addReg(RegOp.getReg(), getRegState(RegOp),
> +                            RegOp.getSubReg());
> +      Slots.insertMachineInstrInMaps(*Copy);
> +      RegOp.setReg(NewReg);
> +      RegOp.setSubReg(0);
> +    }
> +  }
> +}
> +
>   /// The SMS algorithm consists of the following main steps:
>   /// 1. Computation and analysis of the dependence graph.
>   /// 2. Ordering of the nodes (instructions).
> @@ -1078,31 +1095,41 @@ void SwingSchedulerDAG::addLoopCarriedDe
>             int64_t Offset1, Offset2;
>             if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
>                 !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
>               continue;
>             }
>             if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
>               assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
>                      "What happened to the chain edge?");
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
>               continue;
>             }
>             // Second, the more expensive check that uses alias analysis on the
>             // base registers. If they alias, and the load offset is less than
>             // the store offset, the mark the dependence as loop carried.
>             if (!AA) {
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
>               continue;
>             }
>             MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
>             MachineMemOperand *MMO2 = *MI.memoperands_begin();
>             if (!MMO1->getValue() || !MMO2->getValue()) {
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
>               continue;
>             }
>             if (MMO1->getValue() == MMO2->getValue() &&
>                 MMO1->getOffset() <= MMO2->getOffset()) {
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
>               continue;
>             }
>             AliasResult AAResult = AA->alias(
> @@ -1111,8 +1138,11 @@ void SwingSchedulerDAG::addLoopCarriedDe
>                 MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
>                                MMO2->getAAInfo()));
>   
> -          if (AAResult != NoAlias)
> -            SU.addPred(SDep(Load, SDep::Barrier));
> +          if (AAResult != NoAlias) {
> +            SDep Dep(Load, SDep::Barrier);
> +            Dep.setLatency(1);
> +            SU.addPred(Dep);
> +          }
>           }
>         }
>       }
> @@ -1154,6 +1184,7 @@ void SwingSchedulerDAG::updatePhiDepende
>             if (SU != nullptr && UseMI->isPHI()) {
>               if (!MI->isPHI()) {
>                 SDep Dep(SU, SDep::Anti, Reg);
> +              Dep.setLatency(1);
>                 I.addPred(Dep);
>               } else {
>                 HasPhiDef = Reg;
> @@ -1599,12 +1630,12 @@ void SwingSchedulerDAG::computeNodeFunct
>                                       EP = SU->Preds.end();
>            IP != EP; ++IP) {
>         SUnit *pred = IP->getSUnit();
> -      if (getLatency(SU, *IP) == 0)
> +      if (IP->getLatency() == 0)
>           zeroLatencyDepth =
>               std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
>         if (ignoreDependence(*IP, true))
>           continue;
> -      asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
> +      asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
>                                     getDistance(pred, SU, *IP) * MII));
>       }
>       maxASAP = std::max(maxASAP, asap);
> @@ -1623,12 +1654,12 @@ void SwingSchedulerDAG::computeNodeFunct
>                                       ES = SU->Succs.end();
>            IS != ES; ++IS) {
>         SUnit *succ = IS->getSUnit();
> -      if (getLatency(SU, *IS) == 0)
> +      if (IS->getLatency() == 0)
>           zeroLatencyHeight =
>               std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
>         if (ignoreDependence(*IS, true))
>           continue;
> -      alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
> +      alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
>                                     getDistance(SU, succ, *IS) * MII));
>       }
>   
> @@ -2340,6 +2371,8 @@ void SwingSchedulerDAG::generatePipeline
>     addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
>   
>     // Remove the original loop since it's no longer referenced.
> +  for (auto &I : *BB)
> +    LIS.RemoveMachineInstrFromMaps(I);
>     BB->clear();
>     BB->eraseFromParent();
>   
> @@ -2916,6 +2949,7 @@ void SwingSchedulerDAG::removeDeadInstru
>           used = false;
>         }
>         if (!used) {
> +        LIS.RemoveMachineInstrFromMaps(*MI);
>           MI++->eraseFromParent();
>           continue;
>         }
> @@ -2930,6 +2964,7 @@ void SwingSchedulerDAG::removeDeadInstru
>       ++BBI;
>       unsigned reg = MI->getOperand(0).getReg();
>       if (MRI.use_begin(reg) == MRI.use_end()) {
> +      LIS.RemoveMachineInstrFromMaps(*MI);
>         MI->eraseFromParent();
>       }
>     }
> @@ -3636,7 +3671,7 @@ void SMSchedule::computeStart(SUnit *SU,
>           const SDep &Dep = SU->Preds[i];
>           if (Dep.getSUnit() == I) {
>             if (!DAG->isBackedge(SU, Dep)) {
> -            int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
> +            int EarlyStart = cycle + Dep.getLatency() -
>                                DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
>               *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
>               if (DAG->isLoopCarriedOrder(SU, Dep, false)) {
> @@ -3644,7 +3679,7 @@ void SMSchedule::computeStart(SUnit *SU,
>                 *MinEnd = std::min(*MinEnd, End);
>               }
>             } else {
> -            int LateStart = cycle - DAG->getLatency(SU, Dep) +
> +            int LateStart = cycle - Dep.getLatency() +
>                               DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
>               *MinLateStart = std::min(*MinLateStart, LateStart);
>             }
> @@ -3660,7 +3695,7 @@ void SMSchedule::computeStart(SUnit *SU,
>           if (SU->Succs[i].getSUnit() == I) {
>             const SDep &Dep = SU->Succs[i];
>             if (!DAG->isBackedge(SU, Dep)) {
> -            int LateStart = cycle - DAG->getLatency(SU, Dep) +
> +            int LateStart = cycle - Dep.getLatency() +
>                               DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
>               *MinLateStart = std::min(*MinLateStart, LateStart);
>               if (DAG->isLoopCarriedOrder(SU, Dep)) {
> @@ -3668,7 +3703,7 @@ void SMSchedule::computeStart(SUnit *SU,
>                 *MaxStart = std::max(*MaxStart, Start);
>               }
>             } else {
> -            int EarlyStart = cycle + DAG->getLatency(SU, Dep) -
> +            int EarlyStart = cycle + Dep.getLatency() -
>                                DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
>               *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
>             }
> 
> Modified: llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll?rev=328113&r1=328112&r2=328113&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll (original)
> +++ llvm/trunk/test/CodeGen/Hexagon/swp-prolog-phi.ll Wed Mar 21 09:39:11 2018
> @@ -9,16 +9,16 @@
>   ; CHECK-NOT: vcmp.gt([[VREG]].uh,v{{[0-9]+}}.uh)
>   ; CHECK: loop0
>   
> -define void @f0() #0 {
> +define void @f0(<64 x i32> %a0, <32 x i32> %a1) #0 {
>   b0:
>     br i1 undef, label %b1, label %b5
>   
>   b1:                                               ; preds = %b0
> -  %v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> undef)
> +  %v0 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %a0)
>     br label %b2
>   
>   b2:                                               ; preds = %b4, %b1
> -  %v1 = phi <32 x i32> [ undef, %b1 ], [ %v7, %b4 ]
> +  %v1 = phi <32 x i32> [ %a1, %b1 ], [ %v7, %b4 ]
>     br label %b3
>   
>   b3:                                               ; preds = %b3, %b2
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
> 

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, 
hosted by The Linux Foundation