[llvm] r344748 - [Pipeliner] copyToPhi DAG Mutation to improve scheduling.
Sumanth Gundapaneni via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 18 08:51:16 PDT 2018
Author: sgundapa
Date: Thu Oct 18 08:51:16 2018
New Revision: 344748
URL: http://llvm.org/viewvc/llvm-project?rev=344748&view=rev
Log:
[Pipeliner] copyToPhi DAG Mutation to improve scheduling.
In a loop, create artificial dependences between the source of a
COPY/REG_SEQUENCE to the use in next iteration.
Eg:
SRC ----Data Dep--> COPY
COPY ---Anti Dep--> PHI (implies, to be used in next iteration)
PHI ----Data Dep--> USE
This patch creates
USE ----Artificial Dep---> SRC
This will effectively schedule the COPY late to eliminate additional copies.
Before this patch, the schedule can be
SRC, COPY, USE : The COPY is used in next iteration and it needs to be
preserved.
After this patch, the schedule can be
USE, SRC, COPY : The COPY is used in next iteration and the live interval is
reduced.
Differential Revision: https://reviews.llvm.org/D53303
Added:
llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll
Modified:
llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
Modified: llvm/trunk/lib/CodeGen/MachinePipeliner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachinePipeliner.cpp?rev=344748&r1=344747&r2=344748&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachinePipeliner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachinePipeliner.cpp Thu Oct 18 08:51:16 2018
@@ -102,6 +102,7 @@
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -171,6 +172,12 @@ static cl::opt<bool> SwpIgnoreRecMII("pi
cl::ReallyHidden, cl::init(false),
cl::ZeroOrMore, cl::desc("Ignore RecMII"));
+// Hidden option, on by default, that enables the CopyToPhi DAG mutation.
+static cl::opt<bool>
+ SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+ cl::init(true), cl::ZeroOrMore,
+ cl::desc("Enable CopyToPhi DAG Mutation"));
+
namespace {
class NodeSet;
@@ -307,12 +314,18 @@ class SwingSchedulerDAG : public Schedul
void unblock(int U);
};
+ /// DAG mutation that adds artificial USE -> SRC edges for loop-carried
+ /// COPY/REG_SEQUENCE instructions; see the definition of apply().
+ struct CopyToPhiMutation : public ScheduleDAGMutation {
+ void apply(ScheduleDAGInstrs *DAG) override;
+ };
+
public:
SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
const RegisterClassInfo &rci)
: ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
RegClassInfo(rci), Topo(SUnits, &ExitSU) {
P.MF->getSubtarget().getSMSMutations(Mutations);
+ // Register the CopyToPhi mutation after the subtarget's SMS mutations,
+ // unless it was disabled via -pipeliner-enable-copytophi.
+ if (SwpEnableCopyToPhi)
+ Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
}
void schedule() override;
@@ -391,6 +404,8 @@ public:
Mutations.push_back(std::move(Mutation));
}
+ /// LLVM-style RTTI hook used by isa<>/cast<>/dyn_cast<>. It accepts every
+ /// ScheduleDAGInstrs unconditionally, so cast<SwingSchedulerDAG>(DAG) is not
+ /// checked: callers must only pass DAGs that really are SwingSchedulerDAGs.
+ static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
private:
void addLoopCarriedDependences(AliasAnalysis *AA);
void updatePhiDependences();
@@ -893,8 +908,8 @@ void SwingSchedulerDAG::schedule() {
addLoopCarriedDependences(AA);
updatePhiDependences();
Topo.InitDAGTopologicalSorting();
- postprocessDAG();
changeDependences();
+ postprocessDAG();
LLVM_DEBUG(dump());
NodeSetType NodeSets;
@@ -1624,6 +1639,85 @@ void SwingSchedulerDAG::findCircuits(Nod
swapAntiDependences(SUnits);
}
+// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
+// is loop-carried to the USE in next iteration. This will help pipeliner avoid
+// additional copies that are needed across iterations. An artificial dependence
+// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
+
+// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
+// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
+// PHI-------True-Dep------> USEOfPhi
+
+// The mutation creates
+// USEOfPHI -------Artificial-Dep---> SRCOfCopy
+
+// Overall, this ensures that USEOfPHI is scheduled before SRCOfCopy (since
+// USEOfPHI becomes a predecessor of SRCOfCopy), which implies that the
+// COPY/REG_SEQUENCE is scheduled late, avoiding additional copies across
+// iterations. The possible scheduling order would be
+// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
+
+void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
+  // \p DAG is expected to be the SwingSchedulerDAG that registered this
+  // mutation; its topological order is updated alongside every new edge.
+  for (SUnit &SU : DAG->SUnits) {
+    // Find the COPY/REG_SEQUENCE instruction.
+    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
+      continue;
+
+    // Record the loop carried PHIs.
+    SmallVector<SUnit *, 4> PHISUs;
+    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
+    SmallVector<SUnit *, 4> SrcSUs;
+
+    for (auto &Dep : SU.Preds) {
+      SUnit *TmpSU = Dep.getSUnit();
+      MachineInstr *TmpMI = TmpSU->getInstr();
+      SDep::Kind DepKind = Dep.getKind();
+      // Save the loop carried PHI.
+      if (DepKind == SDep::Anti && TmpMI->isPHI())
+        PHISUs.push_back(TmpSU);
+      // Save the source of COPY/REG_SEQUENCE.
+      // If the source has no predecessors, we will end up creating cycles.
+      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
+        SrcSUs.push_back(TmpSU);
+    }
+
+    if (PHISUs.empty() || SrcSUs.empty())
+      continue;
+
+    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
+    // SUnit to the container so that its own users are visited as well.
+    SmallVector<SUnit *, 8> UseSUs;
+    // Do not use an iterator based loop here: PHISUs grows while it is being
+    // walked, and push_back may reallocate and invalidate iterators.
+    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
+      for (auto &Dep : PHISUs[Index]->Succs) {
+        if (Dep.getKind() != SDep::Data)
+          continue;
+
+        SUnit *TmpSU = Dep.getSUnit();
+        MachineInstr *TmpMI = TmpSU->getInstr();
+        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
+          PHISUs.push_back(TmpSU);
+          continue;
+        }
+        UseSUs.push_back(TmpSU);
+      }
+    }
+
+    if (UseSUs.empty())
+      continue;
+
+    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
+    // Add the artificial dependencies if it does not form a cycle.
+    for (SUnit *Use : UseSUs) {
+      for (SUnit *Src : SrcSUs) {
+        if (!SDAG->Topo.IsReachable(Use, Src) && Src != Use) {
+          Src->addPred(SDep(Use, SDep::Artificial));
+          SDAG->Topo.AddPred(Src, Use);
+        }
+      }
+    }
+  }
+}
+
/// Return true for DAG nodes that we ignore when computing the cost functions.
/// We ignore the back-edge recurrence in order to avoid unbounded recursion
/// in the calculation of the ASAP, ALAP, etc functions.
Added: llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll?rev=344748&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll (added)
+++ llvm/trunk/test/CodeGen/Hexagon/swp-copytophi-dag.ll Thu Oct 18 08:51:16 2018
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
+; RUN: 2>&1 | FileCheck %s
+
+; Test that the artificial dependence is created as a result of
+; CopyToPhi DAG mutation.
+; CHECK: Ord Latency=0 Artificial
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+; Each loop iteration stores four i16 values and then loads the next i64; the
+; four 16-bit slices of that load (trunc/lshr) are fed back through PHIs and
+; stored by the following iteration.
+define void @foo(i64* nocapture readonly %r64, i16 zeroext %n, i16 zeroext %s, i64* nocapture %p64) #0 {
+entry:
+ %conv = zext i16 %n to i32
+ %cmp = icmp eq i16 %n, 0
+ br i1 %cmp, label %for.end, label %for.body.preheader
+
+; Preheader: load the first i64 and split it into the four initial i16 values;
+; compute the four initial store addresses.
+for.body.preheader: ; preds = %entry
+ %tmp = load i64, i64* %r64, align 8
+ %v.sroa.0.0.extract.trunc = trunc i64 %tmp to i16
+ %v.sroa.4.0.extract.shift = lshr i64 %tmp, 16
+ %v.sroa.4.0.extract.trunc = trunc i64 %v.sroa.4.0.extract.shift to i16
+ %v.sroa.5.0.extract.shift = lshr i64 %tmp, 32
+ %v.sroa.5.0.extract.trunc = trunc i64 %v.sroa.5.0.extract.shift to i16
+ %v.sroa.6.0.extract.shift = lshr i64 %tmp, 48
+ %v.sroa.6.0.extract.trunc = trunc i64 %v.sroa.6.0.extract.shift to i16
+ %tmp1 = bitcast i64* %p64 to i16*
+ %conv2 = zext i16 %s to i32
+ %add.ptr = getelementptr inbounds i16, i16* %tmp1, i32 %conv2
+ %add.ptr.sum = add nuw nsw i32 %conv2, 1
+ %add.ptr3 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum
+ %add.ptr.sum50 = add nuw nsw i32 %conv2, 2
+ %add.ptr4 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum50
+ %add.ptr.sum51 = add nuw nsw i32 %conv2, 3
+ %add.ptr5 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum51
+ br label %for.body
+
+; Loop body: the %v.sroa.* PHIs carry the values extracted from the previous
+; iteration's load into this iteration's stores.
+for.body: ; preds = %for.body, %for.body.preheader
+ %add.ptr11.phi = phi i16* [ %add.ptr11.inc, %for.body ], [ %add.ptr, %for.body.preheader ]
+ %add.ptr16.phi = phi i16* [ %add.ptr16.inc, %for.body ], [ %add.ptr3, %for.body.preheader ]
+ %add.ptr21.phi = phi i16* [ %add.ptr21.inc, %for.body ], [ %add.ptr4, %for.body.preheader ]
+ %add.ptr26.phi = phi i16* [ %add.ptr26.inc, %for.body ], [ %add.ptr5, %for.body.preheader ]
+ %i.058.pmt = phi i32 [ %inc.pmt, %for.body ], [ 0, %for.body.preheader ]
+ %v.sroa.0.157 = phi i16 [ %v.sroa.0.0.extract.trunc34, %for.body ], [ %v.sroa.0.0.extract.trunc, %for.body.preheader ]
+ %v.sroa.4.156 = phi i16 [ %v.sroa.4.0.extract.trunc36, %for.body ], [ %v.sroa.4.0.extract.trunc, %for.body.preheader ]
+ %v.sroa.5.155 = phi i16 [ %v.sroa.5.0.extract.trunc38, %for.body ], [ %v.sroa.5.0.extract.trunc, %for.body.preheader ]
+ %v.sroa.6.154 = phi i16 [ %v.sroa.6.0.extract.trunc40, %for.body ], [ %v.sroa.6.0.extract.trunc, %for.body.preheader ]
+ %q64.153.pn = phi i64* [ %q64.153, %for.body ], [ %r64, %for.body.preheader ]
+ %q64.153 = getelementptr inbounds i64, i64* %q64.153.pn, i32 1
+ store i16 %v.sroa.0.157, i16* %add.ptr11.phi, align 2
+ store i16 %v.sroa.4.156, i16* %add.ptr16.phi, align 2
+ store i16 %v.sroa.5.155, i16* %add.ptr21.phi, align 2
+ store i16 %v.sroa.6.154, i16* %add.ptr26.phi, align 2
+ %tmp2 = load i64, i64* %q64.153, align 8
+ %v.sroa.0.0.extract.trunc34 = trunc i64 %tmp2 to i16
+ %v.sroa.4.0.extract.shift35 = lshr i64 %tmp2, 16
+ %v.sroa.4.0.extract.trunc36 = trunc i64 %v.sroa.4.0.extract.shift35 to i16
+ %v.sroa.5.0.extract.shift37 = lshr i64 %tmp2, 32
+ %v.sroa.5.0.extract.trunc38 = trunc i64 %v.sroa.5.0.extract.shift37 to i16
+ %v.sroa.6.0.extract.shift39 = lshr i64 %tmp2, 48
+ %v.sroa.6.0.extract.trunc40 = trunc i64 %v.sroa.6.0.extract.shift39 to i16
+ %inc.pmt = add i32 %i.058.pmt, 1
+ %cmp8 = icmp slt i32 %inc.pmt, %conv
+ %add.ptr11.inc = getelementptr i16, i16* %add.ptr11.phi, i32 4
+ %add.ptr16.inc = getelementptr i16, i16* %add.ptr16.phi, i32 4
+ %add.ptr21.inc = getelementptr i16, i16* %add.ptr21.phi, i32 4
+ %add.ptr26.inc = getelementptr i16, i16* %add.ptr26.phi, i32 4
+ br i1 %cmp8, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv65" }
More information about the llvm-commits
mailing list