[llvm] 6c2a4f5 - [TTI][LV] preferPredicateOverEpilogue
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 02:14:48 PST 2019
Author: Sjoerd Meijer
Date: 2019-11-06T10:14:20Z
New Revision: 6c2a4f5ff93e16c3b86c18543e02a193ced2d956
URL: https://github.com/llvm/llvm-project/commit/6c2a4f5ff93e16c3b86c18543e02a193ced2d956
DIFF: https://github.com/llvm/llvm-project/commit/6c2a4f5ff93e16c3b86c18543e02a193ced2d956.diff
LOG: [TTI][LV] preferPredicateOverEpilogue
We have two ways to steer the vectoriser towards creating a predicated vector
body instead of a scalar epilogue: 1) a command-line option and 2) a pragma,
both of which force the decision. This adds a third: a target hook to
TargetTransformInfo that can be queried to determine whether predication is
preferred, which allows the vectoriser to make the decision without it being
forced.
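For illustration, a minimal sketch of the two existing (forcing) mechanisms;
the flag spelling is assumed from the PreferPredicateOverEpilog option in
LoopVectorize.cpp, and the pragma is the existing Clang loop hint:

  // Way 1: globally, via the command line (assumed flag spelling):
  //   opt -loop-vectorize -prefer-predicate-over-epilog ...
  // Way 2: per loop, via the pragma, which is attached to the loop as the
  // llvm.loop.vectorize.predicate.enable loop hint:
  void add(int N, int *A, const int *B, const int *C) {
  #pragma clang loop vectorize_predicate(enable)
    for (int I = 0; I < N; I++)
      A[I] = B[I] + C[I];
  }

With the new hook, neither is needed: the target itself can report that a
predicated (tail-folded) vector loop is preferable for a given loop.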
While this is effectively a non-functional change for now, it shows the
required TTI plumbing, the usage of this new hook in the vectoriser, and the
beginning of an ARM MVE implementation. I will follow up on this with:
- a complete MVE implementation, see D69845;
- a patch to disable this, i.e. we should respect "vectorize_predicate(disable)"
and its corresponding loop hint.
Differential Revision: https://reviews.llvm.org/D69040
Added:
llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 54e15792cdc7..be385deb01fa 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -46,6 +46,7 @@ class Function;
class GlobalValue;
class IntrinsicInst;
class LoadInst;
+class LoopAccessInfo;
class Loop;
class ProfileSummaryInfo;
class SCEV;
@@ -518,6 +519,13 @@ class TargetTransformInfo {
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const;
+ /// Query the target whether it prefers to create a predicated vector loop,
+ /// which can avoid the need to emit a scalar epilogue loop.
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) const;
+
/// @}
/// \name Scalar Target Information
@@ -1201,6 +1209,12 @@ class TargetTransformInfo::Concept {
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) = 0;
+ virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1471,6 +1485,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
HardwareLoopInfo &HWLoopInfo) override {
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) override {
+ return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+ }
bool isLegalAddImmediate(int64_t Imm) override {
return Impl.isLegalAddImmediate(Imm);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6a8a0d74d4a8..6b4379487944 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -213,6 +213,13 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) const {
+ return false;
+ }
+
void getUnrollingPreferences(Loop *, ScalarEvolution &,
TTI::UnrollingPreferences &) {}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index e711cd011a6f..b21c9c9e4ad9 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -510,6 +510,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) {
+ return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+ }
+
int getInstructionLatency(const Instruction *I) {
if (isa<LoadInst>(I))
return getST()->getSchedModel().DefaultLoadLatency;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ba89a9eebdb6..0b409840351d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -243,6 +243,12 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+bool TargetTransformInfo::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT, const LoopAccessInfo *LAI) const {
+ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+}
+
void TargetTransformInfo::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
return TTIImpl->getUnrollingPreferences(L, SE, UP);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ed1d6e5ca365..eb698375985c 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1000,6 +1000,50 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
return true;
}
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) {
+ // Creating a predicated vector loop is the first step for generating a
+ // tail-predicated hardware loop, for which we need the MVE masked
+ // load/store instructions:
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ HardwareLoopInfo HWLoopInfo(L);
+ if (!HWLoopInfo.canAnalyze(*LI)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "analyzable.\n");
+ return false;
+ }
+
+ // This checks whether we have the low-overhead branch architecture
+ // extension, and whether we will create a hardware loop:
+ if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "profitable.\n");
+ return false;
+ }
+
+ if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "a candidate.\n");
+ return false;
+ }
+
+ // TODO: to set up a tail-predicated loop, which works by setting up the
+ // total number of elements processed by the loop, we need to determine
+ // the element size here and check that it is uniform for all operations
+ // in the vector loop. This means we will reject narrowing/widening
+ // operations, and until that analysis exists we don't yet prefer to
+ // predicate the vector loop, which is the main prep step for
+ // tail-predicated loops.
+
+ return false;
+}
+
+
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index c4e1a17d80c1..5bb3bcaf10e7 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -203,7 +203,12 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);
-
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b6223cbbdce..f10f0f3320d0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7423,13 +7423,18 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
- ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ AssumptionCache *AC, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ const LoopAccessInfo *LAI) {
ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
SEL = CM_ScalarEpilogueNotAllowedOptSize;
- else if (PreferPredicateOverEpilog || Hints.getPredicate())
+ else if (PreferPredicateOverEpilog || Hints.getPredicate() ||
+ TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI))
SEL = CM_ScalarEpilogueNotNeededUsePredicate;
return SEL;
@@ -7449,7 +7454,10 @@ static bool processLoopInVPlanNativePath(
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+ PSE.getSE(), DT, LVL->getLAI());
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
@@ -7541,7 +7549,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+ PSE.getSE(), DT, LVL.getLAI());
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
new file mode 100644
index 000000000000..9fea4da6e38a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf < %s -loop-vectorize -S | \
+; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=false -S | \
+; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; Disabling the low-overhead branch extension will make
+; 'isHardwareLoopProfitable' return false, so these RUN lines test that
+; folding is avoided in these cases.
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
+
+define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
+; CHECK-LABEL: tail_folding(
+;
+; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+;
+; TODO: this needs implementation of TTI::preferPredicateOverEpilogue,
+; then this will be tail-folded too:
+;
+; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 430
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
index fc9ef2b3b76f..af755c887428 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -7,6 +7,37 @@
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: tail_folding(
+; CHECK: vector.body:
+;
+; This needs implementation of TTI::preferPredicateOverEpilogue,
+; then this will be tail-folded too:
+;
+; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 430
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
; COMMON-LABEL: tail_folding_enabled(
; COMMON: vector.body:
@@ -50,7 +81,7 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
; PREDFLAG: %index.next = add i64 %index, 4
; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !4
+; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
entry:
br label %for.body
@@ -77,7 +108,7 @@ for.body:
; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-NEXT: !4 = distinct !{!4, !1}
; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
-
+; CHECK-NEXT: !6 = distinct !{!6, !1}
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
!6 = distinct !{!6, !7, !8}