[llvm] 8f18874 - [LV] Still vectorise when tail-folding can't find a primary induction variable
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 01:16:57 PST 2020
Author: Sjoerd Meijer
Date: 2020-01-09T09:14:00Z
New Revision: 8f1887456ab4ba24a62ccb19d0d04b08972a0289
URL: https://github.com/llvm/llvm-project/commit/8f1887456ab4ba24a62ccb19d0d04b08972a0289
DIFF: https://github.com/llvm/llvm-project/commit/8f1887456ab4ba24a62ccb19d0d04b08972a0289.diff
LOG: [LV] Still vectorise when tail-folding can't find a primary induction variable
This addresses a vectorisation regression for tail-folded loops that are
counting down, e.g. loops as simple as this:
void foo(char *A, char *B, char *C, uint32_t N) {
while (N > 0) {
*C++ = *A++ + *B++;
N--;
}
}
These are loops that can be vectorised, but when tail-folding is requested, it
can't find a primary induction variable which we do need for predicating the
loop. As a result, the loop isn't vectorised at all, even though it can be
vectorised when tail-folding is not attempted. So, this adds a check for the primary
induction variable where we decide how to lower the scalar epilogue. I.e., when
there isn't a primary induction variable, a scalar epilogue loop is allowed
(i.e. don't request tail-folding) so that vectorisation could still be
triggered.
Having this check for the primary induction variable makes sense anyway, and in
addition, in a follow-up of this I will look into discovering earlier the
primary induction variable for counting down loops, so that this can also be
tail-folded.
Differential revision: https://reviews.llvm.org/D72324
Added:
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fd30d52a562a..0400e44dd0ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7502,30 +7502,43 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
-static ScalarEpilogueLowering
-getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
- ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
- TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- AssumptionCache *AC, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- const LoopAccessInfo *LAI) {
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+// Determine how to lower the scalar epilogue, which depends on 1) optimising
+// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
+// predication, and 4) a TTI hook that analyses whether the loop is suitable
+// for predication.
+static ScalarEpilogueLowering getScalarEpilogueLowering(
+ Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ LoopVectorizationLegality &LVL) {
+ bool OptSize =
+ F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
+ if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+ return CM_ScalarEpilogueNotAllowedOptSize;
+
bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
!PreferPredicateOverEpilog;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
- else if (PreferPredicateOverEpilog ||
- Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
- (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
- Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
- !PredicateOptDisabled))
- SEL = CM_ScalarEpilogueNotNeededUsePredicate;
+ // 2) Next, if disabling predication is requested on the command line, honour
+ // this and request a scalar epilogue. Also do this if we don't have a
+ // primary induction variable, which is required for predication.
+ if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+ return CM_ScalarEpilogueAllowed;
+
+ // 3) and 4) look if enabling predication is requested on the command line,
+ // with a loop hint, or if the TTI hook indicates this is profitable, request
+ // predication .
+ if (PreferPredicateOverEpilog ||
+ Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
+ (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+ LVL.getLAI()) &&
+ Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
+ return CM_ScalarEpilogueNotNeededUsePredicate;
- return SEL;
+ return CM_ScalarEpilogueAllowed;
}
// Process the loop in the VPlan-native vectorization path. This path builds
@@ -7543,9 +7556,8 @@ static bool processLoopInVPlanNativePath(
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
- PSE.getSE(), DT, LVL->getLAI());
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
@@ -7637,9 +7649,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
- PSE.getSE(), DT, LVL.getLAI());
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
new file mode 100644
index 000000000000..5a3438230a2d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s
+
+; Check that when we can't predicate this loop that it is still vectorised (with
+; an epilogue).
+; TODO: the reason this can't be predicated is because a primary induction
+; variable can't be found (not yet) for this counting down loop. But with that
+; fixed, this should be able to be predicated.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 {
+entry:
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+ %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+ %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+ %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+ %0 = load i8, i8* %A.addr.07, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+ %1 = load i8, i8* %B.addr.08, align 1
+ %add = add i8 %1, %0
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+ store i8 %add, i8* %C.addr.09, align 1
+ %dec = add i32 %N.addr.010, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+ br label %while.end
+
+while.end:
+ ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
new file mode 100644
index 000000000000..2667bfe68f61
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+
+; Check that when we can't predicate this loop that it is still vectorised (with
+; an epilogue).
+; TODO: the reason this can't be predicated is because a primary induction
+; variable can't be found (not yet) for this counting down loop. But with that
+; fixed, this should be able to be predicated.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+ %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+ %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+ %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+ %0 = load i8, i8* %A.addr.07, align 1
+ %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+ %1 = load i8, i8* %B.addr.08, align 1
+ %add = add i8 %1, %0
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+ store i8 %add, i8* %C.addr.09, align 1
+ %dec = add i32 %N.addr.010, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+ br label %while.end
+
+while.end:
+ ret void
+}
More information about the llvm-commits
mailing list