[llvm] c42bb30 - [LoopVectorize] Permit fixed-width epilogue loops for scalable vector bodies
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 8 01:41:19 PST 2021
Author: David Sherwood
Date: 2021-11-08T09:41:13Z
New Revision: c42bb30b9e2950efd1d1bd2df9ab8e5b098e65c7
URL: https://github.com/llvm/llvm-project/commit/c42bb30b9e2950efd1d1bd2df9ab8e5b098e65c7
DIFF: https://github.com/llvm/llvm-project/commit/c42bb30b9e2950efd1d1bd2df9ab8e5b098e65c7.diff
LOG: [LoopVectorize] Permit fixed-width epilogue loops for scalable vector bodies
At the moment, LoopVectorizationCostModel::selectEpilogueVectorizationFactor
bails out if the main vector loop uses a scalable VF. This patch adds
support for generating epilogue vector loops using a fixed-width VF when the
main vector loop uses a scalable VF.
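As a rough illustration of the VF conversion described above (not part of the
patch), the sketch below shows how a scalable VF such as vscale x 16 maps onto
the fixed-width VF that the epilogue profitability checks operate on; it only
relies on the public llvm::ElementCount API from llvm/Support/TypeSize.h.

// Illustrative only -- not part of the patch. Shows how a scalable VF like
// <vscale x 16> is converted to the fixed-width VF (16) used for the
// epilogue profitability checks.
#include "llvm/Support/TypeSize.h"
#include <cassert>

int main() {
  llvm::ElementCount MainLoopVF = llvm::ElementCount::getScalable(16);

  // Drop the vscale multiplier and keep only the known minimum lane count.
  llvm::ElementCount FixedMainLoopVF =
      llvm::ElementCount::getFixed(MainLoopVF.getKnownMinValue());

  assert(!FixedMainLoopVF.isScalable());
  assert(FixedMainLoopVF.getKnownMinValue() == 16);
  return 0;
}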
I've changed LoopVectorizationCostModel::selectEpilogueVectorizationFactor
so that we convert the scalable VF into a fixed-width VF and do the
profitability checks on that instead. In addition, since the scalable and
fixed-width VFs live in different VPlans, I had to change the calls to
LVP.hasPlanWithVFs so that we only pass in the fixed-width VF.
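To make the reasoning behind that interface change concrete, here is a
simplified sketch; MockPlan is a hypothetical stand-in for VPlan that only
models the hasVF() query, not the real class. Once the scalable main-loop VF
and the fixed-width epilogue VF live in different plans, asking a single plan
for both VFs always fails, whereas asking whether any plan contains just the
epilogue VF succeeds.

// Simplified sketch; MockPlan is a hypothetical stand-in for VPlan.
#include "llvm/Support/TypeSize.h"
#include <algorithm>
#include <vector>

using llvm::ElementCount;

struct MockPlan {
  std::vector<ElementCount> VFs;
  bool hasVF(ElementCount VF) const {
    return std::find(VFs.begin(), VFs.end(), VF) != VFs.end();
  }
};

// New-style query: true if *any* plan contains the given (fixed-width) VF.
bool hasPlanWithVF(const std::vector<MockPlan> &Plans, ElementCount VF) {
  return std::any_of(Plans.begin(), Plans.end(),
                     [&](const MockPlan &P) { return P.hasVF(VF); });
}

int main() {
  // One plan for the scalable main loop, one for the fixed-width epilogue.
  MockPlan MainPlan{{ElementCount::getScalable(16)}};
  MockPlan EpiPlan{{ElementCount::getFixed(8)}};
  std::vector<MockPlan> Plans{MainPlan, EpiPlan};

  // The epilogue VF is found even though it lives in a different plan from
  // the scalable main-loop VF, which is why only the fixed-width VF is passed.
  return hasPlanWithVF(Plans, ElementCount::getFixed(8)) ? 0 : 1;
}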
New tests added here:
Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
Differential Revision: https://reviews.llvm.org/D109432
Added:
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 5a2eb809b681..a7d6609f8c56 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -303,12 +303,9 @@ class LoopVectorizationPlanner {
/// Look through the existing plans and return true if we have one with all
/// the vectorization factors in question.
- bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
- return any_of(VPlans, [&](const VPlanPtr &Plan) {
- return all_of(VFs, [&](const ElementCount &VF) {
- return Plan->hasVF(VF);
- });
- });
+ bool hasPlanWithVF(ElementCount VF) const {
+ return any_of(VPlans,
+ [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
}
/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index da5298684647..cc6b0607a9ad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6233,15 +6233,6 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- // FIXME: This can be fixed for scalable vectors later, because at this stage
- // the LoopVectorizer will only consider vectorizing a loop with scalable
- // vectors when the loop has a hint to enable vectorization for a given VF.
- if (MainLoopVF.isScalable()) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
- "yet supported.\n");
- return Result;
- }
-
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
@@ -6254,7 +6245,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
if (EpilogueVectorizationForceVF > 1) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
- if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC}))
+ if (LVP.hasPlanWithVF(ForcedEC))
return {ForcedEC, 0};
else {
LLVM_DEBUG(
@@ -6272,14 +6263,24 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
- if (!isEpilogueVectorizationProfitable(MainLoopVF))
+ auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+ if (MainLoopVF.isScalable())
+ LLVM_DEBUG(
+ dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
+ "yet supported. Converting to fixed-width (VF="
+ << FixedMainLoopVF << ") instead\n");
+
+ if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
+ "this loop\n");
return Result;
+ }
for (auto &NextVF : ProfitableVFs)
- if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+ if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
(Result.Width.getFixedValue() == 1 ||
isMoreProfitable(NextVF, Result)) &&
- LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+ LVP.hasPlanWithVF(NextVF.Width))
Result = NextVF;
if (Result != VectorizationFactor::Disabled())
@@ -8408,7 +8409,9 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+
+ IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
+ Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
Induction =
@@ -10418,7 +10421,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
F->getParent()->getDataLayout());
if (!VF.Width.isScalar() || IC > 1)
Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
- VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
using namespace ore;
if (!VectorizeLoop) {
@@ -10427,6 +10429,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM, BFI, PSI, Checks);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
ORE->emit([&]() {
@@ -10450,7 +10454,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestPlan, MainILV, DT);
+ VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+ LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
+ DT);
++LoopsVectorized;
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
@@ -10463,7 +10469,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks);
- LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestPlan, EpilogILV,
+
+ VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+ LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT);
++LoopsEpilogueVectorized;
@@ -10472,6 +10480,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM, BFI, PSI, Checks);
+
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
++LoopsVectorized;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
new file mode 100644
index 000000000000..d175eed822c1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -force-target-instruction-cost=1 -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED
+
+target triple = "aarch64-linux-gnu"
+
+; DEBUG: LV: Checking a loop in "f1"
+; DEBUG: LEV: Epilogue vectorization using scalable vectors not yet supported. Converting to fixed-width (VF=16) instead
+; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; DEBUG-FORCED: LV: Checking a loop in "f1"
+; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
+; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+define void @f1(i8* %A) #0 {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 16 x i8>*
+; CHECK-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8>* [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 16
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 16 x i8>*
+; CHECK-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8>* [[TMP19]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>*
+; CHECK-NEXT: store <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <8 x i8>* [[TMP24]], align 1
+; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
+; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
+ store i8 1, i8* %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, 1024
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
index c42bb332ab28..61c5f3824c11 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@@ -1,11 +1,12 @@
; REQUIRES: asserts
-; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
; Currently we cannot handle scalable vectorization factors.
; CHECK: LV: Checking a loop in "f1"
-; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
+; CHECK: LEV: Epilogue vectorization factor is forced.
+; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1
define void @f1(i8* %A) {
entry: