[llvm-branch-commits] [llvm] 549e30c - [LV] Don't vectorize if we can prove RT + vector cost >= scalar cost.
Florian Hahn via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Sep 25 12:29:41 PDT 2021
Author: Florian Hahn
Date: 2021-09-25T19:34:59+01:00
New Revision: 549e30c356b635e8d712ad9decbaf54ca520e0e5
URL: https://github.com/llvm/llvm-project/commit/549e30c356b635e8d712ad9decbaf54ca520e0e5
DIFF: https://github.com/llvm/llvm-project/commit/549e30c356b635e8d712ad9decbaf54ca520e0e5.diff
LOG: [LV] Don't vectorize if we can prove RT + vector cost >= scalar cost.
If we can prove that the cost of the runtime checks + the total vector
loop cost exceed the total scalar cost, vectorization with runtime
checks is not profitable.
This is a first step towards guarding against regressions in cases where
we already know runtime checks are unprofitable, as the heuristics get
tweaked.
Differential Revision: https://reviews.llvm.org/D109368
Foo
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c3a440ecc9b85..2fcff89e17160 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -34,7 +34,6 @@ class LoopInfo;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class PredicatedScalarEvolution;
-class LoopVectorizationRequirements;
class LoopVectorizeHints;
class OptimizationRemarkEmitter;
class TargetTransformInfo;
@@ -188,6 +187,8 @@ struct VectorizationFactor {
/// Cost of the scalar loop.
InstructionCost ScalarCost;
+ ElementCount MinProfTripCount;
+
VectorizationFactor(ElementCount Width, InstructionCost Cost,
InstructionCost ScalarCost)
: Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
@@ -265,8 +266,6 @@ class LoopVectorizationPlanner {
const LoopVectorizeHints &Hints;
- LoopVectorizationRequirements &Requirements;
-
OptimizationRemarkEmitter *ORE;
SmallVector<VPlanPtr, 4> VPlans;
@@ -288,10 +287,9 @@ class LoopVectorizationPlanner {
InterleavedAccessInfo &IAI,
PredicatedScalarEvolution &PSE,
const LoopVectorizeHints &Hints,
- LoopVectorizationRequirements &Requirements,
OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
+ PSE(PSE), Hints(Hints), ORE(ORE) {}
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8db7ecc7cd7ef..88c5c722a4790 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -198,11 +198,6 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -443,6 +438,30 @@ namespace llvm {
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ ElementCount MinProfTripCount)
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
+ PSI(PSI), RTChecks(RTChecks) {
+ // Query this against the original loop and save it here because the profile
+ // of the original loop header may change as the transformation happens.
+ OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
+ OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+ if (MinProfTripCount.isZero())
+ this->MinProfTripCount = VecWidth;
+ else
+ this->MinProfTripCount = MinProfTripCount;
+ }
+
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
LoopInfo *LI, DominatorTree *DT,
const TargetLibraryInfo *TLI,
@@ -459,6 +478,8 @@ class InnerLoopVectorizer {
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+ MinProfTripCount = VecWidth;
}
virtual ~InnerLoopVectorizer() = default;
@@ -795,6 +816,8 @@ class InnerLoopVectorizer {
/// vector elements.
ElementCount VF;
+ ElementCount MinProfTripCount;
+
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -2006,6 +2029,25 @@ class GeneratedRTChecks {
}
}
+ InstructionCost getCost(LoopVectorizationCostModel &CM) {
+ InstructionCost RTCheckCost = 0;
+ if (SCEVCheckBlock)
+ for (Instruction &I : *SCEVCheckBlock) {
+ if (SCEVCheckBlock->getTerminator() == &I)
+ continue;
+ RTCheckCost +=
+ CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+ }
+ if (MemCheckBlock)
+ for (Instruction &I : *MemCheckBlock) {
+ if (MemCheckBlock->getTerminator() == &I)
+ continue;
+ RTCheckCost +=
+ CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+ }
+ return RTCheckCost;
+ }
+
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
@@ -3275,8 +3317,14 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {
- Value *Step =
- createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+ Value *Step = nullptr;
+
+ if (UF * VF.getKnownMinValue() < MinProfTripCount.getKnownMinValue())
+ Step = createStepForVF(Builder, ConstantInt::get(Count->getType(), 1),
+ MinProfTripCount);
+ else
+ Step =
+ createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
}
// Create new preheader for vector loop.
@@ -3303,7 +3351,6 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
BasicBlock *const SCEVCheckBlock =
RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
if (!SCEVCheckBlock)
@@ -8163,27 +8210,77 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
if (!SelectedVF.Width.isScalar())
Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
+ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+
// Check if it is profitable to vectorize with runtime checks.
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
- if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysisAliasing(
- DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Hints.emitRemarkWithHints();
- return VectorizationFactor::Disabled();
+ if (!ForceVectorization && SelectedVF.Width.isVector()) {
+ // First, compute the minimum iteration count required so that the vector
+ // loop outperforms the scalar loop.
+ // The total cost of the scalar loop is
+ // ScalarC * TC
+ // where
+ // * TC is the actual trip count of the loop.
+ // * ScalarC is the cost of a single scalar iteration.
+ //
+ // The total cost of the vector loop is
+ // RtC + VecC * (TC / VF) + EpiC
+ // where
+ // * RtC is the cost of the generated runtime checks
+ // * VecC is the cost of a single vector iteration.
+ // * TC is the actual trip count of the loop
+ // * VF is the vectorization factor
+ // * EpiC is the cost of the generated epilogue, including the cost
+ // of the remaining scalar operations.
+ //
+ // Vectorization is profitable once the total vector cost is less than the
+ // total scalar cost:
+ // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
+ //
+ // Now we can compute the minimum required trip count TC as
+ // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+ //
+ // For now we assume the epilogue cost EpiC = 0 for simplicity.
+ unsigned VF = SelectedVF.Width.getKnownMinValue();
+ double ScalarC = *SelectedVF.ScalarCost.getValue();
+ double VecC = double(*SelectedVF.Cost.getValue()) / VF;
+ double RtC = *Checks.getCost(CM).getValue();
+ double MinTC1 = RtC / (ScalarC - VecC);
+
+ // Second, compute a minimum iteration count so that the cost of the runtime
+ // checks is only a fraction of the total scalar loop cost. This adds a
+ // loop-dependent bound on the overhead incurred if the runtime checks fail.
+ // In case the runtime checks fail, the cost is RtC + ScalarC * TC. To bound
+ // the runtime check to be a fraction 1/X of the scalar cost, compute
+ // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
+ double MinTC2 = RtC * 10 / ScalarC;
+
+ // Now pick the larger minimum. If it is not a multiple of VF, choose the
+ // next closest multiple of VF. This should partly compensate for ignoring
+ // the epilogue cost.
+ uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+ uint64_t MinTCDown = MinTC & ~(VF - 1);
+ SelectedVF.MinProfTripCount =
+ ElementCount::getFixed(MinTCDown == MinTC ? MinTC : MinTCDown + VF);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
+ << SelectedVF.MinProfTripCount << "\n");
+
+ // Skip vectorization if the expected trip count is less than the minimum
+ // required trip count.
+ if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) {
+ if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
+ SelectedVF.MinProfTripCount)) {
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
+ "trip count < minimum profitable VF ("
+ << *ExpectedTC << " < " << SelectedVF.MinProfTripCount
+ << ")\n");
+
+ return None;
+ }
}
}
+
return SelectedVF;
}
@@ -10036,8 +10133,7 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
- Requirements, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10274,8 +10370,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectElementTypesForWidening();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
- Requirements, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -10434,7 +10529,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
- &LVL, &CM, BFI, PSI, Checks);
+ &LVL, &CM, BFI, PSI, Checks,
+ VF.MinProfTripCount);
LVP.executePlan(LB, DT);
++LoopsVectorized;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
index ccb57dee6cbf8..ea7642905cf2a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck %s
; Tests for loops with large numbers of runtime checks. Check that loops are
; vectorized, if the loop trip counts are large and the impact of the runtime
@@ -50,18 +50,17 @@ loop: ; preds = %bb54, %bb37
%gep.4 = getelementptr inbounds i16, i16* %ptr.4, i64 %iv
store i16 %trunc.2, i16* %gep.4, align 2
%iv.next = add nuw nsw i64 %iv, 1
- %cmp = icmp ult i64 %iv, 50
+ %cmp = icmp ult i64 %iv, 10
br i1 %cmp, label %loop, label %exit
exit:
ret void
}
-; FIXME
; The trip count in the loop in this function is high enough to warrant large runtime checks.
; CHECK-LABEL: define {{.*}} @test_tc_big_enough
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; CHECK: vector.memcheck
+; CHECK: vector.body
define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
entry:
br label %loop
@@ -109,3 +108,56 @@ loop: ; preds = %bb54, %bb37
exit:
ret void
}
+
+
+define void @test_tc_unknown(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2, i64 %N) {
+; CHECK-LABEL: define void @test_tc_unknown
+; CHECK: [[ADD:%.+]] = add i64 %N, 1
+; CHECK-NEXT: [[C:%.+]] = icmp ult i64 [[ADD]], 16
+; CHECK-NEXT: br i1 [[C]], label %scalar.ph, label %vector.memcheck
+entry:
+ br label %loop
+
+loop: ; preds = %bb54, %bb37
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.1 = getelementptr inbounds i16, i16* %ptr.1, i64 %iv
+ %lv.1 = load i16, i16* %gep.1, align 2
+ %ext.1 = sext i16 %lv.1 to i32
+ %gep.2 = getelementptr inbounds i16, i16* %ptr.2, i64 %iv
+ %lv.2 = load i16, i16* %gep.2, align 2
+ %ext.2 = sext i16 %lv.2 to i32
+ %gep.off.1 = getelementptr inbounds i16, i16* %gep.2, i64 %off.1
+ %lv.3 = load i16, i16* %gep.off.1, align 2
+ %ext.3 = sext i16 %lv.3 to i32
+ %gep.off.2 = getelementptr inbounds i16, i16* %gep.2, i64 %off.2
+ %lv.4 = load i16, i16* %gep.off.2, align 2
+ %ext.4 = sext i16 %lv.4 to i32
+ %tmp62 = mul nsw i32 %ext.2, 11
+ %tmp66 = mul nsw i32 %ext.3, -4
+ %tmp70 = add nsw i32 %tmp62, 4
+ %tmp71 = add nsw i32 %tmp70, %tmp66
+ %tmp72 = add nsw i32 %tmp71, %ext.4
+ %tmp73 = lshr i32 %tmp72, 3
+ %tmp74 = add nsw i32 %tmp73, %ext.1
+ %tmp75 = lshr i32 %tmp74, 1
+ %tmp76 = mul nsw i32 %ext.2, 5
+ %tmp77 = shl nsw i32 %ext.3, 2
+ %tmp78 = add nsw i32 %tmp76, 4
+ %tmp79 = add nsw i32 %tmp78, %tmp77
+ %tmp80 = sub nsw i32 %tmp79, %ext.4
+ %tmp81 = lshr i32 %tmp80, 3
+ %tmp82 = sub nsw i32 %tmp81, %ext.1
+ %tmp83 = lshr i32 %tmp82, 1
+ %trunc.1 = trunc i32 %tmp75 to i16
+ %gep.3 = getelementptr inbounds i16, i16* %ptr.3, i64 %iv
+ store i16 %trunc.1, i16* %gep.3, align 2
+ %trunc.2 = trunc i32 %tmp83 to i16
+ %gep.4 = getelementptr inbounds i16, i16* %ptr.4, i64 %iv
+ store i16 %trunc.2, i16* %gep.4, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cmp = icmp ult i64 %iv, %N
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
index 50b80e0082ac4..6b85b487f85b6 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
@@ -200,7 +200,7 @@ define void @arm_abs_q31(i32* nocapture readonly %pSrc, i32* nocapture %pDst, i3
; CHECK-NEXT: [[CMP_NOT14:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
; CHECK-NEXT: br i1 [[CMP_NOT14]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
; CHECK: while.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PDST:%.*]], i32 [[BLOCKSIZE]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index bb0b19a1ad8e4..165efe605222f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -1765,7 +1765,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; AVX512-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 2
; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 60
+; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
index 628457a1c1e43..4cf0ae3d121db 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -1,4 +1,6 @@
-; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S %s | FileCheck %s
+; REQUIRES: asserts
+
+; RUN: opt -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -8,11 +10,14 @@ declare double @llvm.pow.f64(double, double)
; Test case where the memory runtime checks and vector body is more expensive
; than running the scalar loop.
-; TODO: should not be vectorized.
define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
+; CHECK: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 72)
+;
; CHECK-LABEL: @test(
-; CHECK: vector.memcheck
-; CHECK: vector.body
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %for.body
+; CHECK-NOT: vector.memcheck
+; CHECK-NOT: vector.body
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index d06f9d3aeca89..e1008c38417d5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -15,7 +15,7 @@ define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrsp
; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
; CHECK-NEXT: [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 20
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index dc30fc1ab5f16..4972a5ffdb660 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -41,7 +41,7 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1
; CHECK-NEXT: [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 60
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[CONV3]], -1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index 1b5622dbde7d1..86fae7804060e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
-; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -8,20 +7,12 @@ target triple = "x86_64-unknown-linux"
; First loop produced diagnostic pass remark.
;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
; Second loop produces diagnostic analysis remark.
-;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
-
-; First loop produced diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
-; Second loop produces diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
; We are vectorizing with 6 runtime checks.
;CHECK-LABEL: func1x6(
;CHECK: <4 x i32>
;CHECK: ret
-;OVERRIDE-LABEL: func1x6(
-;OVERRIDE: <4 x i32>
-;OVERRIDE: ret
define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
entry:
br label %for.body
@@ -52,14 +43,10 @@ for.end: ; preds = %for.body
ret i32 undef
}
-; We are not vectorizing with 12 runtime checks.
+; We are vectorizing with 12 runtime checks.
;CHECK-LABEL: func2x6(
-;CHECK-NOT: <4 x i32>
+;CHECK: <4 x i32>
;CHECK: ret
-; We vectorize with 12 checks if a vectorization hint is provided.
-;OVERRIDE-LABEL: func2x6(
-;OVERRIDE-NOT: <4 x i32>
-;OVERRIDE: ret
define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
entry:
br label %for.body
@@ -100,4 +87,3 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret i32 undef
}
-
More information about the llvm-branch-commits
mailing list