[llvm] [LV]Enable max safe distance in predicated DataWithEVL vectorization mode. (PR #100755)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 24 13:29:36 PST 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/100755
>From 4c16b87b90aee4bebc1b6640af554fd449565d8d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 26 Jul 2024 14:38:27 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../llvm/Analysis/LoopAccessAnalysis.h | 25 ++
.../Vectorize/LoopVectorizationLegality.h | 12 +
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 55 ++++-
.../Transforms/Vectorize/LoopVectorize.cpp | 75 ++++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 +-
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
.../LoopAccessAnalysis/depend_diff_types.ll | 4 +-
.../forward-loop-independent.ll | 5 +-
.../forward-negative-step.ll | 5 +-
.../num-iters-for-store-load-conflict.ll | 40 ++--
.../Analysis/LoopAccessAnalysis/pr64637.ll | 5 +-
.../stride-access-dependence.ll | 5 +-
.../RISCV/riscv-vector-reverse.ll | 4 +-
...e-force-tail-with-evl-safe-dep-distance.ll | 225 +++++++++++++-----
.../LoopVectorize/memory-dep-remarks.ll | 25 +-
16 files changed, 359 insertions(+), 146 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e83f2e0a..b661e117d01ee7 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -37,6 +37,8 @@ class Value;
struct VectorizerParams {
/// Maximum SIMD width.
static const unsigned MaxVectorWidth;
+ /// Maximum LMUL factor.
+ static const unsigned MaxVectorLMUL;
/// VF as overridden by the user.
static unsigned VectorizationFactor;
@@ -222,6 +224,23 @@ class MemoryDepChecker {
return MaxSafeVectorWidthInBits;
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.first == std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.first;
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.second;
+ }
+
/// In same cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() const {
@@ -310,6 +329,12 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;
+ /// Maximum number of elements (power-of-2 and non-power-of-2), which do not
+ /// prevent store-load forwarding.
+ std::pair<uint64_t, uint64_t> MaxStoreLoadForwardSafeVF =
+ std::make_pair(std::numeric_limits<uint64_t>::max(),
+ std::numeric_limits<uint64_t>::max());
+
/// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool FoundNonConstantDistanceDependence = false;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bfe..c16a5f9a1344c2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,18 @@ class LoopVectorizationLegality {
return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
+ }
+
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7ef30773..29816bd1d845cc 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
+/// Maximum LMUL factor.
+const unsigned VectorizerParams::MaxVectorLMUL = 8;
/// We collect dependences up to this threshold.
static cl::opt<unsigned>
@@ -1764,31 +1766,64 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// cause any slowdowns.
const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
- uint64_t MaxVFWithoutSLForwardIssues = std::min(
- VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+ uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+ std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.first);
+ uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+ std::min(VectorizerParams::MaxVectorLMUL *
+ VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.second);
// Compute the smallest VF at which the store and load would be misaligned.
- for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
- VF *= 2) {
+ for (uint64_t VF = 2 * TypeByteSize;
+ VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
// If the number of vector iteration between the store and the load are
// small we could incur conflicts.
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
- MaxVFWithoutSLForwardIssues = (VF >> 1);
+ MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
+ break;
+ }
+ }
+ // RISCV VLA supports non-power-2 vector factor. So, we iterate in a
+ // backward order to find largest VF, which allows aligned stores-loads or
+ // the number of iterations between conflicting memory addresses is not less
+ // than 8 (NumItersForStoreLoadThroughMemory).
+ for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
+ E = 2 * TypeByteSize;
+ VF >= E; VF -= TypeByteSize) {
+ if (Distance % VF == 0 ||
+ Distance / VF >= NumItersForStoreLoadThroughMemory) {
+ uint64_t GCD = MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max()
+ ? VF
+ : std::gcd(MaxStoreLoadForwardSafeVF.second, VF);
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
break;
}
}
- if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}
- if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
- MaxVFWithoutSLForwardIssues !=
- VectorizerParams::MaxVectorWidth * TypeByteSize)
- MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = 1;
+ else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
+ MaxStoreLoadForwardSafeVF.first &&
+ MaxVFWithoutSLForwardIssuesPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
+ if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = 1;
+ else if (MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+ MaxStoreLoadForwardSafeVF.second &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f526808..28e814e9c89e9c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1444,9 +1444,8 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- /// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1470,11 +1469,9 @@ class LoopVectorizationCostModel {
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal =
- IsScalableVF && UserIC <= 1 &&
+ UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1492,6 +1489,14 @@ class LoopVectorizationCostModel {
}
}
+ /// Disables previously chosen tail folding policy, sets it to None. Expects,
+ /// that the tail policy was selected.
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1499,6 +1504,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ /// Return maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ /// TODO: need to consider adjusting cost model to use this value as a
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxEVLSafeElements() const {
+ return MaxEVLSafeElements;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1654,6 +1667,10 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
+ /// Maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ std::optional<unsigned> MaxEVLSafeElements;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -3903,11 +3920,31 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeElements =
- llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+ if (Legal->isSafeForAnyVectorWidth())
+ MaxSafeElements = PowerOf2Ceil(MaxSafeElements);
+ unsigned MaxFixedSafeElements = std::gcd(
+ MaxSafeElements,
+ Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(MaxSafeElements));
+ MaxFixedSafeElements = bit_floor(MaxFixedSafeElements);
+ unsigned MaxScalableSafeElements = MaxFixedSafeElements;
+ if (foldTailWithEVL()) {
+ MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+ std::optional<unsigned> SafeStoreLoadForwarding =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2();
+ if (!Legal->isSafeForAnyVectorWidth() || SafeStoreLoadForwarding) {
+ unsigned SLForwardDist =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2().value_or(
+ MaxSafeElements);
+ if (MaxSafeElements >= SLForwardDist)
+ MaxEVLSafeElements = SLForwardDist;
+ else
+ MaxEVLSafeElements = std::gcd(MaxSafeElements, SLForwardDist);
+ }
+ }
- auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxFixedSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
@@ -4077,7 +4114,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(UserIC);
+ FixedScalableVFPair MaxFactors =
+ computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -4108,15 +4151,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -8388,8 +8427,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() &&
- !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+ *Plan, CM.getMaxEVLSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d3..de24688593ebe9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+ if (getNumOperands() == 3) {
+ Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+ AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+ MaxSafeVF);
+ }
Value *EVL = State.Builder.CreateIntrinsic(
State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, State.Builder.getTrue()});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e311..e703bb893d9383 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+ VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
- auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
- {EVLPhi, Plan.getTripCount()});
+ SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+ if (MaxEVLSafeElements)
+ Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+ CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+ DebugLoc());
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..8158c832f1a951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
- static bool tryAddExplicitVectorLength(VPlan &Plan);
+ static bool
+ tryAddExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
};
} // namespace llvm
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 81d8b01fe7fb72..c5ba25a5c0ace5 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 7fc9958dba552b..6e4bcec013a73a 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -24,14 +24,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
; CHECK-LABEL: 'f'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
index 46e81cd74ab31f..f182a3d2aa7d17 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
@@ -47,10 +47,9 @@ exit:
define void @neg_step_ForwardButPreventsForwarding(ptr nocapture %A, ptr noalias %B) {
; CHECK-LABEL: 'neg_step_ForwardButPreventsForwarding'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.A, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.A.plus.1, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
index d3eda21dee27e6..3bb51553692056 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
@@ -4,10 +4,9 @@
define void @forward_dist_7(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_7'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -40,10 +39,9 @@ exit:
define void @forward_dist_9(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_9'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -76,10 +74,9 @@ exit:
define void @forward_dist_11(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_11'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -112,10 +109,9 @@ exit:
define void @forward_dist_13(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_13'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -148,10 +144,9 @@ exit:
define void @forward_dist_15(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_15'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -184,10 +179,9 @@ exit:
define void @forward_dist_17(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_17'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -220,10 +214,9 @@ exit:
define void @forward_dist_19(ptr %A, ptr noalias %B) {
; CHECK-LABEL: 'forward_dist_19'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i32, ptr %gep.1, align 4
; CHECK-EMPTY:
@@ -258,10 +251,9 @@ exit:
define void @unknown_loop_bounds(i64 %x, i64 %y) {
; CHECK-LABEL: 'unknown_loop_bounds'
; CHECK-NEXT: inner:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2368 bits
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %l = load double, ptr %gep.0, align 8 ->
; CHECK-NEXT: store double %l, ptr %gep.1, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
index d3e589cf99cf36..4c0fb6adce1a80 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
@@ -20,10 +20,9 @@
define void @foo(ptr noalias nocapture noundef %y, ptr noalias nocapture noundef readnone %x, ptr noalias nocapture noundef readonly %indices, i32 noundef %n) {
; CHECK-LABEL: 'foo'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 96 bits
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index ef19e173b65994..6aeebebb057665 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -416,10 +416,9 @@ for.body: ; preds = %entry, %for.body
define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 32 bits
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842580425b..4906cc1ca30421 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -21,7 +21,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -226,7 +226,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Scalable vectorization is available
-; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT: LV: The max safe fixed VF is: 134217728.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8a7..6143ca8de8b346 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -103,24 +103,38 @@ define void @test_may_clobber1(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 100)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 100
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -192,17 +206,51 @@ exit:
define void @test_may_clobber2(ptr %p) {
; IF-EVL-LABEL: @test_may_clobber2(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 9)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 9
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 9
; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: exit:
; IF-EVL-NEXT: ret void
;
@@ -245,24 +293,38 @@ define void @test_may_clobber3(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 10
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 10)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 10
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -273,7 +335,7 @@ define void @test_may_clobber3(ptr %p) {
; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; IF-EVL: exit:
; IF-EVL-NEXT: ret void
;
@@ -363,7 +425,7 @@ define void @trivial_due_max_vscale(ptr %p) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -378,7 +440,7 @@ define void @trivial_due_max_vscale(ptr %p) {
; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
; IF-EVL: exit:
; IF-EVL-NEXT: ret void
;
@@ -422,28 +484,38 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
-; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 1024)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 1024
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -454,7 +526,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
; IF-EVL: exit:
; IF-EVL-NEXT: ret void
;
@@ -495,22 +567,63 @@ exit:
define void @non-power-2-storeloadforward(ptr %A) {
; IF-EVL-LABEL: @non-power-2-storeloadforward(
; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 112, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[IND_END:%.*]] = add i64 16, [[N_VEC]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 112, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 3)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 4, i1 true)
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = add i64 16, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw i64 [[TMP8]], -3
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP8]], 4
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 16, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add nsw i64 [[IV]], -3
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[TMP2:%.*]] = add nsw i64 [[IV]], 4
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP20:%.*]] = add nsw i64 [[IV]], -3
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = add nsw i64 [[IV]], 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IF-EVL-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP15:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
index d96d85512621c6..6945afd09dd072 100644
--- a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
@@ -194,9 +194,7 @@ for.body: ; preds = %for.body.preheader,
; }
; }
-; CHECK: remark: source.c:61:12: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:60:5
-
+; CHECK-NOT: remark: source.c:{{0-9]+}}:{{[0-9]+}}:
define void @test_forwardButPreventsForwarding_dep(i64 %n, ptr nocapture %A, ptr nocapture %B) !dbg !166 {
entry:
%cmp11 = icmp sgt i64 %n, 3
@@ -233,7 +231,7 @@ for.body: ; preds = %entry, %for.body
; }
; CHECK: remark: source.c:74:5: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:74:21
+; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding.
define void @test_backwardVectorizableButPreventsForwarding(i64 %n, ptr nocapture %A) !dbg !189 {
entry:
@@ -328,25 +326,19 @@ for.body: ; preds = %entry, %for.body
; YAML-NEXT: Args:
; YAML-NEXT: - String: loop not vectorized
; YAML-NEXT: ...
-; YAML-NEXT: --- !Analysis
+; YAML-NEXT: --- !Missed
; YAML-NEXT: Pass: loop-vectorize
-; YAML-NEXT: Name: UnsafeDep
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 61, Column: 12 }
+; YAML-NEXT: Name: VectorizationNotBeneficial
; YAML-NEXT: Function: test_forwardButPreventsForwarding_dep
; YAML-NEXT: Args:
-; YAML-NEXT: - String: 'loop not vectorized: '
-; YAML-NEXT: - String: 'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
-; YAML-NEXT: - String: "\nForward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT: - String: ' Memory location is the same as accessed at '
-; YAML-NEXT: - Location: 'source.c:60:5'
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 60, Column: 5 }
+; YAML-NEXT: - String: the cost-model indicates that vectorization is not beneficial
; YAML-NEXT: ...
; YAML-NEXT: --- !Missed
; YAML-NEXT: Pass: loop-vectorize
-; YAML-NEXT: Name: MissedDetails
+; YAML-NEXT: Name: InterleavingNotBeneficial
; YAML-NEXT: Function: test_forwardButPreventsForwarding_dep
; YAML-NEXT: Args:
-; YAML-NEXT: - String: loop not vectorized
+; YAML-NEXT: - String: the cost-model indicates that interleaving is not beneficial
; YAML-NEXT: ...
; YAML-NEXT: --- !Analysis
; YAML-NEXT: Pass: loop-vectorize
@@ -357,9 +349,6 @@ for.body: ; preds = %entry, %for.body
; YAML-NEXT: - String: 'loop not vectorized: '
; YAML-NEXT: - String: 'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
; YAML-NEXT: - String: "\nBackward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT: - String: ' Memory location is the same as accessed at '
-; YAML-NEXT: - Location: 'source.c:74:21'
-; YAML-NEXT: DebugLoc: { File: source.c, Line: 74, Column: 21 }
; YAML-NEXT: ...
; YAML-NEXT: --- !Missed
; YAML-NEXT: Pass: loop-vectorize
More information about the llvm-commits
mailing list