[llvm] [LV] Initial support for safe distance in predicated DataWithEVL vectorization mode. (PR #102897)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 27 05:53:47 PDT 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/102897
>From fadc2abb26e062e9c52e43d4c0e4335f1fd8fca4 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 12 Aug 2024 13:12:21 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/LoopVectorize.cpp | 57 ++++++---
llvm/lib/Transforms/Vectorize/VPlan.h | 7 ++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 +-
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
...e-force-tail-with-evl-safe-dep-distance.ll | 118 ++++++++++++------
6 files changed, 141 insertions(+), 61 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 542d74ef0e1ef1..3b1c4830159969 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1445,9 +1445,8 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- /// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1470,12 +1469,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- bool EVLIsLegal =
- IsScalableVF && UserIC <= 1 &&
- TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ bool EVLIsLegal = UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1493,6 +1489,14 @@ class LoopVectorizationCostModel {
}
}
+ /// Disables the previously chosen tail folding policy and sets it to None.
+ /// Expects that a tail folding policy has already been selected.
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1500,6 +1504,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ /// Returns the maximum safe number of elements that can be processed
+ /// without preventing store-load forwarding.
+ /// TODO: consider adjusting the cost model to use this value as the
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxEVLSafeElements() const {
+ return MaxEVLSafeElements;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1651,6 +1663,10 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
+ /// Maximum safe number of elements that can be processed without
+ /// preventing store-load forwarding.
+ std::optional<unsigned> MaxEVLSafeElements;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -3903,9 +3919,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// dependence distance).
unsigned MaxSafeElements =
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxScalableSafeElements = MaxSafeElements;
+ if (foldTailWithEVL() && !Legal->isSafeForAnyVectorWidth()) {
+ MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+ MaxEVLSafeElements = MaxSafeElements;
+ }
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
@@ -4075,7 +4096,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(UserIC);
+ FixedScalableVFPair MaxFactors =
+ computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -4106,15 +4133,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -8496,8 +8519,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() &&
- !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+ *Plan, CM.getMaxEVLSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
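
To summarize the cost-model changes above: the tail-folding style is now selected before the feasible maximum VFs are computed, and when the tail is folded with EVL on a loop that is not safe for every vector width, the scalable VF is left uncapped and the element limit is recorded for runtime enforcement instead. A condensed sketch of that decision (the names mirror the diff; this is a sketch, not the complete cost-model class):

    #include <limits>
    #include <optional>

    // Returns the cap to apply to the scalable VF; when EVL tail folding
    // will enforce safety at runtime, no compile-time cap is needed and
    // the limit is stashed in MaxEVLSafeElements instead.
    unsigned scalableSafeElementCap(bool FoldTailWithEVL,
                                    bool SafeForAnyVectorWidth,
                                    unsigned MaxSafeElements,
                                    std::optional<unsigned> &MaxEVLSafeElements) {
      if (FoldTailWithEVL && !SafeForAnyVectorWidth) {
        MaxEVLSafeElements = MaxSafeElements; // clamp applied at runtime
        return std::numeric_limits<unsigned>::max();
      }
      return MaxSafeElements; // fixed-VF and non-EVL paths keep the cap
    }
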
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 016ad75c21ceb0..c80538c03b5751 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1239,6 +1239,13 @@ class VPInstruction : public VPRecipeWithIRFlags {
SLPLoad,
SLPStore,
ActiveLaneMask,
+ /// Creates a special scalar explicit-vector-length instruction, which
+ /// computes the vectorization factor (the number of iterations that can
+ /// be executed simultaneously) at runtime.
+ /// Has two mandatory operands: the EVL (explicit vector length) from the
+ /// previous iteration and the original trip count.
+ /// Also has one optional operand: the maximum safe distance allowed for
+ /// the loop.
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
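
As a reading aid for the new opcode documentation above, the semantics of ExplicitVectorLength with the optional third operand can be written as a plain function. Here HWVLMax stands in for the bound that llvm.experimental.get.vector.length applies; it is an illustrative parameter, not part of the recipe:

    #include <algorithm>
    #include <cstdint>
    #include <optional>

    // One step of the EVL computation: remaining trip count, optionally
    // clamped by the max safe distance, then bounded by the target.
    uint64_t explicitVectorLength(uint64_t EVLBasedIV, uint64_t TripCount,
                                  std::optional<uint64_t> MaxSafeDistance,
                                  uint64_t HWVLMax) {
      uint64_t AVL = TripCount - EVLBasedIV;   // elements still to process
      if (MaxSafeDistance)
        AVL = std::min(AVL, *MaxSafeDistance); // optional third operand
      return std::min(AVL, HWVLMax);           // get.vector.length stand-in
    }
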
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a93f275a39f5f..cd7b854ea848d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+ if (getNumOperands() == 3) {
+ Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+ AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+ MaxSafeVF);
+ }
Value *EVL = State.Builder.CreateIntrinsic(
State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, State.Builder.getTrue()});
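
The hunk above emits a umin ahead of the get.vector.length call whenever the recipe carries the optional third operand. Pulled out of the recipe for illustration, the emitted sequence looks roughly like this (assumes a live IRBuilder and i64 AVL/MaxSafeVF values; the recipe's surrounding state is omitted):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    Value *emitClampedEVL(IRBuilderBase &Builder, Value *AVL,
                          Value *MaxSafeVF, unsigned MinVF) {
      // %avl = call i64 @llvm.umin.i64(i64 %avl, i64 %max.safe.dist)
      AVL = Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL, MaxSafeVF);
      // %evl = call i32 @llvm.experimental.get.vector.length.i64(
      //          i64 %avl, i32 %min.vf, i1 true)
      return Builder.CreateIntrinsic(
          Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
          {AVL, Builder.getInt32(MinVF), Builder.getTrue()});
    }
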
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 045f6c356669fa..17fffa0ac2e0a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+ VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
- auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
- {EVLPhi, Plan.getTripCount()});
+ SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+ if (MaxEVLSafeElements)
+ Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+ CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+ DebugLoc());
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *CanonicalIVIncrement =
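
Putting the transform together, the loop shape it builds corresponds to the following scalar analogue (Dist and HWVLMax are illustrative stand-ins for the loop's dependence distance and the target's get.vector.length bound; the real vector body uses vp.load/vp.store rather than a scalar inner loop):

    #include <algorithm>
    #include <cstdint>

    void evlLoopSketch(int64_t *P, uint64_t TripCount, uint64_t Dist,
                       uint64_t MaxSafeElements, uint64_t HWVLMax) {
      uint64_t EVLBasedIV = 0;                 // VPEVLBasedIVPHIRecipe
      while (EVLBasedIV < TripCount) {
        uint64_t AVL = TripCount - EVLBasedIV; // remaining elements
        AVL = std::min(AVL, MaxSafeElements);  // the new umin clamp
        uint64_t EVL = std::min(AVL, HWVLMax); // ExplicitVectorLength
        for (uint64_t J = 0; J < EVL; ++J)     // vp.load + vp.store
          P[J + EVLBasedIV + Dist] = P[J + EVLBasedIV];
        EVLBasedIV += EVL;                     // NextEVLIV increment
      }
    }

This matches the IF-EVL check lines in the updated test below: the sub/umin pair feeding @llvm.experimental.get.vector.length, and the EVL-based IV advanced by the zero-extended result.
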
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..8158c832f1a951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
- static bool tryAddExplicitVectorLength(VPlan &Plan);
+ static bool
+ tryAddExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8a7..a1401a247a53e9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -103,24 +103,38 @@ define void @test_may_clobber1(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 4)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 100
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -245,24 +259,38 @@ define void @test_may_clobber3(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 10
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 2)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 10
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -422,28 +450,38 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
-; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 1024)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 1024
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]