[llvm] [LV][EVL] Support fixed-order recurrence idiom with EVL tail folding. (PR #124093)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 17 06:32:49 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-analysis
Author: Mel Chen (Mel-Chen)
<details>
<summary>Changes</summary>
This patch converts the llvm.vector.splice intrinsic to llvm.experimental.vp.splice, ensuring that fixed-order recurrences execute correctly when tail folding by EVL is enable.
Due to the non-VFxUF penultimate EVL issue, the EVL from the previous iteration will be preserved and used in llvm.experimental.vp.splice.
Fixes https://github.com/llvm/llvm-project/issues/122461
---
Patch is 39.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124093.diff
9 Files Affected:
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+1-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+11-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+86-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+5-1)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll (+55-77)
- (added) llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll (+81)
``````````diff
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 53be7fc0bee9f..db7f2d1d75952 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -160,6 +160,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
case Intrinsic::umul_fix:
case Intrinsic::umul_fix_sat:
return (ScalarOpdIdx == 2);
+ case Intrinsic::experimental_vp_splice:
+ return ScalarOpdIdx == 2 || ScalarOpdIdx == 4 || ScalarOpdIdx == 5;
default:
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..bafdbf9f9f67d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1452,12 +1452,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
- // penultimate EVL.
bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- Legal->getFixedOrderRecurrences().empty();
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fbbc466f2f7f6..2d7e1ce2dbec5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -535,6 +535,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPScalarPHISC:
case VPRecipeBase::VPPartialReductionSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index bf61251fc9133..2347149f65762 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -78,6 +78,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
return SetResultTyFromOp();
+ case Instruction::ExtractElement:
case VPInstruction::ExtractFirstActive:
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1bba667c206cf..08c26af7f657a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -478,6 +478,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
+ case Instruction::ExtractElement: {
+ assert(State.VF.isVector() && "Only extract elements from vectors");
+ Value *Vec = State.get(getOperand(0));
+ Value *Idx = State.get(getOperand(1), /*IsScalar*/ true);
+ return Builder.CreateExtractElement(Vec, Idx, Name);
+ }
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
@@ -752,7 +758,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
bool VPInstruction::isVectorToScalar() const {
- return getOpcode() == VPInstruction::ExtractFromEnd ||
+ return getOpcode() == Instruction::ExtractElement ||
+ getOpcode() == VPInstruction::ExtractFromEnd ||
getOpcode() == VPInstruction::ExtractFirstActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
@@ -814,6 +821,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
switch (getOpcode()) {
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::ExtractElement:
case VPInstruction::AnyOf:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -851,6 +859,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
return true;
+ case Instruction::ExtractElement:
+ return Op == getOperand(1);
};
llvm_unreachable("switch should return");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c917e4eef655..d7145b2a5521d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1635,6 +1635,48 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Adjust the way the resume value is obtained when using tail folding by EVL.
+/// Expanding ExtractFromEnd since the penultimate EVL could not equals to
+/// VFxUF. Expand
+/// %resume = ExtractFromEnd %vec, 1
+/// to
+/// %last.active.idx = sub %EVL, 1
+/// %resume = extractelement %vec, %last.active.idx
+static void adjustResumePhisForEVL(VPlan &Plan, VPValue &EVL) {
+ LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ using namespace VPlanPatternMatch;
+ for (VPRecipeBase &R : *cast<VPBasicBlock>(Plan.getScalarPreheader())) {
+ VPValue *FromMiddleBlock;
+ if (!match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
+ m_VPValue(FromMiddleBlock), m_VPValue())))
+ continue;
+
+ VPValue *ExtractFrom;
+ if (match(FromMiddleBlock, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+ m_VPValue(ExtractFrom), m_SpecificInt(1)))) {
+ // Skip if all elements are uniform.
+ if (vputils::isUniformAfterVectorization(ExtractFrom))
+ continue;
+ auto *ExtractR = cast<VPInstruction>(FromMiddleBlock);
+ VPBuilder Builder(ExtractR);
+ VPValue *OneVPV =
+ Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(Ctx), 1));
+ VPValue *LastActiveIdx =
+ Builder.createNaryOp(Instruction::Sub, {&EVL, OneVPV},
+ ExtractR->getDebugLoc(), "last.active.idx");
+ VPValue *NewExtract = Builder.createNaryOp(
+ Instruction::ExtractElement, {ExtractFrom, LastActiveIdx},
+ ExtractR->getDebugLoc(), ExtractR->getName());
+ ExtractR->replaceAllUsesWith(NewExtract);
+ ExtractR->eraseFromParent();
+ }
+ assert((!dyn_cast<VPInstruction>(FromMiddleBlock) ||
+ cast<VPInstruction>(FromMiddleBlock)->getOpcode() !=
+ VPInstruction::ExtractFromEnd) &&
+ "Only extract the last lane for resumed values");
+ }
+}
+
/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
/// nullptr if no EVL-based recipe could be created.
/// \p HeaderMask Header Mask.
@@ -1643,10 +1685,13 @@ void VPlanTransforms::addActiveLaneMask(
/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
/// \p EVL The explicit vector length parameter of vector-predication
/// intrinsics.
+/// \p PrevEVL The explicit vector length of the previous iteration. Only
+/// required if \p CurRecipe is a VPInstruction::FirstOrderRecurrenceSplice.
static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
- VPValue &AllOneMask, VPValue &EVL) {
+ VPValue &AllOneMask, VPValue &EVL,
+ VPValue *PrevEVL) {
using namespace llvm::VPlanPatternMatch;
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
@@ -1704,6 +1749,18 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
Sel->getDebugLoc());
})
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
+ if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
+ assert(PrevEVL && "Fixed-order recurrences require previous EVL");
+ VPValue *MinusOneVPV = VPI->getParent()->getPlan()->getOrAddLiveIn(
+ ConstantInt::getSigned(Type::getInt32Ty(TypeInfo.getContext()),
+ -1));
+ SmallVector<VPValue *> Ops(VPI->operands());
+ Ops.append({MinusOneVPV, &AllOneMask, PrevEVL, &EVL});
+ return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splice,
+ Ops, TypeInfo.inferScalarType(VPI),
+ VPI->getDebugLoc());
+ }
+
VPValue *LHS, *RHS;
// Transform select with a header mask condition
// select(header_mask, LHS, RHS)
@@ -1727,6 +1784,30 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(CanonicalIVType);
LLVMContext &Ctx = CanonicalIVType->getContext();
VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+
+ // Create a scalar phi to track the previous EVL if fixed-order recurrence is
+ // contained.
+ VPScalarPHIRecipe *PrevEVL = nullptr;
+ bool ContainsFORs =
+ any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
+ if (ContainsFORs) {
+ // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
+ VPValue *MaxEVL = &Plan.getVF();
+ // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
+ if (unsigned VFSize =
+ TypeInfo.inferScalarType(MaxEVL)->getScalarSizeInBits();
+ VFSize != 32) {
+ MaxEVL = new VPScalarCastRecipe(
+ VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, MaxEVL,
+ Type::getInt32Ty(Ctx), DebugLoc());
+ VPBasicBlock *Preheader = LoopRegion->getPreheaderVPBB();
+ Preheader->appendRecipe(cast<VPScalarCastRecipe>(MaxEVL));
+ }
+ PrevEVL = new VPScalarPHIRecipe(MaxEVL, &EVL, DebugLoc(), "prev.evl");
+ PrevEVL->insertBefore(*Header, Header->getFirstNonPhi());
+ }
for (VPUser *U : to_vector(Plan.getVF().users())) {
if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U))
@@ -1738,8 +1819,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
- VPRecipeBase *EVLRecipe =
- createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
+ VPRecipeBase *EVLRecipe = createEVLRecipe(
+ HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL);
if (!EVLRecipe)
continue;
@@ -1761,6 +1842,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}
+ adjustResumePhisForEVL(Plan, EVL);
+
for (VPRecipeBase *R : reverse(ToErase)) {
SmallVector<VPValue *> PossiblyDead(R->operands());
R->eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 96156de444f88..f2b532ec06b37 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -143,7 +143,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
})
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
- .Case<VPWidenLoadEVLRecipe, VPReverseVectorPointerRecipe>(
+ .Case<VPWidenLoadEVLRecipe, VPReverseVectorPointerRecipe,
+ VPScalarPHIRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPWidenEVLRecipe>([&](const VPWidenEVLRecipe *W) {
return VerifyEVLUse(*W,
@@ -152,6 +153,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPScalarCastRecipe>(
[&](const VPScalarCastRecipe *S) { return VerifyEVLUse(*S, 0); })
.Case<VPInstruction>([&](const VPInstruction *I) {
+ // Used by extracting the element at last active lane.
+ if (I->getOpcode() == Instruction::Sub)
+ return VerifyEVLUse(*I, 0);
if (I->getOpcode() != Instruction::Add) {
errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
return false;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
index 1dfda837f95a5..1dfa1e78ac3c2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
@@ -11,10 +11,6 @@
; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
-; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding.
-; The llvm.splice may occur unexpected behavior if the evl of the second-to-last
-; iteration is not VF*UF.
-
define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define void @first_order_recurrence(
; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -31,41 +27,37 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP8]] to i32
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP11]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
; IF-EVL: [[VECTOR_BODY]]:
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP25]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP25:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP26:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP25]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP26]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1)
-; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP19]], ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[TMP27]])
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP12]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[VP_OP_LOAD]], i32 [[TMP23]]
; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
; IF-EVL: [[SCALAR_PH]]:
@@ -180,9 +172,9 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP8]] to i32
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
@@ -191,51 +183,45 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4
; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vsc...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/124093
More information about the llvm-commits
mailing list