[llvm] [LV] Support interleaving with conditional scalar assignments (PR #184099)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 03:08:46 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Benjamin Maxwell (MacDue)
<details>
<summary>Changes</summary>
This extends the existing support to work with arbitrary interleave factors. The main change here is reworking the ExtractLastActive VPInstruction to take a variable number of arguments, and handling it in unrollRecipeByUF and VPInstruction::generate.
---
Patch is 107.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/184099.diff
13 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+11-13)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+8-5)
- (modified) llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp (+1-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h (+7)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+17-8)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+15)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll (+33-12)
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll (+4-4)
- (added) llvm/test/Transforms/LoopVectorize/VPlan/interleave-conditional-scalar-assignment-vplan.ll (+185)
- (added) llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-interleave-only.ll (+78)
- (modified) llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll (+40-7)
- (modified) llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll (+289-48)
- (modified) llvm/test/Transforms/LoopVectorize/select-cmp.ll (+42-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0debab4a2a0ee..b703bd262098c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -198,6 +198,11 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
+ "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden,
+ cl::desc("Assume the target supports masked memory operations (used for "
+ "testing)."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1181,16 +1186,18 @@ class LoopVectorizationCostModel {
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
unsigned AddressSpace) const {
- return Legal->isConsecutivePtr(DataType, Ptr) &&
- TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
+ return ForceTargetSupportsMaskedMemoryOps ||
+ (Legal->isConsecutivePtr(DataType, Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace));
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
unsigned AddressSpace) const {
- return Legal->isConsecutivePtr(DataType, Ptr) &&
- TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
+ return ForceTargetSupportsMaskedMemoryOps ||
+ (Legal->isConsecutivePtr(DataType, Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace));
}
/// Returns true if the target machine can represent \p V as a masked gather
@@ -9771,15 +9778,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
- // FIXME: Enable interleaving for FindLast reductions.
- if (InterleaveLoop && hasFindLastReductionPhi(LVP.getPlanFor(VF.Width))) {
- LLVM_DEBUG(dbgs() << "LV: Not interleaving due to FindLast reduction.\n");
- IntDiagMsg = {"FindLastPreventsScalarInterleaving",
- "Unable to interleave due to FindLast reduction."};
- InterleaveLoop = false;
- IC = 1;
- }
-
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 97e9c64d6481d..2a4098f53ae56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1263,9 +1263,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// Explicit user for the resume phi of the canonical induction in the main
/// VPlan, used by the epilogue vector loop.
ResumeForEpilogue,
- /// Extracts the lane from the first operand corresponding to the last
- /// active (non-zero) lane in the mask (second operand), or if no lanes
- /// were active in the mask, returns the default value (third operand).
+ /// Extracts the last active lane from a set of vectors. The first operand
+ /// is the default value if no lanes are active. Conceptually, this
+ /// concatenates all data vectors (odd operands), concatenates all masks
+ /// (even operands -- ignoring the default value), and returns the last
+ /// active value from the combined data vector using the combined mask.
ExtractLastActive,
/// Returns the value for vscale.
@@ -2530,8 +2532,9 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe,
}
VPWidenPHIRecipe *clone() override {
- auto *C = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingValue()),
- getOperand(0), getDebugLoc(), Name);
+ auto *C =
+ new VPWidenPHIRecipe(cast_if_present<PHINode>(getUnderlyingValue()),
+ getOperand(0), getDebugLoc(), Name);
for (VPValue *Op : llvm::drop_begin(operands()))
C->addOperand(Op);
return C;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index ab79822d7fb67..986ed1bb8d23e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1424,7 +1424,7 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
Builder.setInsertPoint(RdxResult);
auto *ExtractLastActive =
Builder.createNaryOp(VPInstruction::ExtractLastActive,
- {DataSelect, MaskSelect, PhiR->getStartValue()},
+ {PhiR->getStartValue(), DataSelect, MaskSelect},
RdxResult->getDebugLoc());
RdxResult->replaceAllUsesWith(ExtractLastActive);
RdxResult->eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 2b582a607dddc..1205f04fb5c29 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -508,6 +508,13 @@ m_LastActiveLane(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::LastActiveLane>(Op0);
}
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ExtractLastActive, Op0_t, Op1_t,
+ Op2_t>
+m_ExtractLastActive(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+ return m_VPInstruction<VPInstruction::ExtractLastActive>(Op0, Op1, Op2);
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ComputeReductionResult, Op0_t>
m_ComputeReductionResult(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index eb1b51c40cec4..c92c6bb2d1c3a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -481,7 +481,6 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case Instruction::Select:
case VPInstruction::ActiveLaneMask:
case VPInstruction::ReductionStartVector:
- case VPInstruction::ExtractLastActive:
return 3;
case Instruction::Call: {
// For unmasked calls, the last argument will the called function. Use that
@@ -505,6 +504,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::SLPLoad:
case VPInstruction::SLPStore:
case VPInstruction::ExtractLane:
+ case VPInstruction::ExtractLastActive:
// Cannot determine the number of operands from the opcode.
return -1u;
}
@@ -904,13 +904,22 @@ Value *VPInstruction::generate(VPTransformState &State) {
case VPInstruction::Reverse:
return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
case VPInstruction::ExtractLastActive: {
- Value *Data = State.get(getOperand(0));
- Value *Mask = State.get(getOperand(1));
- Value *Default = State.get(getOperand(2), /*IsScalar=*/true);
- Type *VTy = Data->getType();
- return Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_extract_last_active, {VTy},
- {Data, Mask, Default});
+ Value *Default = State.get(getOperand(0), /*IsScalar=*/true);
+ Value *Result = Default;
+ for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
+ Value *Data = State.get(getOperand(Idx));
+ Value *Mask = State.get(getOperand(Idx + 1));
+ Type *VTy = Data->getType();
+
+ if (State.VF.isScalar())
+ Result = Builder.CreateSelect(Mask, Data, Result);
+ else
+ Result = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_extract_last_active, {VTy},
+ {Data, Mask, Result});
+ }
+
+ return Result;
}
default:
llvm_unreachable("Unsupported opcode for instruction");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index faffbd452b096..9d4f5260d6f6a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -326,6 +326,10 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
Copy->insertBefore(VPBB, InsertPt);
addRecipeForPart(&R, Copy, Part);
+ // Phi operands are updated once all other recipes have been unrolled.
+ if (isa<VPWidenPHIRecipe>(Copy))
+ continue;
+
VPValue *Op;
if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
m_VPValue(), m_VPValue(Op)))) {
@@ -437,6 +441,17 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
+ VPValue *Op2;
+ if (match(&R, m_ExtractLastActive(m_VPValue(), m_VPValue(Op1),
+ m_VPValue(Op2)))) {
+ addUniformForAllParts(cast<VPInstruction>(&R));
+ for (unsigned Part = 1; Part != UF; ++Part) {
+ R.addOperand(getValueForPart(Op1, Part));
+ R.addOperand(getValueForPart(Op2, Part));
+ }
+ continue;
+ }
+
if (Plan.hasScalarVFOnly()) {
if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 7053aa60b2035..df2d6c8c775a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -710,10 +710,10 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; NEON-LABEL: define i32 @simple_csa_int_select_use_interleave(
; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) {
; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; NEON: [[VECTOR_PH]]:
-; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -721,24 +721,34 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; NEON: [[VECTOR_BODY]]:
; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; NEON-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; NEON-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; NEON-NEXT: [[TMP1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
+; NEON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[LD_ADDR]], i64 4
; NEON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LD_ADDR]], align 4
+; NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
; NEON-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; NEON-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD2]]
; NEON-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
-; NEON-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
+; NEON-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP10]]
+; NEON-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP3]], [[TMP12]]
+; NEON-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
; NEON-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]]
+; NEON-NEXT: [[TMP11]] = select i1 [[TMP4]], <4 x i1> [[TMP10]], <4 x i1> [[TMP1]]
; NEON-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]]
-; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; NEON-NEXT: [[TMP13]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[VEC_PHI1]]
+; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; NEON-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NEON-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; NEON: [[MIDDLE_BLOCK]]:
; NEON-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1)
+; NEON-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP13]], <4 x i1> [[TMP11]], i32 [[TMP8]])
; NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; NEON: [[SCALAR_PH]]:
; NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; NEON-NEXT: br label %[[LOOP:.*]]
; NEON: [[LOOP]]:
; NEON-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -751,20 +761,21 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
; SVE-LABEL: define i32 @simple_csa_int_select_use_interleave(
; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] {
; SVE-NEXT: [[ENTRY:.*]]:
; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; SVE: [[VECTOR_PH]]:
; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SVE-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP3]], 1
+; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -772,24 +783,34 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; SVE: [[VECTOR_BODY]]:
; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; SVE-NEXT: [[TMP15:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]]
+; SVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP3]]
; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
; SVE-NEXT: [[TMP6:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; SVE-NEXT: [[TMP19:%.*]] = icmp slt <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD2]]
; SVE-NEXT: [[TMP7:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
-; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP7]])
+; SVE-NEXT: [[TMP20:%.*]] = freeze <vscale x 4 x i1> [[TMP19]]
+; SVE-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i1> [[TMP7]], [[TMP20]]
+; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
+; SVE-NEXT: [[TMP16]] = select i1 [[TMP8]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> [[TMP15]]
; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; SVE-NEXT: [[TMP18]] = select i1 [[TMP8]], <vscale x 4 x i32> [[WIDE_LOAD2]], <vscale x 4 x i32> [[VEC_PHI1]]
+; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SVE: [[MIDDLE_BLOCK]]:
; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> [[TMP9]], i32 -1)
+; SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x i1> [[TMP16]], i32 [[TMP12]])
; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; SVE: [[SCALAR_PH]]:
; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
; SVE-NEXT: br label %[[LOOP:.*]]
; SVE: [[LOOP]]:
; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -802,7 +823,7 @@ define i32 @simple_csa_int_select_use_interleave(i64 %N, ptr %data, i32 %a) {
; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
+; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP21]], %[[MIDDLE_BLOCK]] ]
; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
index 4310b7e73b8fb..bc55f6f21a92f 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
@@ -48,7 +48,7 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: mid...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/184099
More information about the llvm-commits
mailing list