[llvm] 2bdc1a1 - [LV] Use frozen start value for FindLastIV if needed. (#132691)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 4 03:48:05 PDT 2025
Author: Florian Hahn
Date: 2025-04-04T11:48:01+01:00
New Revision: 2bdc1a1337692a5743658ba6b680e5d914e684a4
URL: https://github.com/llvm/llvm-project/commit/2bdc1a1337692a5743658ba6b680e5d914e684a4
DIFF: https://github.com/llvm/llvm-project/commit/2bdc1a1337692a5743658ba6b680e5d914e684a4.diff
LOG: [LV] Use frozen start value for FindLastIV if needed. (#132691)
FindLastIV introduces multiple uses of the start value, where in the
original source there was only a single use, when the epilogue is
vectorized.
Each use of undef may produce a different result, so introducing
multiple uses can produce incorrect results when the input is
undef/poison.
If the start value may be undef or poison, freeze it and use the frozen
value, which will be the same at all uses.
See the following scenarios in Alive2:
* Both main and epilogue vector loops execute, go to exit block: https://alive2.llvm.org/ce/z/_TSvRr
* Both main and epilogue vector loops execute, go to scalar loop: https://alive2.llvm.org/ce/z/CsPj5v
* Only epilogue vector loop executes, go to exit block: https://alive2.llvm.org/ce/z/5XqkNV
* Only epilogue vector loop executes, go to scalar loop: https://alive2.llvm.org/ce/z/JUpqRN
The latter 2 show requiring freezing the resume phi. That means we cannot freeze
in the preheader. We could move the freeze to the main iteration count check, but
that would be a bit fragile to find and other transforms can sink the freeze if needed.
Depends on https://github.com/llvm/llvm-project/pull/132689
and https://github.com/llvm/llvm-project/pull/132690.
Fixes https://github.com/llvm/llvm-project/issues/126836
PR: https://github.com/llvm/llvm-project/pull/132691
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0291a8bfd9674..65cce5e7d194d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7659,14 +7659,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
using namespace llvm::PatternMatch;
- Value *Cmp, *OrigResumeV;
+ Value *Cmp, *OrigResumeV, *CmpOp;
bool IsExpectedPattern =
match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
m_Specific(RdxDesc.getSentinelValue()),
m_Value(OrigResumeV))) &&
- match(Cmp,
- m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
- m_Specific(RdxDesc.getRecurrenceStartValue())));
+ (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
+ m_Value(CmpOp))) &&
+ (match(CmpOp,
+ m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) ||
+ (CmpOp == RdxDesc.getRecurrenceStartValue() &&
+ isGuaranteedNotToBeUndefOrPoison(CmpOp))));
assert(IsExpectedPattern && "Unexpected reduction resume pattern");
(void)IsExpectedPattern;
MainResumeValue = OrigResumeV;
@@ -10374,6 +10377,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
using namespace VPlanPatternMatch;
+ // When vectorizing the epilogue, FindLastIV reductions can introduce multiple
+ // uses of undef/poison. If the reduction start value may be undef or poison
+ // it needs to be frozen and the frozen start has to be used when computing
+ // the reduction result. We also need to use the frozen value in the resume
+ // phi generated by the main vector loop, as this is also used to compute the
+ // reduction result after the epilogue vector loop.
+ auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
+ bool UpdateResumePhis) {
+ VPBuilder Builder(Plan.getEntry());
+ for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
+ continue;
+ VPValue *OrigStart = VPI->getOperand(1);
+ if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
+ continue;
+ VPInstruction *Freeze =
+ Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
+ VPI->setOperand(1, Freeze);
+ if (UpdateResumePhis)
+ OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
+ return Freeze != &U && isa<VPInstruction>(&U) &&
+ cast<VPInstruction>(&U)->getOpcode() ==
+ VPInstruction::ResumePhi;
+ });
+ }
+ };
+ AddFreezeForFindLastIVReductions(MainPlan, true);
+ AddFreezeForFindLastIVReductions(EpiPlan, false);
+
VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
VPValue *VectorTC = &MainPlan.getVectorTripCount();
// If there is a suitable resume value for the canonical induction in the
@@ -10401,24 +10434,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- // Re-use the trip count and steps expanded for the main loop, as
- // skeleton creation needs it as a value that dominates both the scalar
- // and vector epilogue loops
- // TODO: This is a workaround needed for epilogue vectorization and it
- // should be removed once induction resume value creation is done
- // directly in VPlan.
- for (auto &R : make_early_inc_range(*Plan.getEntry())) {
- auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
- if (!ExpandR)
- continue;
- auto *ExpandedVal =
- Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
- ExpandR->replaceAllUsesWith(ExpandedVal);
- if (Plan.getTripCount() == ExpandR)
- Plan.resetTripCount(ExpandedVal);
- ExpandR->eraseFromParent();
- }
-
+ DenseMap<Value *, Value *> ToFrozen;
// Ensure that the start values for all header phi recipes are updated before
// vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) {
@@ -10484,6 +10500,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
ResumeV =
Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ ToFrozen[RdxDesc.getRecurrenceStartValue()] =
+ cast<PHINode>(ResumeV)->getIncomingValueForBlock(
+ EPI.MainLoopIterationCountCheck);
+
// VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
// to the resume value. The resume value is adjusted to the sentinel
// value when the final value from the main vector loop equals the start
@@ -10492,8 +10512,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
// variable.
BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
- Value *Cmp =
- Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
+ Value *Cmp = Builder.CreateICmpEQ(
+ ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]);
ResumeV =
Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
}
@@ -10509,6 +10529,35 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
+
+ // For some VPValues in the epilogue plan we must re-use the generated IR
+ // values from the main plan. Replace them with live-in VPValues.
+ // TODO: This is a workaround needed for epilogue vectorization and it
+ // should be removed once induction resume value creation is done
+ // directly in VPlan.
+ for (auto &R : make_early_inc_range(*Plan.getEntry())) {
+ // Re-use frozen values from the main plan for Freeze VPInstructions in the
+ // epilogue plan. This ensures all users use the same frozen value.
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (VPI && VPI->getOpcode() == Instruction::Freeze) {
+ VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
+ ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
+ continue;
+ }
+
+ // Re-use the trip count and steps expanded for the main loop, as
+ // skeleton creation needs it as a value that dominates both the scalar
+ // and vector epilogue loops
+ auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpandR)
+ continue;
+ auto *ExpandedVal =
+ Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+ ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (Plan.getTripCount() == ExpandR)
+ Plan.resetTripCount(ExpandedVal);
+ ExpandR->eraseFromParent();
+ }
}
// Generate bypass values from the additional bypass block. Note that when the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b16a8fc563f4c..a117d82e64ef7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -423,6 +423,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
if (isSingleScalar() || isVectorToScalar())
return true;
switch (Opcode) {
+ case Instruction::Freeze:
case Instruction::ICmp:
case Instruction::PHI:
case Instruction::Select:
@@ -474,6 +475,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
}
+ case Instruction::Freeze: {
+ Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
+ return Builder.CreateFreeze(Op, Name);
+ }
case Instruction::ICmp: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -909,6 +914,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
return false;
switch (getOpcode()) {
case Instruction::ExtractElement:
+ case Instruction::Freeze:
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::AnyOf:
@@ -941,6 +947,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
+ case Instruction::Freeze:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 7296cc0840dc0..c0806ea16a5fc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -9,6 +9,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8
+; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32
@@ -42,7 +43,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]])
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -53,8 +54,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]]
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
@@ -82,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]])
; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128
-; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]]
+; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]]
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
@@ -128,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]])
; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]]
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
@@ -166,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]])
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -175,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]]
; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]]
; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4
; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]]
@@ -203,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]])
; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648
-; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]]
+; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]]
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index ee154ea5a169a..800b6f3f28b7d 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -217,6 +217,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4
@@ -243,7 +244,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]])
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -254,8 +255,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]]
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]]
@@ -283,7 +284,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]])
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128
-; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]]
+; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]]
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
More information about the llvm-commits
mailing list