[llvm] [VPlan] Simplify VPBlendRecipes to select instructions (PR #133993)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 6 23:43:22 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/133993
From 0b07987f452c801ce48996da95dc001972a8d95e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 1 Apr 2025 22:28:09 +0100
Subject: [PATCH 1/3] [VPlan] Simplify VPBlendRecipes to select instructions
While looking at some EVL tail-folded code from SPEC CPU 2017, I noticed we sometimes end up with both VPBlendRecipes and select VPInstructions in the same plan:
EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3>
EMIT vp<%7> = icmp ...
EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7>
BLEND ir<%8> = ir<%n.015> ir<%foo>/vp<%8>
EMIT vp<%9> = select vp<%active.lane.mask>, ir<%8>, ir<%n.015>
Since a blend will ultimately generate a chain of selects, we could fold the blend into the select:
EMIT vp<%active.lane.mask> = active lane mask vp<%5>, vp<%3>
EMIT vp<%7> = icmp ...
EMIT vp<%8> = logical-and vp<%active.lane.mask>, vp<%7>
EMIT ir<%8> = select vp<%8>, ir<%foo>, ir<%n.015>
So this patch canonicalizes blends to a series of select instructions, which allows them to be simplified further alongside the other selects in the plan.
The `BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask` optimisation has also been converted to operate on selects.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 4 --
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 61 +++++--------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 44 +++++--------
.../AArch64/masked-call-scalarize.ll | 4 +-
.../LoopVectorize/AArch64/masked-call.ll | 4 +-
.../RISCV/blocks-with-dead-instructions.ll | 4 +-
.../Transforms/LoopVectorize/RISCV/divrem.ll | 24 ++++----
...rize-force-tail-with-evl-cond-reduction.ll | 8 +--
.../LoopVectorize/X86/load-deref-pred.ll | 16 ++---
.../X86/replicate-uniform-call.ll | 55 +----------------
.../Transforms/LoopVectorize/if-conversion.ll | 4 +-
.../LoopVectorize/load-deref-pred-align.ll | 4 +-
.../pr55167-fold-tail-live-out.ll | 3 +-
.../LoopVectorize/single-value-blend-phis.ll | 14 ++---
14 files changed, 73 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 61b5ccd85bc6e..2ba5affa2c82d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2297,10 +2297,6 @@ class VPBlendRecipe : public VPSingleDefRecipe {
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
- /// Return the cost of this VPWidenMemoryRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a38932ef99fe..9acd52b2ec02c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -909,6 +909,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
Ctx.CostKind);
}
+ case Instruction::Select: {
+ if (!getUnderlyingValue())
+ return 0;
+ // Handle cases where only the first lane is used the same way as the legacy
+ // cost model.
+ if (vputils::onlyFirstLaneUsed(this))
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
+ Type *ResTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResTy, CmpTy,
+ CmpInst::BAD_ICMP_PREDICATE,
+ Ctx.CostKind);
+ }
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticReductionCost(
@@ -2380,53 +2393,7 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPBlendRecipe::execute(VPTransformState &State) {
- assert(isNormalized() && "Expected blend to be normalized!");
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- Value *Result = nullptr;
- for (unsigned In = 0; In < NumIncoming; ++In) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed);
- if (In == 0)
- Result = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed);
- Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi");
- }
- }
- State.set(this, Result, OnlyFirstLaneUsed);
-}
-
-InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- // Handle cases where only the first lane is used the same way as the legacy
- // cost model.
- if (vputils::onlyFirstLaneUsed(this))
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
-
- Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
- Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
- return (getNumIncomingValues() - 1) *
- Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
- CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
+ llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d4a73c744469..ee6856674871e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1082,6 +1082,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
return Def->replaceAllUsesWith(X);
+ // select !c, x, y -> select c, y, x
+ VPValue *C;
+ if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
+ Def->setOperand(0, C);
+ Def->setOperand(1, Y);
+ Def->setOperand(2, X);
+ return;
+ }
+
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
@@ -1288,38 +1297,17 @@ static void simplifyBlends(VPlan &Plan) {
}
}
- SmallVector<VPValue *, 4> OperandsWithMask;
- OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
+ VPBuilder Builder(&R);
+ VPValue *Select = Blend->getIncomingValue(StartIndex);
for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
if (I == StartIndex)
continue;
- OperandsWithMask.push_back(Blend->getIncomingValue(I));
- OperandsWithMask.push_back(Blend->getMask(I));
- }
-
- auto *NewBlend = new VPBlendRecipe(
- cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
- NewBlend->insertBefore(&R);
-
- VPValue *DeadMask = Blend->getMask(StartIndex);
- Blend->replaceAllUsesWith(NewBlend);
- Blend->eraseFromParent();
- recursivelyDeleteDeadRecipes(DeadMask);
-
- /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
- VPValue *NewMask;
- if (NewBlend->getNumOperands() == 3 &&
- match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
- VPValue *Inc0 = NewBlend->getOperand(0);
- VPValue *Inc1 = NewBlend->getOperand(1);
- VPValue *OldMask = NewBlend->getOperand(2);
- NewBlend->setOperand(0, Inc1);
- NewBlend->setOperand(1, Inc0);
- NewBlend->setOperand(2, NewMask);
- if (OldMask->getNumUsers() == 0)
- cast<VPInstruction>(OldMask)->eraseFromParent();
+ Select =
+ Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I),
+ Select, R.getDebugLoc(), "predphi");
+ Select->setUnderlyingValue(Blend->getUnderlyingValue());
}
+ Blend->replaceAllUsesWith(Select);
}
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index 078f98f54525b..c507c36f0dba9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -22,8 +22,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
-; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ule <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
; TFNONE-NEXT: store double [[TMP14]], ptr [[P:%.*]], align 8
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 6029095bbe7b1..b9b54aeca7f00 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -935,8 +935,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ule <vscale x 2 x double> [[TMP8]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> zeroinitializer
; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
; TFNONE-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index f8b83ff41f512..d5e07a615057d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -597,8 +597,8 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> splat (i16 99), <vscale x 8 x i16> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP11]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index db780c3c12c7e..0451d30179a0a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -677,9 +677,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i64> [[TMP10]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -720,12 +720,12 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
+; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], splat (i64 27)
; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
-; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]]
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -797,9 +797,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i64> [[TMP10]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -840,12 +840,12 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
-; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
+; FIXED-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; FIXED-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], splat (i64 27)
; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
-; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
-; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
+; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[WIDE_LOAD]], <4 x i64> [[TMP6]]
+; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[TMP7]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 8c44da63e08a6..fbc3c0dc84219 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -398,9 +398,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
-; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp sle <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
-; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP16]]
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -956,9 +956,9 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP27]], align 4
-; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
+; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp sle <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
-; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP28]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[VEC_PHI]]
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP22]]
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index f4cd48de60243..7e66fcf8568d3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -34,10 +34,10 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], splat (i64 4)
-; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sge <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
@@ -47,10 +47,10 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD7]]
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD8]]
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_LOAD9]]
; CHECK-NEXT: [[TMP16]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
; CHECK-NEXT: [[TMP18]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index ed1c67c082134..5ad5cf14aea10 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -11,54 +11,9 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
-; CHECK: [[PRED_UREM_IF]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]]
-; CHECK: [[PRED_UREM_CONTINUE]]:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
-; CHECK: [[PRED_UREM_IF1]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE2]]
-; CHECK: [[PRED_UREM_CONTINUE2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]]
-; CHECK: [[PRED_UREM_IF3]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE4]]
-; CHECK: [[PRED_UREM_CONTINUE4]]:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]]
-; CHECK: [[PRED_UREM_IF5]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]]
-; CHECK: [[PRED_UREM_CONTINUE6]]:
-; CHECK-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0)
-; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0)
-; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP12]]
-; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]]
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
-; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8
-; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]]
@@ -71,7 +26,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8
; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -103,9 +58,3 @@ exit:
}
declare i64 @llvm.smax.i64(i64, i64)
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
index 8a7f4a386fda1..57109a0996a66 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -159,10 +159,10 @@ define i32 @reduction_func(ptr nocapture %A, i32 %n) nounwind uwtable readonly s
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 30)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 31)
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VEC_PHI]], splat (i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 8a326c9d0c083..7c6bc6aa6ca3a 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -567,7 +567,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <2 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]]
@@ -577,7 +577,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> zeroinitializer, <2 x i16> [[TMP12]]
; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index bd9647188911a..1eb6b134621da 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -23,8 +23,7 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> splat (i32 9)
-; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> [[PREDPHI]]
+; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9)
; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]]
; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index d5a206ff21da0..9bc231108fa96 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -21,8 +21,8 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> splat (i16 1), <2 x i16> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1)
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP7]], align 2
@@ -107,9 +107,8 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1)
-; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2
@@ -189,8 +188,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP0]], <2 x i16> [[VEC_IND1]], <2 x i16> [[VEC_IND3]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
@@ -297,9 +296,8 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) {
; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[TMP15:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1)
-; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0
; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2
From bb56e679be84f63f28f5ac0e837e1d0719442dcf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 2 Jul 2025 00:46:06 +0100
Subject: [PATCH 2/3] Update VPlan debug output tests
---
llvm/test/Transforms/LoopVectorize/vplan-printing.ll | 6 +++---
.../LoopVectorize/vplan-sink-scalars-and-merge.ll | 6 +++---
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 732214aa1449e..94ee454e971e8 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -187,7 +187,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: Successor(s): if.then.0
; CHECK-EMPTY:
; CHECK-NEXT: if.then.0:
-; CHECK-NEXT: BLEND ir<%d> = ir<0> vp<[[PRED]]>/ir<%cmp>
+; CHECK-NEXT: EMIT ir<%d> = select ir<%cmp>, vp<[[PRED]]>, ir<0>
; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d>
@@ -376,7 +376,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: if.then.0:
; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2>
; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = logical-and vp<[[NOT1]]>, vp<[[NOT2]]>
-; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]>
+; CHECK-NEXT: EMIT ir<%ysd.0> = select vp<[[SEL2]]>, ir<%psd>, vp<[[PHI]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -757,7 +757,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-EMPTY:
; CHECK-NEXT: if.then.1:
; CHECK-NEXT: WIDEN ir<%fadd> = fadd vp<[[PHI1]]>, vp<[[PHI2]]>
-; CHECK-NEXT: BLEND ir<%st.value> = ir<%ld.value> ir<%fadd>/ir<%ifcond>
+; CHECK-NEXT: EMIT ir<%st.value> = select ir<%ifcond>, ir<%fadd>, ir<%ld.value>
; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 4a4bda254bf88..a330afc45abab 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -366,7 +366,7 @@ define void @pred_cfg1(i32 %k, i32 %j) {
; CHECK-NEXT: Successor(s): then.0.0
; CHECK-EMPTY:
; CHECK-NEXT: then.0.0:
-; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
+; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
; CHECK-NEXT: <xVFxUF> pred.store: {
@@ -465,7 +465,7 @@ define void @pred_cfg2(i32 %k, i32 %j) {
; CHECK-NEXT: Successor(s): then.0.0
; CHECK-EMPTY:
; CHECK-NEXT: then.0.0:
-; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
+; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0>
; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
@@ -571,7 +571,7 @@ define void @pred_cfg3(i32 %k, i32 %j) {
; CHECK-NEXT: Successor(s): then.0.0
; CHECK-EMPTY:
; CHECK-NEXT: then.0.0:
-; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
+; CHECK-NEXT: EMIT ir<%p> = select vp<[[MASK2]]>, vp<[[PRED]]>, ir<0>
; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
From e0a2fbd634e7fbaa785d5a6ac7d44a344908ce2b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 4 Jul 2025 12:59:46 +0100
Subject: [PATCH 3/3] Mark blend-derived selects in planContainsAdditionalSimplifications
This fixes a crash on SPEC CPU 2017 caused by the legacy and VPlan cost models disagreeing.
I'm really struggling to create a test case for this due to the issue at https://github.com/llvm/llvm-project/issues/147038.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 67df7a8af098d..ee6728d846203 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7048,6 +7048,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (isa<VPPartialReductionRecipe>(&R))
return true;
+ // VPBlendRecipes are converted to selects and may have been simplified.
+ using namespace VPlanPatternMatch;
+ if (match(&R, m_VPInstruction<Instruction::Select>(
+ m_VPValue(), m_VPValue(), m_VPValue())) &&
+ isa_and_nonnull<PHINode>(R.getVPSingleValue()->getUnderlyingValue()))
+ return true;
+
/// If a VPlan transform folded a recipe to one producing a single-scalar,
/// but the original instruction wasn't uniform-after-vectorization in the
/// legacy cost model, the legacy cost overestimates the actual cost.