[llvm] [SLP] Reject 2-element vectorization when vector inst count exceeds scalar (PR #190414)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 3 15:24:06 PDT 2026
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/190414
>From 0160822f66a0cac0518dc90eaed1ee2d10649f16 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 3 Apr 2026 15:12:24 -0700
Subject: [PATCH 1/3] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 135 ++++++++++++++++++
.../SLPVectorizer/AArch64/slp-fma-loss.ll | 14 +-
.../SLPVectorizer/AArch64/vec3-base.ll | 10 +-
.../Transforms/SLPVectorizer/X86/c-ray.ll | 19 +--
.../entry-no-bundle-but-extra-use-on-vec.ll | 23 ++-
.../X86/extractelement-multi-register-use.ll | 29 ++--
.../X86/extractelements-vector-ops-shuffle.ll | 14 +-
.../Transforms/SLPVectorizer/X86/lookahead.ll | 131 +++++------------
.../X86/reorder_with_external_users.ll | 7 +-
.../Transforms/SLPVectorizer/X86/vec3-base.ll | 10 +-
10 files changed, 230 insertions(+), 162 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f2ccf198c4c81..64ab2228f99a3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -142,6 +142,11 @@ static cl::opt<bool> SplitAlternateInstructions(
"slp-split-alternate-instructions", cl::init(true), cl::Hidden,
cl::desc("Improve the code quality by splitting alternate instructions"));
+static cl::opt<bool> SLPInstCountCheck(
+ "slp-inst-count-check", cl::init(true), cl::Hidden,
+ cl::desc("Reject vectorization if vector instruction count exceeds "
+ "scalar instruction count"));
+
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -3776,6 +3781,13 @@ class slpvectorizer::BoUpSLP {
Instruction *I,
const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
+ /// Estimates the number of scalar instructions in the tree, per entry.
+ unsigned getNumScalarInsts() const;
+
+ /// Estimates the number of vector instructions (including buildvectors,
+ /// shuffles, and extracts) that the tree will produce.
+ unsigned getNumVectorInsts() const;
+
/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo
@@ -12817,6 +12829,113 @@ bool BoUpSLP::areAllUsersVectorized(
});
}
+unsigned BoUpSLP::getNumScalarInsts() const {
+ unsigned Count = 0;
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ const TreeEntry &TE = *Ptr;
+ if (DeletedNodes.contains(&TE))
+ continue;
+ if (TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
+ // Count instruction scalars in gathers — they exist in the scalar
+ // code regardless of vectorization. ExtractElement instructions
+ // become free when the vector input is used directly.
+ for (Value *V : TE.Scalars)
+ if (isa<Instruction>(V))
+ ++Count;
+ continue;
+ }
+ // Each vectorize entry represents a bundle of scalar instructions.
+ // Count per-entry without cross-entry deduplication, since shared
+ // scalars across entries still represent separate work in scalar code.
+ for (Value *V : TE.Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || (TE.hasCopyableElements() && TE.isCopyableElement(V)))
+ continue;
+ ++Count;
+ }
+ }
+ return Count;
+}
+
+unsigned BoUpSLP::getNumVectorInsts() const {
+ unsigned Count = 0;
+ SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ const TreeEntry &TE = *Ptr;
+ if (DeletedNodes.contains(&TE))
+ continue;
+ if (TE.State == TreeEntry::CombinedVectorize)
+ continue;
+ if (TE.CombinedOp == TreeEntry::ReducedBitcast ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastLoads ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
+ TE.CombinedOp == TreeEntry::ReducedCmpBitcast)
+ continue;
+ bool IsGatherOrTransformed =
+ TE.isGather() || TransformedToGatherNodes.contains(&TE);
+ if (IsGatherOrTransformed) {
+ if (TE.hasState()) {
+ if (const TreeEntry *E =
+ getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
+ E && E->getVectorFactor() == TE.getVectorFactor())
+ continue;
+ SmallVector<Value *> RevScalars(TE.Scalars.rbegin(),
+ TE.Scalars.rend());
+ if (const TreeEntry *E =
+ getSameValuesTreeEntry(TE.getMainOp(), RevScalars);
+ E && E->getVectorFactor() == TE.getVectorFactor()) {
+ ++Count;
+ continue;
+ }
+ }
+ // ExtractElement gathers from the same source vector become a single
+ // shufflevector. Collect source vectors globally across all gather
+ // entries and count once at the end.
+ if (all_of(TE.Scalars,
+ IsaPred<ExtractElementInst, UndefValue, Constant>)) {
+ for (Value *V : TE.Scalars)
+ if (auto *EE = dyn_cast<ExtractElementInst>(V))
+ GatherExtractSourceVecs.insert(EE->getVectorOperand());
+ } else {
+ for (Value *V : TE.Scalars) {
+ if (!isConstant(V) && !isa<PoisonValue>(V))
+ ++Count;
+ }
+ }
+ continue;
+ }
+ // InsertElement/ExtractElement vectorize entries don't produce real
+ // vector instructions — InsertElement at root IS the result, and
+ // ExtractElement entries reference the input vector directly.
+ if (TE.getOpcode() == Instruction::InsertElement ||
+ TE.getOpcode() == Instruction::ExtractElement)
+ continue;
+ if (TE.State == TreeEntry::SplitVectorize)
+ Count += 2;
+ else
+ ++Count;
+ if (!TE.ReorderIndices.empty() || !TE.ReuseShuffleIndices.empty())
+ ++Count;
+ }
+ Count += GatherExtractSourceVecs.size();
+ // Count extract instructions from ExternalUses, skipping insertelements
+ // (those get folded into shuffles, not real extracts).
+ SmallPtrSet<Value *, 8> CountedExtracts;
+ for (const ExternalUser &EU : ExternalUses) {
+ if (isa_and_nonnull<InsertElementInst>(EU.User))
+ continue;
+ if (EU.User && EphValues.count(EU.User))
+ continue;
+ if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
+ continue;
+ if (!CountedExtracts.insert(EU.Scalar).second)
+ continue;
+ ++Count;
+ }
+ return Count;
+}
+
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
SmallVectorImpl<Value *> *OpScalars,
@@ -18082,6 +18201,22 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
ArrayRef<Value *> VectorizedVals,
InstructionCost ReductionCost,
Instruction *RdxRoot) {
+ // Reject vectorization if the vector code would produce more instructions
+ // than the scalar code. The cost model may underestimate overhead from
+ // shuffles, inserts, and extracts.
+ if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
+ SLPCostThreshold == 0) {
+ unsigned NumScalar = getNumScalarInsts();
+ unsigned NumVector = getNumVectorInsts();
+ LLVM_DEBUG(dbgs() << "SLP: Inst count check: vector=" << NumVector
+ << " scalar=" << NumScalar << "\n");
+ if (NumVector > NumScalar) {
+ LLVM_DEBUG(dbgs() << "SLP: Rejecting tree: vector inst count "
+ << NumVector << " > scalar inst count " << NumScalar
+ << ".\n");
+ return InstructionCost::getInvalid();
+ }
+ }
InstructionCost Cost = TreeCost;
SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, unsigned>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 9b0f5416db725..596154c23c6fc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,21 +216,21 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[A1]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
+; CHECK-NEXT: [[L_4:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_2]]
-; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast float [[X:%.*]], [[L_3]]
+; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_4]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index d527d38adbee3..d91ad0621fbfe 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -369,14 +369,14 @@ entry:
define void @fpext_gather(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_gather(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
index d23e54f3495bd..8e3fbf0b53324 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
@@ -64,25 +64,18 @@ define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture nounde
; SSE2: if.end:
; SSE2-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
; SSE2-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]]
+; SSE2-NEXT: [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]]
; SSE2-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
-; SSE2-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
-; SSE2-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
-; SSE2-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; SSE2-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
-; SSE2-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
-; SSE2-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
-; SSE2-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
-; SSE2-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]]
-; SSE2-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1
+; SSE2-NEXT: [[TMP34:%.*]] = fdiv double [[ADD]], [[MUL88]]
+; SSE2-NEXT: [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]]
+; SSE2-NEXT: [[TMP35:%.*]] = fdiv double [[SUB90]], [[MUL88]]
; SSE2-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
-; SSE2-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0
; SSE2-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D
; SSE2-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
; SSE2-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
; SSE2: lor.lhs.false:
-; SSE2-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00)
-; SSE2-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; SSE2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
+; SSE2-NEXT: [[TMP38:%.*]] = fcmp ule double [[TMP34]], 1.000000e+00
+; SSE2-NEXT: [[TMP37:%.*]] = fcmp ule double [[TMP35]], 1.000000e+00
; SSE2-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]]
; SSE2-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
; SSE2-NEXT: br label [[CLEANUP]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index 0dd5e44889c1e..155b1ce27ac9d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -11,27 +11,22 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP11]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
; CHECK-NEXT: br label %[[IF_END]]
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[TMP6:%.*]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[TMP0]], %[[ENTRY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ [[TMP1]], %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x float> [ [[TMP5]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP5]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP15]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP6]], i32 3
; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP28]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP33]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
index 73b73735da021..c945799b38122 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
@@ -5,32 +5,31 @@ define void @test(double %i) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[I74:%.*]] = fsub double 0.000000e+00, poison
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[I]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: [[I96:%.*]] = fsub double poison, 0.000000e+00
; CHECK-NEXT: [[I75:%.*]] = fsub double 0.000000e+00, [[I]]
-; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> <i32 poison, i32 0, i32 2, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> <i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 5, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 poison, i32 12, i32 5, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[I]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> <double 0.000000e+00, double poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x double> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[I75]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer
; CHECK-NEXT: br label [[BB116:%.*]]
; CHECK: bb116:
-; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP15]], i32 1
-; CHECK-NEXT: [[I120:%.*]] = fadd double [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[I117:%.*]] = fmul double 0.000000e+00, [[TMP16]]
+; CHECK-NEXT: [[I119:%.*]] = fmul double 0.000000e+00, [[I96]]
+; CHECK-NEXT: [[I120:%.*]] = fadd double [[I117]], [[I119]]
+; CHECK-NEXT: [[TMP21:%.*]] = fmul double 0.000000e+00, [[I74]]
; CHECK-NEXT: [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP3]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I75]]
; CHECK-NEXT: [[I128:%.*]] = fadd double [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[I139:%.*]] = call double @llvm.maxnum.f64(double [[I128]], double 0.000000e+00)
; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP19]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
index 7bbc694dc5181..f43b1cba84a5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
@@ -4,10 +4,10 @@
define double @test() {
; CHECK-LABEL: define double @test() {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, double [[TMP3]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
@@ -16,14 +16,12 @@ define double @test() {
; CHECK-NEXT: store double [[TMP7]], ptr null, align 16
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = fmul double [[TMP2]], 0.000000e+00
+; CHECK-NEXT: [[TMP9:%.*]] = fmul double [[TMP8]], 0.000000e+00
; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP12]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = fmul double [[TMP3]], 0.000000e+00
; CHECK-NEXT: [[TMP14:%.*]] = fadd double [[TMP13]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = fmul double [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = fadd double [[TMP15]], [[TMP14]]
; CHECK-NEXT: ret double [[TMP16]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index a192808490511..02828a97b5656 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -424,37 +424,19 @@ define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr
define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @ExtractIdxNotConstantInt1(
-; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
-; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
-; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT: ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @ExtractIdxNotConstantInt1(
-; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
-; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
-; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT: ret i1 [[CMP_I185]]
+; CHECK-LABEL: @ExtractIdxNotConstantInt1(
+; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
+; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
+; CHECK-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
+; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
+; CHECK-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
+; CHECK-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
+; CHECK-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
+; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
+; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
%sub14.i167 = fsub float undef, %vecext.i291.i166
@@ -472,37 +454,19 @@ define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %
define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @ExtractIdxNotConstantInt2(
-; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
-; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
-; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT: ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @ExtractIdxNotConstantInt2(
-; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
-; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
-; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT: ret i1 [[CMP_I185]]
+; CHECK-LABEL: @ExtractIdxNotConstantInt2(
+; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
+; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
+; CHECK-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
+; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
+; CHECK-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
+; CHECK-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
+; CHECK-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
+; CHECK-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
+; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
+; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
%sub14.i167 = fsub float undef, %vecext.i291.i166
@@ -520,36 +484,19 @@ define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %
define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @foo(
-; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT: ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @foo(
-; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT: ret i1 [[CMP_I185]]
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
+; CHECK-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
+; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
+; CHECK-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
+; CHECK-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
+; CHECK-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
+; CHECK-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
+; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
+; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
%sub14.i167 = fsub float undef, %vecext.i291.i166
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
index 098a2cd02caed..47fa6245a7a2b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -19,9 +19,10 @@ define void @rotate_with_external_users(ptr %A, ptr %ptr) {
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], <double 4.400000e+00, double 3.300000e+00>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP4]], 3.300000e+00
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP7]], 4.400000e+00
; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]]
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 15dd6756cd7db..99dde849ae514 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -304,14 +304,14 @@ entry:
define void @fpext_gather(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_gather(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT: ret void
;
entry:
>From 99b053789c455763d19aa84e90b1218ec22e1317 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 3 Apr 2026 15:16:22 -0700
Subject: [PATCH 2/3] Fix formatting
Created using spr 1.3.7
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 64ab2228f99a3..b0c95fae70752 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12880,8 +12880,7 @@ unsigned BoUpSLP::getNumVectorInsts() const {
getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
E && E->getVectorFactor() == TE.getVectorFactor())
continue;
- SmallVector<Value *> RevScalars(TE.Scalars.rbegin(),
- TE.Scalars.rend());
+ SmallVector<Value *> RevScalars(TE.Scalars.rbegin(), TE.Scalars.rend());
if (const TreeEntry *E =
getSameValuesTreeEntry(TE.getMainOp(), RevScalars);
E && E->getVectorFactor() == TE.getVectorFactor()) {
>From 4430c7dfe8f05173b19f9d2926668659ee6b7264 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 3 Apr 2026 15:23:53 -0700
Subject: [PATCH 3/3] Address comment
Created using spr 1.3.7
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
.../Transforms/SLPVectorizer/X86/vec3-base.ll | 68 ++++++++++++++++---
2 files changed, 58 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b0c95fae70752..ea95faeb820a7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3781,7 +3781,7 @@ class slpvectorizer::BoUpSLP {
Instruction *I,
const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
- /// Estimates the number of unique scalar instructions in the tree.
+ /// Estimates the number of scalar instructions in the tree.
unsigned getNumScalarInsts() const;
/// Estimates the number of vector instructions (including buildvectors,
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 99dde849ae514..4c394f6805cce 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -slp-inst-count-check=false -S %s | FileCheck --check-prefixes=CHECK,NO-INST-COUNT %s
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_mul_by_constant_store(
@@ -183,6 +184,19 @@ define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
+; NO-INST-COUNT-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; NO-INST-COUNT-NEXT: entry:
+; NO-INST-COUNT-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; NO-INST-COUNT-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
+; NO-INST-COUNT-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; NO-INST-COUNT-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
+; NO-INST-COUNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
+; NO-INST-COUNT-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
+; NO-INST-COUNT-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; NO-INST-COUNT-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
+; NO-INST-COUNT-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
+; NO-INST-COUNT-NEXT: ret void
+;
entry:
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
%l.src.0 = load float , ptr %gep.src.0, align 4
@@ -255,6 +269,14 @@ define void @store_try_reorder(ptr %dst) {
; POW2-ONLY-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
; POW2-ONLY-NEXT: ret void
;
+; NO-INST-COUNT-LABEL: @store_try_reorder(
+; NO-INST-COUNT-NEXT: entry:
+; NO-INST-COUNT-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; NO-INST-COUNT-NEXT: [[ADD216:%.*]] = sub i32 0, 0
+; NO-INST-COUNT-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; NO-INST-COUNT-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
+; NO-INST-COUNT-NEXT: ret void
+;
entry:
%add = add i32 0, 0
store i32 %add, ptr %dst, align 4
@@ -302,17 +324,41 @@ entry:
}
define void @fpext_gather(ptr %dst, double %conv) {
-; CHECK-LABEL: @fpext_gather(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP3:%.*]] = fptrunc double [[CONV:%.*]] to float
-; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
-; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
-; CHECK-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @fpext_gather(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
+; NON-POW2-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
+; NON-POW2-NEXT: store float [[CONV25]], ptr [[LENGTHS]], align 4
+; NON-POW2-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
+; NON-POW2-NEXT: store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
+; NON-POW2-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
+; NON-POW2-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
+; NON-POW2-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @fpext_gather(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
+; POW2-ONLY-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
+; POW2-ONLY-NEXT: store float [[CONV25]], ptr [[LENGTHS]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
+; POW2-ONLY-NEXT: store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
+; POW2-ONLY-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
+; POW2-ONLY-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; NO-INST-COUNT-LABEL: @fpext_gather(
+; NO-INST-COUNT-NEXT: entry:
+; NO-INST-COUNT-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV:%.*]], i32 0
+; NO-INST-COUNT-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; NO-INST-COUNT-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; NO-INST-COUNT-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
+; NO-INST-COUNT-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; NO-INST-COUNT-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
+; NO-INST-COUNT-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
+; NO-INST-COUNT-NEXT: store <2 x float> [[TMP2]], ptr [[ARRAYIDX32]], align 4
+; NO-INST-COUNT-NEXT: ret void
;
entry:
%conv25 = fptrunc double %conv to float
More information about the llvm-commits mailing list