[llvm] [SLP] Reject 2-element vectorization when vector inst count exceeds scalar (PR #190414)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 3 15:13:13 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
The LLVM cost model uses integer-valued throughput costs which cannot
represent fractional costs. For 2-element vectors, this rounding can
make vectorization appear profitable when it actually produces more
instructions than the scalar code — the overhead from shuffles, inserts,
extracts, and buildvectors is underestimated.
Add an instruction-count safety check in getTreeCost that estimates
the number of vector instructions (including gathers, shuffles, and
extracts) and compares against the number of scalar instructions.
If the vector code would produce more instructions, reject the tree
regardless of what the cost model says. This catches cases where
fractional cost rounding hides real overhead.
The check is gated behind -slp-inst-count-check (default: on) and
only applies when the cost threshold (-slp-threshold) is zero and the
root tree has a vector factor of 2, where rounding errors matter most.
---
Patch is 37.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190414.diff
10 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+135)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll (+7-7)
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll (+5-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll (+6-13)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll (+9-14)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll (+14-15)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll (+6-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll (+39-92)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll (+4-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll (+5-5)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f2ccf198c4c81..64ab2228f99a3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -142,6 +142,11 @@ static cl::opt<bool> SplitAlternateInstructions(
"slp-split-alternate-instructions", cl::init(true), cl::Hidden,
cl::desc("Improve the code quality by splitting alternate instructions"));
+static cl::opt<bool> SLPInstCountCheck(
+ "slp-inst-count-check", cl::init(true), cl::Hidden,
+ cl::desc("Reject vectorization if vector instruction count exceeds "
+ "scalar instruction count"));
+
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -3776,6 +3781,13 @@ class slpvectorizer::BoUpSLP {
Instruction *I,
const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
+ /// Estimates the number of unique scalar instructions in the tree.
+ unsigned getNumScalarInsts() const;
+
+ /// Estimates the number of vector instructions (including buildvectors,
+ /// shuffles, and extracts) that the tree will produce.
+ unsigned getNumVectorInsts() const;
+
/// Return information about the vector formed for the specified index
/// of a vector of (the same) instruction.
TargetTransformInfo::OperandValueInfo
@@ -12817,6 +12829,113 @@ bool BoUpSLP::areAllUsersVectorized(
});
}
+unsigned BoUpSLP::getNumScalarInsts() const {
+ unsigned Count = 0;
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ const TreeEntry &TE = *Ptr;
+ if (DeletedNodes.contains(&TE))
+ continue;
+ if (TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
+ // Count instruction scalars in gathers — they exist in the scalar
+ // code regardless of vectorization. ExtractElement instructions
+ // become free when the vector input is used directly.
+ for (Value *V : TE.Scalars)
+ if (isa<Instruction>(V))
+ ++Count;
+ continue;
+ }
+ // Each vectorize entry represents a bundle of scalar instructions.
+ // Count per-entry without cross-entry deduplication, since shared
+ // scalars across entries still represent separate work in scalar code.
+ for (Value *V : TE.Scalars) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || (TE.hasCopyableElements() && TE.isCopyableElement(V)))
+ continue;
+ ++Count;
+ }
+ }
+ return Count;
+}
+
+unsigned BoUpSLP::getNumVectorInsts() const {
+ unsigned Count = 0;
+ SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
+ for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+ const TreeEntry &TE = *Ptr;
+ if (DeletedNodes.contains(&TE))
+ continue;
+ if (TE.State == TreeEntry::CombinedVectorize)
+ continue;
+ if (TE.CombinedOp == TreeEntry::ReducedBitcast ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastBSwap ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastLoads ||
+ TE.CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
+ TE.CombinedOp == TreeEntry::ReducedCmpBitcast)
+ continue;
+ bool IsGatherOrTransformed =
+ TE.isGather() || TransformedToGatherNodes.contains(&TE);
+ if (IsGatherOrTransformed) {
+ if (TE.hasState()) {
+ if (const TreeEntry *E =
+ getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
+ E && E->getVectorFactor() == TE.getVectorFactor())
+ continue;
+ SmallVector<Value *> RevScalars(TE.Scalars.rbegin(),
+ TE.Scalars.rend());
+ if (const TreeEntry *E =
+ getSameValuesTreeEntry(TE.getMainOp(), RevScalars);
+ E && E->getVectorFactor() == TE.getVectorFactor()) {
+ ++Count;
+ continue;
+ }
+ }
+ // ExtractElement gathers from the same source vector become a single
+ // shufflevector. Collect source vectors globally across all gather
+ // entries and count once at the end.
+ if (all_of(TE.Scalars,
+ IsaPred<ExtractElementInst, UndefValue, Constant>)) {
+ for (Value *V : TE.Scalars)
+ if (auto *EE = dyn_cast<ExtractElementInst>(V))
+ GatherExtractSourceVecs.insert(EE->getVectorOperand());
+ } else {
+ for (Value *V : TE.Scalars) {
+ if (!isConstant(V) && !isa<PoisonValue>(V))
+ ++Count;
+ }
+ }
+ continue;
+ }
+ // InsertElement/ExtractElement vectorize entries don't produce real
+ // vector instructions — InsertElement at root IS the result, and
+ // ExtractElement entries reference the input vector directly.
+ if (TE.getOpcode() == Instruction::InsertElement ||
+ TE.getOpcode() == Instruction::ExtractElement)
+ continue;
+ if (TE.State == TreeEntry::SplitVectorize)
+ Count += 2;
+ else
+ ++Count;
+ if (!TE.ReorderIndices.empty() || !TE.ReuseShuffleIndices.empty())
+ ++Count;
+ }
+ Count += GatherExtractSourceVecs.size();
+ // Count extract instructions from ExternalUses, skipping insertelements
+ // (those get folded into shuffles, not real extracts).
+ SmallPtrSet<Value *, 8> CountedExtracts;
+ for (const ExternalUser &EU : ExternalUses) {
+ if (isa_and_nonnull<InsertElementInst>(EU.User))
+ continue;
+ if (EU.User && EphValues.count(EU.User))
+ continue;
+ if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
+ continue;
+ if (!CountedExtracts.insert(EU.Scalar).second)
+ continue;
+ ++Count;
+ }
+ return Count;
+}
+
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
SmallVectorImpl<Value *> *OpScalars,
@@ -18082,6 +18201,22 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
ArrayRef<Value *> VectorizedVals,
InstructionCost ReductionCost,
Instruction *RdxRoot) {
+ // Reject vectorization if the vector code would produce more instructions
+ // than the scalar code. The cost model may underestimate overhead from
+ // shuffles, inserts, and extracts.
+ if (SLPInstCountCheck && VectorizableTree.front()->getVectorFactor() == 2 &&
+ SLPCostThreshold == 0) {
+ unsigned NumScalar = getNumScalarInsts();
+ unsigned NumVector = getNumVectorInsts();
+ LLVM_DEBUG(dbgs() << "SLP: Inst count check: vector=" << NumVector
+ << " scalar=" << NumScalar << "\n");
+ if (NumVector > NumScalar) {
+ LLVM_DEBUG(dbgs() << "SLP: Rejecting tree: vector inst count "
+ << NumVector << " > scalar inst count " << NumScalar
+ << ".\n");
+ return InstructionCost::getInvalid();
+ }
+ }
InstructionCost Cost = TreeCost;
SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, unsigned>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 9b0f5416db725..596154c23c6fc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,21 +216,21 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
+; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[A1]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
+; CHECK-NEXT: [[L_4:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float 3.000000e+00, [[L_0]]
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_2]]
-; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast float [[X:%.*]], [[L_3]]
+; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_4]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index d527d38adbee3..d91ad0621fbfe 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -369,14 +369,14 @@ entry:
define void @fpext_gather(ptr %dst, double %conv) {
; CHECK-LABEL: @fpext_gather(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[CONV:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = fptrunc double [[CONV:%.*]] to float
; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr [[LENGTHS]], align 4
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT: [[CONV34:%.*]] = fptrunc double [[CONV]] to float
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
+; CHECK-NEXT: store float [[CONV34]], ptr [[ARRAYIDX37]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
index d23e54f3495bd..8e3fbf0b53324 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
@@ -64,25 +64,18 @@ define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture nounde
; SSE2: if.end:
; SSE2-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
; SSE2-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]]
+; SSE2-NEXT: [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]]
; SSE2-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
-; SSE2-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
-; SSE2-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
-; SSE2-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; SSE2-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
-; SSE2-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
-; SSE2-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
-; SSE2-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
-; SSE2-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]]
-; SSE2-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1
+; SSE2-NEXT: [[TMP34:%.*]] = fdiv double [[ADD]], [[MUL88]]
+; SSE2-NEXT: [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]]
+; SSE2-NEXT: [[TMP35:%.*]] = fdiv double [[SUB90]], [[MUL88]]
; SSE2-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
-; SSE2-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0
; SSE2-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D
; SSE2-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
; SSE2-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
; SSE2: lor.lhs.false:
-; SSE2-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00)
-; SSE2-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; SSE2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
+; SSE2-NEXT: [[TMP38:%.*]] = fcmp ule double [[TMP34]], 1.000000e+00
+; SSE2-NEXT: [[TMP37:%.*]] = fcmp ule double [[TMP35]], 1.000000e+00
; SSE2-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]]
; SSE2-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
; SSE2-NEXT: br label [[CLEANUP]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index 0dd5e44889c1e..155b1ce27ac9d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -11,27 +11,22 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP11]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
; CHECK-NEXT: br label %[[IF_END]]
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[TMP6:%.*]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[TMP0]], %[[ENTRY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ [[TMP1]], %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x float> [ [[TMP5]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP5]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP15]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP6]], i32 3
; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP28]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP33]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
index 73b73735da021..c945799b38122 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
@@ -5,32 +5,31 @@ define void @test(double %i) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]]
+; CHECK-NEXT: [[I74:%.*]] = fsub double 0.000000e+00, poison
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[I]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: [[I96:%.*]] = fsub double poison, 0.000000e+00
; CHECK-NEXT: [[I75:%.*]] = fsub double 0.000000e+00, [[I]]
-; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> <i32 poison, i32 0, i32 2, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> <i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 5, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 poison, i32 12, i32 5, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[I]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> <double 0.000000e+00, double poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x double> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x double> [[TMP...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/190414
More information about the llvm-commits
mailing list