[llvm] [SLP]Initial compatibility support for shl v, 1 and add v, v (PR #181168)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 8 08:13:41 PDT 2026


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/181168

>From ddf410df8a50109fa0c7fcc3e23a845551308ac2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 12 Feb 2026 07:52:22 -0800
Subject: [PATCH 1/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 227 ++++++++++++++++--
 .../SLPVectorizer/X86/entries-different-vf.ll |  20 +-
 .../X86/shl-compatible-with-add.ll            |  22 +-
 .../X86/shl-to-add-transformation.ll          |  49 ++--
 .../X86/shl-to-add-transformation4.ll         |  29 +--
 .../X86/shl-to-add-transformation5.ll         |  71 +++---
 6 files changed, 286 insertions(+), 132 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8fb88a1fac0ef..58c5451010852 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1002,8 +1002,12 @@ class BinOpSameOpcodeHelper {
     /// preventing us from determining which instruction it should convert to.
     bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
       if (Mask & InterchangeableMask) {
-        SeenBefore |= OpcodeInMaskForm;
-        Mask &= InterchangeableMask;
+        MaskType TempSeenBefore = SeenBefore | OpcodeInMaskForm;
+        MaskType TempMask = Mask & InterchangeableMask;
+        if (!(TempMask & TempSeenBefore))
+          return false;
+        Mask = TempMask;
+        SeenBefore = TempSeenBefore;
         return true;
       }
       return false;
@@ -1083,6 +1087,8 @@ class BinOpSameOpcodeHelper {
       APInt ToCIValue;
       switch (FromOpcode) {
       case Instruction::Shl:
+        if (ToOpcode == Instruction::Add && FromCIValue.isOne())
+          return {I->getOperand(0), I->getOperand(0)};
         if (ToOpcode == Instruction::Mul) {
           ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                           FromCIValue.getZExtValue());
@@ -1205,6 +1211,8 @@ class BinOpSameOpcodeHelper {
       case Instruction::Shl:
         if (CIValue.ult(CIValue.getBitWidth()))
           InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
+        if (CIValue.isOne())
+          InterchangeableMask |= AddBIT;
         break;
       case Instruction::Mul:
         if (CIValue.isOne()) {
@@ -1402,6 +1410,49 @@ class InstructionsState {
            Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
   }
 
+  /// Checks if \p V is an instruction that matches the main or alternate op
+  /// only after being transformed (currently shl x, 1 expanded to add x, x).
+  bool isExpandedBinOp(Value *V) const {
+    assert(valid() && "InstructionsState is invalid.");
+    if (isCopyableElement(V))
+      return false;
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    auto CheckForTransformedOpcode = [](const Instruction *Op, Instruction *I) {
+      switch (Op->getOpcode()) {
+      case Instruction::Add:
+        switch (I->getOpcode()) {
+        case Instruction::Shl:
+          assert(match(I, m_Shl(m_Value(), m_One())) &&
+                 "Expected shl x, 1 only.");
+          return true;
+        default:
+          break;
+        }
+        break;
+      default:
+        break;
+      }
+      return false;
+    };
+    Instruction *Op = getMatchingMainOpOrAltOp(I);
+    return CheckForTransformedOpcode(Op, I);
+  }
+
+  /// Checks if the operand at index \p Idx of instruction \p I is an expanded
+  /// operand.
+  bool isExpandedOperand(Instruction *I, unsigned Idx) const {
+    assert(isExpandedBinOp(I) && "Expected an expanded binop.");
+    switch (I->getOpcode()) {
+    case Instruction::Shl:
+      assert(match(I, m_Shl(m_Value(), m_One())) && "Expected shl x, 1 only.");
+      return Idx == 1;
+    default:
+      llvm_unreachable("Unexpected opcode for an expanded operand.");
+    }
+  }
+
   /// Checks if the value is non-schedulable.
   bool isNonSchedulable(Value *V) const {
     assert(valid() && "InstructionsState is invalid.");
@@ -4211,6 +4262,26 @@ class slpvectorizer::BoUpSLP {
       return CopyableElements.contains(V);
     }
 
+    /// Checks if \p V is an instruction that matches the main or alternate op
+    /// only after being transformed (currently shl x, 1 expanded to add x, x).
+    bool isExpandedBinOp(Value *V) const {
+      assert(hasState() && "InstructionsState is invalid.");
+      if (isCopyableElement(V))
+        return false;
+      return S.isExpandedBinOp(V);
+    }
+
+    /// Checks if the operand at index \p Idx of instruction \p I is an expanded
+    /// operand.
+    bool isExpandedOperand(Instruction *I, unsigned Idx) const {
+      assert(hasState() && "InstructionsState is invalid.");
+      if (isCopyableElement(I))
+        return false;
+      if (!isExpandedBinOp(I))
+        return false;
+      return S.isExpandedOperand(I, Idx);
+    }
+
     /// Returns true if any scalar in the list is a copyable element.
     bool hasCopyableElements() const { return !CopyableElements.empty(); }
 
@@ -4285,8 +4356,11 @@ class slpvectorizer::BoUpSLP {
           dbgs().indent(2) << *V << "\n";
       }
       dbgs() << "Scalars: \n";
-      for (Value *V : Scalars)
-        dbgs().indent(2) << *V << "\n";
+      for (Value *V : Scalars) {
+        dbgs().indent(2) << *V
+                         << ((S && S.isExpandedBinOp(V)) ? " [[Expanded]]\n"
+                                                         : "\n");
+      }
       dbgs() << "State: ";
       if (S && hasCopyableElements())
         dbgs() << "[[Copyable]] ";
@@ -5697,8 +5771,18 @@ class slpvectorizer::BoUpSLP {
             for (const Use &U : In->operands()) {
               if (auto *I = dyn_cast<Instruction>(U.get())) {
                 auto Res = OperandsUses.try_emplace(I, 0);
-                ++Res.first->getSecond();
-                ++TotalOpCount;
+                unsigned Inc = 1;
+                // Count all expanded operands in the binops.
+                for (ScheduleBundle *Bundle : Bundles) {
+                  if (const TreeEntry *TE = Bundle->getTreeEntry()) {
+                    if (TE->isExpandedBinOp(In))
+                      ++Inc;
+                  } else if (S.isExpandedBinOp(In)) {
+                    ++Inc;
+                  }
+                }
+                Res.first->getSecond() += Inc;
+                TotalOpCount += Inc;
               }
             }
           }
@@ -5707,7 +5791,7 @@ class slpvectorizer::BoUpSLP {
           auto DecrUnschedForInst =
               [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                   SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
-                      &Checked) {
+                      &Checked, bool IsExpandedOperand = false) {
                 if (!ScheduleCopyableDataMap.empty()) {
                   const EdgeInfo EI = {UserTE, OpIdx};
                   if (ScheduleCopyableData *CD =
@@ -5722,7 +5806,8 @@ class slpvectorizer::BoUpSLP {
                 assert(It != OperandsUses.end() && "Operand not found");
                 if (It->second > 0) {
                   if (ScheduleData *OpSD = getScheduleData(I)) {
-                    if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
+                    if (!IsExpandedOperand &&
+                        !Checked.insert(std::make_pair(OpSD, OpIdx)).second)
                       return;
                     --It->getSecond();
                     assert(TotalOpCount > 0 && "No more operands to decrement");
@@ -5798,7 +5883,9 @@ class slpvectorizer::BoUpSLP {
                         Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                   LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): "
                                     << *I << "\n");
-                  DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
+                  DecrUnschedForInst(
+                      I, Bundle->getTreeEntry(), OpIdx, Checked,
+                      Bundle->getTreeEntry()->isExpandedOperand(In, OpIdx));
                 }
               // If parent node is schedulable, it will be handled correctly.
               if (Bundle->getTreeEntry()->isCopyableElement(In))
@@ -5986,6 +6073,7 @@ class slpvectorizer::BoUpSLP {
     /// bundles which depend on the original bundle.
     void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                                BoUpSLP *SLP,
+                               const SmallPtrSetImpl<Value *> &ExpandedOps,
                                ArrayRef<ScheduleData *> ControlDeps = {});
 
     /// Sets all instruction in the scheduling region to un-scheduled.
@@ -22487,6 +22575,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
             (!EI.UserTE->hasCopyableElements() ||
              !EI.UserTE->isCopyableElement(SD->getInst())))
           SD->clearDirectDependencies();
+        const bool IsExpandedBinOp = S.isExpandedBinOp(SD->getInst());
         for (const Use &U : SD->getInst()->operands()) {
           unsigned &NumOps =
               UserOpToNumOps
@@ -22502,6 +22591,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
               if (RegionHasStackSave ||
                   !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                 ControlDependentMembers.push_back(OpSD);
+              continue;
+            }
+          }
+          if (IsExpandedBinOp) {
+            if (ScheduleData *OpSD = getScheduleData(U.get());
+                OpSD && OpSD->hasValidDependencies()) {
+              OpSD->clearDirectDependencies();
+              ControlDependentMembers.push_back(OpSD);
+              continue;
             }
           }
         }
@@ -22528,20 +22626,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       });
       ReSchedule = true;
     }
+    SmallPtrSet<Value *, 4> ExpandedOps;
+    for (Value *V : VL) {
+      if (S.isExpandedBinOp(V))
+        ExpandedOps.insert(V);
+    }
     // Check if the bundle data has deps for copyable elements already. In
     // this case need to reset deps and recalculate it.
     if (Bundle && !Bundle.getBundle().empty()) {
-      if (S.areInstructionsWithCopyableElements() ||
-          !ScheduleCopyableDataMap.empty())
+      if (!ScheduleCopyableDataMap.empty() || !ExpandedOps.empty())
         CheckIfNeedToClearDeps(Bundle);
       LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                         << BB->getName() << "\n");
       calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
-                            ControlDependentMembers);
+                            ExpandedOps, ControlDependentMembers);
     } else if (!ControlDependentMembers.empty()) {
       ScheduleBundle Invalid = ScheduleBundle::invalid();
       calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
-                            ControlDependentMembers);
+                            ExpandedOps, ControlDependentMembers);
     }
 
     if (ReSchedule) {
@@ -22606,7 +22708,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
         ReadyInsts.remove(B);
     }
 
-    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
+    if (!S.isCopyableElement(V) && !S.isExpandedBinOp(V) &&
+        !BundleMember->isScheduled())
       continue;
     // A bundle member was scheduled as single instruction before and now
     // needs to be scheduled as part of the bundle. We just get rid of the
@@ -22642,6 +22745,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
       if (S.isNonSchedulable(V))
         continue;
       auto *I = cast<Instruction>(V);
+      if (S.isExpandedBinOp(I)) {
+        for (Value *Op : I->operands()) {
+          if (ScheduleData *OpSD = getScheduleData(Op);
+              OpSD && OpSD->hasValidDependencies()) {
+            OpSD->clearDirectDependencies();
+            ControlDependentMembers.push_back(OpSD);
+          }
+        }
+      }
       if (S.isCopyableElement(I)) {
         // Remove the copyable data from the scheduling region and restore
         // previous mappings.
@@ -22688,6 +22800,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           if (RegionHasStackSave ||
               !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
             ControlDependentMembers.push_back(OpSD);
+          if (any_of(VL, [&](Value *V) { return S.isExpandedBinOp(V); })) {
+            // Clear scheduling data for all operands, if this node is an
+            // operand of the expanded instruction.
+            for (Value *Op : I->operands()) {
+              if (ScheduleData *OpSD = getScheduleData(Op);
+                  OpSD && OpSD->hasValidDependencies()) {
+                OpSD->clearDirectDependencies();
+                ControlDependentMembers.push_back(OpSD);
+              }
+            }
+          }
         }
         continue;
       }
@@ -22695,8 +22818,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     }
     if (!ControlDependentMembers.empty()) {
       ScheduleBundle Invalid = ScheduleBundle::invalid();
+      SmallPtrSet<Value *, 4> ExpandedOps;
       calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
-                            ControlDependentMembers);
+                            ExpandedOps, ControlDependentMembers);
     }
     return std::nullopt;
   }
@@ -22835,6 +22959,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
 
 void BoUpSLP::BlockScheduling::calculateDependencies(
     ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
+    const SmallPtrSetImpl<Value *> &ExpandedOps,
     ArrayRef<ScheduleData *> ControlDeps) {
   SmallVector<ScheduleEntity *> WorkList;
   auto ProcessNode = [&](ScheduleEntity *SE) {
@@ -22873,9 +22998,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
             }
           } else if (Visited.insert(In).second) {
             if (ScheduleData *UseSD = getScheduleData(In)) {
-              CD->incDependencies();
+              unsigned Inc = 1;
+              // Increment twice: the expanded binop uses the operand twice.
+              if (EI.UserTE && EI.UserTE->isExpandedBinOp(In))
+                Inc = 2;
+              for_each(seq(Inc), [&](unsigned) { CD->incDependencies(); });
               if (!UseSD->isScheduled())
-                CD->incrementUnscheduledDeps(1);
+                CD->incrementUnscheduledDeps(Inc);
               if (!UseSD->hasValidDependencies() ||
                   (InsertInReadyList && UseSD->isReady()))
                 WorkList.push_back(UseSD);
@@ -22923,9 +23052,17 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
         if (areAllOperandsReplacedByCopyableData(
                 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
           continue;
-        BundleMember->incDependencies();
+        unsigned Inc = 1;
+        // Add one extra dependency for each entry that treats U as expanded.
+        for (const TreeEntry *UserTE : SLP->getTreeEntries(U)) {
+          if (UserTE->isExpandedBinOp(U))
+            ++Inc;
+        }
+        if (ExpandedOps.contains(U))
+          ++Inc;
+        for_each(seq(Inc), [&](unsigned) { BundleMember->incDependencies(); });
         if (!UseSD->isScheduled())
-          BundleMember->incrementUnscheduledDeps(1);
+          BundleMember->incrementUnscheduledDeps(Inc);
         if (!UseSD->hasValidDependencies() ||
             (InsertInReadyList && UseSD->isReady()))
           WorkList.push_back(UseSD);
@@ -23197,15 +23334,21 @@ void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
     if (!Bundles.empty()) {
       for (ScheduleBundle *Bundle : Bundles) {
         Bundle->setSchedulingPriority(Idx++);
-        if (!Bundle->hasValidDependencies())
-          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
+        if (!Bundle->hasValidDependencies()) {
+          SmallPtrSet<Value *, 4> ExpandedOps;
+          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this,
+                                    ExpandedOps);
+        }
       }
       SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
       for (ScheduleCopyableData *SD : reverse(SDs)) {
         ScheduleBundle &Bundle = SD->getBundle();
         Bundle.setSchedulingPriority(Idx++);
-        if (!Bundle.hasValidDependencies())
-          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+        if (!Bundle.hasValidDependencies()) {
+          SmallPtrSet<Value *, 4> ExpandedOps;
+          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+                                    ExpandedOps);
+        }
       }
       continue;
     }
@@ -23231,14 +23374,19 @@ void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
         // recalculate them.
         ScheduleBundle Bundle;
         Bundle.add(SD);
-        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+        SmallPtrSet<Value *, 4> ExpandedOps;
+        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+                                  ExpandedOps);
       }
     }
     for (ScheduleCopyableData *SD : reverse(CopyableData)) {
       ScheduleBundle &Bundle = SD->getBundle();
       Bundle.setSchedulingPriority(Idx++);
-      if (!Bundle.hasValidDependencies())
-        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+      if (!Bundle.hasValidDependencies()) {
+        SmallPtrSet<Value *, 4> ExpandedOps;
+        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+                                  ExpandedOps);
+      }
     }
   }
   BS->initialFillReadyList(ReadyInsts);
@@ -25422,6 +25570,33 @@ class HorizontalReduction {
         ReducedVals.erase(std::next(ReducedVals.begin(), SelectIdx));
       }
     }
+    // Check if shl %x, 1 can be merged with adds.
+    auto ShlIt = UsedReductionOpIds.find(Instruction::Shl);
+    auto AddIt = UsedReductionOpIds.find(Instruction::Add);
+    if (ShlIt != UsedReductionOpIds.end() &&
+        AddIt != UsedReductionOpIds.end()) {
+      unsigned ShlIdx = ShlIt->second;
+      unsigned AddIdx = AddIt->second;
+      if (ReducedVals[ShlIdx].size() < ReductionLimit) {
+        SmallVector<Value *> Shls;
+        SmallVector<Value *> Remaining;
+        for (Value *V : ReducedVals[ShlIdx]) {
+          if (match(V, m_Shl(m_Value(), m_One())))
+            Shls.push_back(V);
+          else
+            Remaining.push_back(V);
+        }
+        // Have compatible shls? Merge them to adds, if so.
+        if (!Shls.empty()) {
+          Shls.append(ReducedVals[AddIdx]);
+          ReducedVals[AddIdx].swap(Shls);
+          if (Remaining.empty())
+            ReducedVals.erase(std::next(ReducedVals.begin(), ShlIdx));
+          else
+            ReducedVals[ShlIdx].swap(Remaining);
+        }
+      }
+    }
   }
 
 public:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
index f75c8deddd9bb..38dddb064b751 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
@@ -5,16 +5,16 @@ define i1 @test(i64 %v) {
 ; CHECK-LABEL: define i1 @test
 ; CHECK-SAME: (i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[V]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[V]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[V]], 7
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i64> <i64 undef, i64 undef, i64 0, i64 0>, i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 3, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sub <8 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[V]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> <i64 poison, i64 poison, i64 0, i64 0>, i64 [[V]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> <i64 7, i64 undef, i64 0, i64 0>, i64 [[V]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 3, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <8 x i64> [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <8 x i64> [[TMP10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP11]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
index 04a45e4d416f8..3ed950147e826 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
@@ -5,23 +5,21 @@ define void @intrapred_luma(ptr %0, i16 %1, i32 %conv593) {
 ; CHECK-LABEL: define void @intrapred_luma(
 ; CHECK-SAME: ptr [[TMP0:%.*]], i16 [[TMP1:%.*]], i32 [[CONV593:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX590:%.*]] = getelementptr i8, ptr [[DOTPRE]], i64 4304
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
-; CHECK-NEXT:    [[CONV5952:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD596:%.*]] = add i32 [[CONV5952]], 1
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX590:%.*]] = getelementptr i8, ptr [[DOTPRE]], i64 4304
-; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i32> [[TMP4]], <i32 1, i32 0>
 ; CHECK-NEXT:    [[CONV635:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD633:%.*]] = add i32 [[CONV635]], 1
-; CHECK-NEXT:    [[ADD636:%.*]] = add i32 [[ADD633]], [[CONV593]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[ADD596]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV635]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[ADD636]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, i32 [[CONV593]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> <i32 1, i32 poison, i32 0, i32 poison>, i32 [[CONV593]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV635]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, <4 x i32> <i32 3, i32 3, i32 3, i32 5>
 ; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP9]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
index 1cba1bb586e36..9a0b7dc57ef95 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
@@ -5,38 +5,27 @@ define void @test(ptr %src, i8 %0, i32 %conv2) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr [[SRC:%.*]], i8 [[TMP0:%.*]], i32 [[CONV2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[CONV65:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD36:%.*]] = add i32 [[CONV65]], 1
-; CHECK-NEXT:    [[ADD37:%.*]] = or i32 [[ADD36]], [[CONV2]]
-; CHECK-NEXT:    [[CONV4:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD38:%.*]] = or i32 [[ADD37]], [[CONV4]]
-; CHECK-NEXT:    [[SHR39:%.*]] = lshr i32 [[ADD38]], 1
-; CHECK-NEXT:    [[CONV40:%.*]] = trunc i32 [[SHR39]] to i8
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr i8, ptr [[SRC]], i64 1
-; CHECK-NEXT:    store i8 [[CONV40]], ptr [[ARRAYIDX41]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[CONV4]], 1
-; CHECK-NEXT:    [[ADD45:%.*]] = or i32 [[ADD]], [[CONV2]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
-; CHECK-NEXT:    [[CONV8:%.*]] = zext i8 [[TMP1]] to i32
-; CHECK-NEXT:    [[ADD46:%.*]] = or i32 [[ADD45]], [[CONV8]]
-; CHECK-NEXT:    [[SHR47:%.*]] = lshr i32 [[ADD46]], 1
-; CHECK-NEXT:    [[CONV48:%.*]] = trunc i32 [[SHR47]] to i8
-; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr i8, ptr [[SRC]], i64 2
-; CHECK-NEXT:    store i8 [[CONV48]], ptr [[ARRAYIDX49]], align 1
-; CHECK-NEXT:    [[MUL52:%.*]] = shl i32 [[CONV8]], 1
-; CHECK-NEXT:    [[ADD54:%.*]] = or i32 [[MUL52]], 1
-; CHECK-NEXT:    [[CONV10:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[ADD54]], [[CONV10]]
-; CHECK-NEXT:    [[SHR56:%.*]] = lshr i32 [[ADD55]], 1
-; CHECK-NEXT:    [[CONV57:%.*]] = trunc i32 [[SHR56]] to i8
-; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr i8, ptr [[SRC]], i64 3
-; CHECK-NEXT:    store i8 [[CONV57]], ptr [[ARRAYIDX58]], align 1
-; CHECK-NEXT:    [[ADD63:%.*]] = add i32 [[CONV8]], 1
-; CHECK-NEXT:    [[ADD64:%.*]] = or i32 [[ADD63]], [[CONV10]]
-; CHECK-NEXT:    [[SHR66:%.*]] = lshr i32 [[ADD64]], 1
-; CHECK-NEXT:    [[CONV67:%.*]] = trunc i32 [[SHR66]] to i8
-; CHECK-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
-; CHECK-NEXT:    store i8 [[CONV67]], ptr [[ARRAYIDX68]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 1, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, i32 [[CONV2]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[TMP14]], [[TMP7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]], splat (i32 1)
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i8>
+; CHECK-NEXT:    store <4 x i8> [[TMP19]], ptr [[ARRAYIDX41]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
index 68a2138aa88db..3da10dd23ae8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
@@ -7,26 +7,27 @@ define fastcc i32 @test(ptr %0, i16 %1, i32 %2) {
 ; CHECK-LABEL: define fastcc i32 @test(
 ; CHECK-SAME: ptr [[TMP0:%.*]], i16 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP1]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[TMP0]], align 2
-; CHECK-NEXT:    [[ADD68:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT:    [[ADD68_1:%.*]] = add i32 [[CONV_3_1]], -1
-; CHECK-NEXT:    [[ADD118_1:%.*]] = or i32 [[ADD68]], [[ADD68_1]]
-; CHECK-NEXT:    [[CMP16_I:%.*]] = icmp slt i32 [[ADD118_1]], 0
-; CHECK-NEXT:    [[SUB2_I2:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP1]] to i32
 ; CHECK-NEXT:    [[ADD56_1:%.*]] = or i32 [[TMP2]], [[CONV_3_1]]
+; CHECK-NEXT:    [[ADD68_1:%.*]] = add i32 [[CONV_3_1]], -1
 ; CHECK-NEXT:    [[ADD37_1:%.*]] = add i32 [[CONV_2]], 1
+; CHECK-NEXT:    [[ADD68:%.*]] = add i32 [[TMP2]], 1
 ; CHECK-NEXT:    [[MUL137:%.*]] = shl i32 [[ADD56_1]], 1
+; CHECK-NEXT:    [[ADD118_1:%.*]] = or i32 [[ADD68]], [[ADD68_1]]
 ; CHECK-NEXT:    [[SUB138:%.*]] = sub i32 [[ADD37_1]], [[MUL137]]
-; CHECK-NEXT:    [[CMP16_I45:%.*]] = icmp slt i32 [[SUB138]], 0
-; CHECK-NEXT:    [[SUB2_I44:%.*]] = sub i32 0, [[ADD56_1]]
-; CHECK-NEXT:    [[RETVAL_0_I46:%.*]] = select i1 [[CMP16_I45]], i32 [[SUB2_I44]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 12), align 4
-; CHECK-NEXT:    [[MUL175_3635:%.*]] = mul i32 [[RETVAL_0_I46]], [[TMP4]]
-; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = select i1 [[CMP16_I]], i32 [[SUB2_I2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 16), align 16
-; CHECK-NEXT:    [[MUL175_1:%.*]] = mul i32 [[RETVAL_0_I]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[SUB138]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD118_1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD56_1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[TMP9]], <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i32>, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 12), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <2 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[MUL175_3635:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
+; CHECK-NEXT:    [[MUL175_1:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[ADD300:%.*]] = or i32 [[MUL175_3635]], [[MUL175_1]]
 ; CHECK-NEXT:    ret i32 [[ADD300]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
index 6fea312b99b25..194898be786ca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
@@ -7,30 +7,28 @@ define i32 @test(i32 %0, i32 %1) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[DOTNEG_NEG:%.*]] = shl i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shl <2 x i32> [[TMP3]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @st, i64 12), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[DOTNEG_NEG:%.*]] = shl i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP9]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @st, i64 12), align 4
 ; CHECK-NEXT:    [[SUB120_3:%.*]] = or i32 [[TMP5]], [[DOTNEG_NEG]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> <i32 1, i32 poison, i32 1, i32 1>, i32 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = shl <4 x i32> [[TMP10]], <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[ADD110]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[DOTNEG_NEG]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sub <2 x i32> zeroinitializer, [[TMP13]]
-; CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 32), align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[SUB120_3]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = shl <4 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 16), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 0, i32 0>, <8 x i32> <i32 0, i32 1, i32 poison, i32 11, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[SUB120_3]], i32 5
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> poison, <8 x i32> <i32 2, i32 poison, i32 2, i32 poison, i32 2, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[DOTNEG_NEG]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 11, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = sub <8 x i32> [[TMP12]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add <8 x i32> [[TMP12]], [[TMP17]]
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP19]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT:    store <8 x i32> [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -65,11 +63,11 @@ define i32 @test1(ptr %0, ptr %1, i32 %2) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[ADD53_1:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ADD53_1]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD53_1:%.*]] = add i32 [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP7]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ADD53_1]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP8]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
@@ -119,25 +117,18 @@ define i32 @test2(i32 %0) {
 ; CHECK-LABEL: define i32 @test2(
 ; CHECK-SAME: i32 [[TMP0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i32> splat (i32 1), [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i32> [[TMP8]], <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = shl <2 x i32> [[TMP9]], splat (i32 1)
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> <i32 0, i32 1, i32 0, i32 0>, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shl i32 [[TMP0]], 1
 ; CHECK-NEXT:    store i32 [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 20), align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[ADD110_3]], i32 3
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 6, i32 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP9]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP15]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[TMP15]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>

>From 8b472645b98b1c842717c1b9df670a45aac63ca2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 13 Feb 2026 04:42:24 -0800
Subject: [PATCH 2/3] Added assertion

Created using spr 1.3.7
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 58c5451010852..21b5ec09a4848 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1437,6 +1437,8 @@ class InstructionsState {
       return false;
     };
     Instruction *Op = getMatchingMainOpOrAltOp(I);
+    assert(Op &&
+           "The instruction should be compatible with either main or alt op.");
     return CheckForTransformedOpcode(Op, I);
   }
 

>From e225bc35e105e1fb4e877137fea3911b66028096 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sun, 22 Feb 2026 07:10:10 -0800
Subject: [PATCH 3/3] Fix formatting

Created using spr 1.3.7
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 92df9ab1058e3..2ae640d33ed17 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5819,7 +5819,8 @@ class slpvectorizer::BoUpSLP {
           auto DecrUnschedForInst =
               [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                   SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
-                      &Checked, bool IsExpandedOperand = false) {
+                      &Checked,
+                  bool IsExpandedOperand = false) {
                 if (!ScheduleCopyableDataMap.empty()) {
                   const EdgeInfo EI = {UserTE, OpIdx};
                   if (ScheduleCopyableData *CD =



More information about the llvm-commits mailing list