[llvm] [SLP]Initial compatibility support for shl v, 1 and add v, v (PR #181168)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 8 08:13:41 PDT 2026
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/181168
>From ddf410df8a50109fa0c7fcc3e23a845551308ac2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 12 Feb 2026 07:52:22 -0800
Subject: [PATCH 1/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
=?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 227 ++++++++++++++++--
.../SLPVectorizer/X86/entries-different-vf.ll | 20 +-
.../X86/shl-compatible-with-add.ll | 22 +-
.../X86/shl-to-add-transformation.ll | 49 ++--
.../X86/shl-to-add-transformation4.ll | 29 +--
.../X86/shl-to-add-transformation5.ll | 71 +++---
6 files changed, 286 insertions(+), 132 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8fb88a1fac0ef..58c5451010852 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1002,8 +1002,12 @@ class BinOpSameOpcodeHelper {
/// preventing us from determining which instruction it should convert to.
bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
if (Mask & InterchangeableMask) {
- SeenBefore |= OpcodeInMaskForm;
- Mask &= InterchangeableMask;
+ MaskType TempSeenBefore = SeenBefore | OpcodeInMaskForm;
+ MaskType TempMask = Mask & InterchangeableMask;
+ if (!(TempMask & TempSeenBefore))
+ return false;
+ Mask = TempMask;
+ SeenBefore = TempSeenBefore;
return true;
}
return false;
@@ -1083,6 +1087,8 @@ class BinOpSameOpcodeHelper {
APInt ToCIValue;
switch (FromOpcode) {
case Instruction::Shl:
+ if (ToOpcode == Instruction::Add && FromCIValue.isOne())
+ return {I->getOperand(0), I->getOperand(0)};
if (ToOpcode == Instruction::Mul) {
ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
FromCIValue.getZExtValue());
@@ -1205,6 +1211,8 @@ class BinOpSameOpcodeHelper {
case Instruction::Shl:
if (CIValue.ult(CIValue.getBitWidth()))
InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
+ if (CIValue.isOne())
+ InterchangeableMask |= AddBIT;
break;
case Instruction::Mul:
if (CIValue.isOne()) {
@@ -1402,6 +1410,49 @@ class InstructionsState {
Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
}
+ /// Checks if \p V is an expanded binop: a non-copyable instruction (currently
+ /// only shl x, 1) whose matching main or alternate op is an add.
+ bool isExpandedBinOp(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (isCopyableElement(V))
+ return false; // Copyable elements are never reported as expanded.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false; // Only instructions can be expanded.
+ auto CheckForTransformedOpcode = [](const Instruction *Op, Instruction *I) {
+ switch (Op->getOpcode()) { // Opcode of the matching main/alternate op.
+ case Instruction::Add:
+ switch (I->getOpcode()) { // Opcode of the checked instruction itself.
+ case Instruction::Shl:
+ assert(match(I, m_Shl(m_Value(), m_One())) &&
+ "Expected shl x, 1 only.");
+ return true; // shl x, 1 stands in for add x, x.
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ return false;
+ };
+ Instruction *Op = getMatchingMainOpOrAltOp(I); // NOTE(review): dereferenced unchecked above - confirm it cannot be null here.
+ return CheckForTransformedOpcode(Op, I);
+ }
+
+ /// Checks if the operand at index \p Idx of the expanded binop \p I is an
+ /// expanded operand. For shl x, 1 this is operand 1 (the shift amount).
+ bool isExpandedOperand(Instruction *I, unsigned Idx) const {
+ assert(isExpandedBinOp(I) && "Expected an expanded binop.");
+ switch (I->getOpcode()) {
+ case Instruction::Shl:
+ assert(match(I, m_Shl(m_Value(), m_One())) && "Expected shl x, 1 only.");
+ return Idx == 1; // Operand 1 is the constant one matched by m_One().
+ default:
+ llvm_unreachable("Unexpected opcode for an expanded operand.");
+ }
+ }
+
/// Checks if the value is non-schedulable.
bool isNonSchedulable(Value *V) const {
assert(valid() && "InstructionsState is invalid.");
@@ -4211,6 +4262,26 @@ class slpvectorizer::BoUpSLP {
return CopyableElements.contains(V);
}
+ /// Checks if \p V is a value that is not a copyable element here and that
+ /// the instructions state S reports as an expanded binop.
+ bool isExpandedBinOp(Value *V) const {
+ assert(hasState() && "InstructionsState is invalid.");
+ if (isCopyableElement(V))
+ return false; // Copyable elements are never treated as expanded.
+ return S.isExpandedBinOp(V);
+ }
+
+ /// Checks if operand \p Idx of \p I is an expanded operand. Returns false if
+ /// \p I is a copyable element or is not an expanded binop.
+ bool isExpandedOperand(Instruction *I, unsigned Idx) const {
+ assert(hasState() && "InstructionsState is invalid.");
+ if (isCopyableElement(I)) // Also rejected by isExpandedBinOp() below.
+ return false;
+ if (!isExpandedBinOp(I))
+ return false; // Guards the assertion inside S.isExpandedOperand().
+ return S.isExpandedOperand(I, Idx);
+ }
+
/// Returns true if any scalar in the list is a copyable element.
bool hasCopyableElements() const { return !CopyableElements.empty(); }
@@ -4285,8 +4356,11 @@ class slpvectorizer::BoUpSLP {
dbgs().indent(2) << *V << "\n";
}
dbgs() << "Scalars: \n";
- for (Value *V : Scalars)
- dbgs().indent(2) << *V << "\n";
+ for (Value *V : Scalars) {
+ dbgs().indent(2) << *V
+ << ((S && S.isExpandedBinOp(V)) ? " [[Expanded]]\n"
+ : "\n");
+ }
dbgs() << "State: ";
if (S && hasCopyableElements())
dbgs() << "[[Copyable]] ";
@@ -5697,8 +5771,18 @@ class slpvectorizer::BoUpSLP {
for (const Use &U : In->operands()) {
if (auto *I = dyn_cast<Instruction>(U.get())) {
auto Res = OperandsUses.try_emplace(I, 0);
- ++Res.first->getSecond();
- ++TotalOpCount;
+ unsigned Inc = 1;
+ // Count all expanded operands in the binops.
+ for (ScheduleBundle *Bundle : Bundles) {
+ if (const TreeEntry *TE = Bundle->getTreeEntry()) {
+ if (TE->isExpandedBinOp(In))
+ ++Inc;
+ } else if (S.isExpandedBinOp(In)) {
+ ++Inc;
+ }
+ }
+ Res.first->getSecond() += Inc;
+ TotalOpCount += Inc;
}
}
}
@@ -5707,7 +5791,7 @@ class slpvectorizer::BoUpSLP {
auto DecrUnschedForInst =
[&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
- &Checked) {
+ &Checked, bool IsExpandedOperand = false) {
if (!ScheduleCopyableDataMap.empty()) {
const EdgeInfo EI = {UserTE, OpIdx};
if (ScheduleCopyableData *CD =
@@ -5722,7 +5806,8 @@ class slpvectorizer::BoUpSLP {
assert(It != OperandsUses.end() && "Operand not found");
if (It->second > 0) {
if (ScheduleData *OpSD = getScheduleData(I)) {
- if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
+ if (!IsExpandedOperand &&
+ !Checked.insert(std::make_pair(OpSD, OpIdx)).second)
return;
--It->getSecond();
assert(TotalOpCount > 0 && "No more operands to decrement");
@@ -5798,7 +5883,9 @@ class slpvectorizer::BoUpSLP {
Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
<< *I << "\n");
- DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
+ DecrUnschedForInst(
+ I, Bundle->getTreeEntry(), OpIdx, Checked,
+ Bundle->getTreeEntry()->isExpandedOperand(In, OpIdx));
}
// If parent node is schedulable, it will be handled correctly.
if (Bundle->getTreeEntry()->isCopyableElement(In))
@@ -5986,6 +6073,7 @@ class slpvectorizer::BoUpSLP {
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
BoUpSLP *SLP,
+ const SmallPtrSetImpl<Value *> &ExpandedOps,
ArrayRef<ScheduleData *> ControlDeps = {});
/// Sets all instruction in the scheduling region to un-scheduled.
@@ -22487,6 +22575,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
(!EI.UserTE->hasCopyableElements() ||
!EI.UserTE->isCopyableElement(SD->getInst())))
SD->clearDirectDependencies();
+ const bool IsExpandedBinOp = S.isExpandedBinOp(SD->getInst());
for (const Use &U : SD->getInst()->operands()) {
unsigned &NumOps =
UserOpToNumOps
@@ -22502,6 +22591,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
ControlDependentMembers.push_back(OpSD);
+ continue;
+ }
+ }
+ if (IsExpandedBinOp) {
+ if (ScheduleData *OpSD = getScheduleData(U.get());
+ OpSD && OpSD->hasValidDependencies()) {
+ OpSD->clearDirectDependencies();
+ ControlDependentMembers.push_back(OpSD);
+ continue;
}
}
}
@@ -22528,20 +22626,24 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
});
ReSchedule = true;
}
+ SmallPtrSet<Value *, 4> ExpandedOps;
+ for (Value *V : VL) {
+ if (S.isExpandedBinOp(V))
+ ExpandedOps.insert(V);
+ }
// Check if the bundle data has deps for copyable elements already. In
// this case need to reset deps and recalculate it.
if (Bundle && !Bundle.getBundle().empty()) {
- if (S.areInstructionsWithCopyableElements() ||
- !ScheduleCopyableDataMap.empty())
+ if (!ScheduleCopyableDataMap.empty() || !ExpandedOps.empty())
CheckIfNeedToClearDeps(Bundle);
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
- ControlDependentMembers);
+ ExpandedOps, ControlDependentMembers);
} else if (!ControlDependentMembers.empty()) {
ScheduleBundle Invalid = ScheduleBundle::invalid();
calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
- ControlDependentMembers);
+ ExpandedOps, ControlDependentMembers);
}
if (ReSchedule) {
@@ -22606,7 +22708,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReadyInsts.remove(B);
}
- if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
+ if (!S.isCopyableElement(V) && !S.isExpandedBinOp(V) &&
+ !BundleMember->isScheduled())
continue;
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
@@ -22642,6 +22745,15 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
if (S.isNonSchedulable(V))
continue;
auto *I = cast<Instruction>(V);
+ if (S.isExpandedBinOp(I)) {
+ for (Value *Op : I->operands()) {
+ if (ScheduleData *OpSD = getScheduleData(Op);
+ OpSD && OpSD->hasValidDependencies()) {
+ OpSD->clearDirectDependencies();
+ ControlDependentMembers.push_back(OpSD);
+ }
+ }
+ }
if (S.isCopyableElement(I)) {
// Remove the copyable data from the scheduling region and restore
// previous mappings.
@@ -22688,6 +22800,17 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
if (RegionHasStackSave ||
!isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
ControlDependentMembers.push_back(OpSD);
+ if (any_of(VL, [&](Value *V) { return S.isExpandedBinOp(V); })) {
+ // Clear scheduling data for all operands if this node is an operand
+ // of the expanded instruction.
+ for (Value *Op : I->operands()) {
+ if (ScheduleData *OpSD = getScheduleData(Op);
+ OpSD && OpSD->hasValidDependencies()) {
+ OpSD->clearDirectDependencies();
+ ControlDependentMembers.push_back(OpSD);
+ }
+ }
+ }
}
continue;
}
@@ -22695,8 +22818,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
if (!ControlDependentMembers.empty()) {
ScheduleBundle Invalid = ScheduleBundle::invalid();
+ SmallPtrSet<Value *, 4> ExpandedOps;
calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
- ControlDependentMembers);
+ ExpandedOps, ControlDependentMembers);
}
return std::nullopt;
}
@@ -22835,6 +22959,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
void BoUpSLP::BlockScheduling::calculateDependencies(
ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
+ const SmallPtrSetImpl<Value *> &ExpandedOps,
ArrayRef<ScheduleData *> ControlDeps) {
SmallVector<ScheduleEntity *> WorkList;
auto ProcessNode = [&](ScheduleEntity *SE) {
@@ -22873,9 +22998,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
}
} else if (Visited.insert(In).second) {
if (ScheduleData *UseSD = getScheduleData(In)) {
- CD->incDependencies();
+ unsigned Inc = 1;
+ // Increment twice, since the operand was expanded in binop.
+ if (EI.UserTE && EI.UserTE->isExpandedBinOp(In))
+ Inc = 2;
+ for_each(seq(Inc), [&](unsigned) { CD->incDependencies(); });
if (!UseSD->isScheduled())
- CD->incrementUnscheduledDeps(1);
+ CD->incrementUnscheduledDeps(Inc);
if (!UseSD->hasValidDependencies() ||
(InsertInReadyList && UseSD->isReady()))
WorkList.push_back(UseSD);
@@ -22923,9 +23052,17 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
if (areAllOperandsReplacedByCopyableData(
cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
continue;
- BundleMember->incDependencies();
+ unsigned Inc = 1;
+ // Add one more dependency per tree entry (or ExpandedOps hit) expanding U.
+ for (const TreeEntry *UserTE : SLP->getTreeEntries(U)) {
+ if (UserTE->isExpandedBinOp(U))
+ ++Inc;
+ }
+ if (ExpandedOps.contains(U))
+ ++Inc;
+ for_each(seq(Inc), [&](unsigned) { BundleMember->incDependencies(); });
if (!UseSD->isScheduled())
- BundleMember->incrementUnscheduledDeps(1);
+ BundleMember->incrementUnscheduledDeps(Inc);
if (!UseSD->hasValidDependencies() ||
(InsertInReadyList && UseSD->isReady()))
WorkList.push_back(UseSD);
@@ -23197,15 +23334,21 @@ void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
if (!Bundles.empty()) {
for (ScheduleBundle *Bundle : Bundles) {
Bundle->setSchedulingPriority(Idx++);
- if (!Bundle->hasValidDependencies())
- BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
+ if (!Bundle->hasValidDependencies()) {
+ SmallPtrSet<Value *, 4> ExpandedOps;
+ BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this,
+ ExpandedOps);
+ }
}
SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
for (ScheduleCopyableData *SD : reverse(SDs)) {
ScheduleBundle &Bundle = SD->getBundle();
Bundle.setSchedulingPriority(Idx++);
- if (!Bundle.hasValidDependencies())
- BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ if (!Bundle.hasValidDependencies()) {
+ SmallPtrSet<Value *, 4> ExpandedOps;
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+ ExpandedOps);
+ }
}
continue;
}
@@ -23231,14 +23374,19 @@ void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
// recalculate them.
ScheduleBundle Bundle;
Bundle.add(SD);
- BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ SmallPtrSet<Value *, 4> ExpandedOps;
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+ ExpandedOps);
}
}
for (ScheduleCopyableData *SD : reverse(CopyableData)) {
ScheduleBundle &Bundle = SD->getBundle();
Bundle.setSchedulingPriority(Idx++);
- if (!Bundle.hasValidDependencies())
- BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ if (!Bundle.hasValidDependencies()) {
+ SmallPtrSet<Value *, 4> ExpandedOps;
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this,
+ ExpandedOps);
+ }
}
}
BS->initialFillReadyList(ReadyInsts);
@@ -25422,6 +25570,33 @@ class HorizontalReduction {
ReducedVals.erase(std::next(ReducedVals.begin(), SelectIdx));
}
}
+ // Check if shl %x, 1 can be merged with adds.
+ auto ShlIt = UsedReductionOpIds.find(Instruction::Shl);
+ auto AddIt = UsedReductionOpIds.find(Instruction::Add);
+ if (ShlIt != UsedReductionOpIds.end() &&
+ AddIt != UsedReductionOpIds.end()) {
+ unsigned ShlIdx = ShlIt->second;
+ unsigned AddIdx = AddIt->second;
+ if (ReducedVals[ShlIdx].size() < ReductionLimit) {
+ SmallVector<Value *> Shls;
+ SmallVector<Value *> Remaining;
+ for (Value *V : ReducedVals[ShlIdx]) {
+ if (match(V, m_Shl(m_Value(), m_One())))
+ Shls.push_back(V);
+ else
+ Remaining.push_back(V);
+ }
+ // Have compatible shls? Merge them to adds, if so.
+ if (!Shls.empty()) {
+ Shls.append(ReducedVals[AddIdx]);
+ ReducedVals[AddIdx].swap(Shls);
+ if (Remaining.empty())
+ ReducedVals.erase(std::next(ReducedVals.begin(), ShlIdx));
+ else
+ ReducedVals[ShlIdx].swap(Remaining);
+ }
+ }
+ }
}
public:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
index f75c8deddd9bb..38dddb064b751 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
@@ -5,16 +5,16 @@ define i1 @test(i64 %v) {
; CHECK-LABEL: define i1 @test
; CHECK-SAME: (i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[V]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[V]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[V]], 7
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> <i64 undef, i64 undef, i64 0, i64 0>, i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 3, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP4]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[V]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 poison, i64 poison, i64 0, i64 0>, i64 [[V]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> <i64 7, i64 undef, i64 0, i64 0>, i64 [[V]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 3, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <8 x i64> [[TMP10]], zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP11]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
index 04a45e4d416f8..3ed950147e826 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-compatible-with-add.ll
@@ -5,23 +5,21 @@ define void @intrapred_luma(ptr %0, i16 %1, i32 %conv593) {
; CHECK-LABEL: define void @intrapred_luma(
; CHECK-SAME: ptr [[TMP0:%.*]], i16 [[TMP1:%.*]], i32 [[CONV593:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX590:%.*]] = getelementptr i8, ptr [[DOTPRE]], i64 4304
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[CONV5952:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT: [[ADD596:%.*]] = add i32 [[CONV5952]], 1
-; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[ARRAYIDX590:%.*]] = getelementptr i8, ptr [[DOTPRE]], i64 4304
-; CHECK-NEXT: [[TMP5:%.*]] = shl <2 x i32> [[TMP4]], <i32 1, i32 0>
; CHECK-NEXT: [[CONV635:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT: [[ADD633:%.*]] = add i32 [[CONV635]], 1
-; CHECK-NEXT: [[ADD636:%.*]] = add i32 [[ADD633]], [[CONV593]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[ADD596]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV635]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[ADD636]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, i32 [[CONV593]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> <i32 1, i32 poison, i32 0, i32 poison>, i32 [[CONV593]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV635]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, <4 x i32> <i32 3, i32 3, i32 3, i32 5>
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP9]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
index 1cba1bb586e36..9a0b7dc57ef95 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation.ll
@@ -5,38 +5,27 @@ define void @test(ptr %src, i8 %0, i32 %conv2) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr [[SRC:%.*]], i8 [[TMP0:%.*]], i32 [[CONV2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[CONV65:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[ADD36:%.*]] = add i32 [[CONV65]], 1
-; CHECK-NEXT: [[ADD37:%.*]] = or i32 [[ADD36]], [[CONV2]]
-; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[ADD38:%.*]] = or i32 [[ADD37]], [[CONV4]]
-; CHECK-NEXT: [[SHR39:%.*]] = lshr i32 [[ADD38]], 1
-; CHECK-NEXT: [[CONV40:%.*]] = trunc i32 [[SHR39]] to i8
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr i8, ptr [[SRC]], i64 1
-; CHECK-NEXT: store i8 [[CONV40]], ptr [[ARRAYIDX41]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV4]], 1
-; CHECK-NEXT: [[ADD45:%.*]] = or i32 [[ADD]], [[CONV2]]
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
-; CHECK-NEXT: [[CONV8:%.*]] = zext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[ADD46:%.*]] = or i32 [[ADD45]], [[CONV8]]
-; CHECK-NEXT: [[SHR47:%.*]] = lshr i32 [[ADD46]], 1
-; CHECK-NEXT: [[CONV48:%.*]] = trunc i32 [[SHR47]] to i8
-; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr i8, ptr [[SRC]], i64 2
-; CHECK-NEXT: store i8 [[CONV48]], ptr [[ARRAYIDX49]], align 1
-; CHECK-NEXT: [[MUL52:%.*]] = shl i32 [[CONV8]], 1
-; CHECK-NEXT: [[ADD54:%.*]] = or i32 [[MUL52]], 1
-; CHECK-NEXT: [[CONV10:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[ADD54]], [[CONV10]]
-; CHECK-NEXT: [[SHR56:%.*]] = lshr i32 [[ADD55]], 1
-; CHECK-NEXT: [[CONV57:%.*]] = trunc i32 [[SHR56]] to i8
-; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr i8, ptr [[SRC]], i64 3
-; CHECK-NEXT: store i8 [[CONV57]], ptr [[ARRAYIDX58]], align 1
-; CHECK-NEXT: [[ADD63:%.*]] = add i32 [[CONV8]], 1
-; CHECK-NEXT: [[ADD64:%.*]] = or i32 [[ADD63]], [[CONV10]]
-; CHECK-NEXT: [[SHR66:%.*]] = lshr i32 [[ADD64]], 1
-; CHECK-NEXT: [[CONV67:%.*]] = trunc i32 [[SHR66]] to i8
-; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: store i8 [[CONV67]], ptr [[ARRAYIDX68]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 1, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, i32 [[CONV2]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP14]], [[TMP7]]
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = lshr <4 x i32> [[TMP17]], splat (i32 1)
+; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i8>
+; CHECK-NEXT: store <4 x i8> [[TMP19]], ptr [[ARRAYIDX41]], align 1
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
index 68a2138aa88db..3da10dd23ae8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation4.ll
@@ -7,26 +7,27 @@ define fastcc i32 @test(ptr %0, i16 %1, i32 %2) {
; CHECK-LABEL: define fastcc i32 @test(
; CHECK-SAME: ptr [[TMP0:%.*]], i16 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[ADD68:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP3]] to i32
-; CHECK-NEXT: [[ADD68_1:%.*]] = add i32 [[CONV_3_1]], -1
-; CHECK-NEXT: [[ADD118_1:%.*]] = or i32 [[ADD68]], [[ADD68_1]]
-; CHECK-NEXT: [[CMP16_I:%.*]] = icmp slt i32 [[ADD118_1]], 0
-; CHECK-NEXT: [[SUB2_I2:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[ADD56_1:%.*]] = or i32 [[TMP2]], [[CONV_3_1]]
+; CHECK-NEXT: [[ADD68_1:%.*]] = add i32 [[CONV_3_1]], -1
; CHECK-NEXT: [[ADD37_1:%.*]] = add i32 [[CONV_2]], 1
+; CHECK-NEXT: [[ADD68:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[MUL137:%.*]] = shl i32 [[ADD56_1]], 1
+; CHECK-NEXT: [[ADD118_1:%.*]] = or i32 [[ADD68]], [[ADD68_1]]
; CHECK-NEXT: [[SUB138:%.*]] = sub i32 [[ADD37_1]], [[MUL137]]
-; CHECK-NEXT: [[CMP16_I45:%.*]] = icmp slt i32 [[SUB138]], 0
-; CHECK-NEXT: [[SUB2_I44:%.*]] = sub i32 0, [[ADD56_1]]
-; CHECK-NEXT: [[RETVAL_0_I46:%.*]] = select i1 [[CMP16_I45]], i32 [[SUB2_I44]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 12), align 4
-; CHECK-NEXT: [[MUL175_3635:%.*]] = mul i32 [[RETVAL_0_I46]], [[TMP4]]
-; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP16_I]], i32 [[SUB2_I2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 16), align 16
-; CHECK-NEXT: [[MUL175_1:%.*]] = mul i32 [[RETVAL_0_I]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[SUB138]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD118_1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD56_1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[TMP9]], <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr getelementptr inbounds nuw (i8, ptr @dequant_coef, i64 12), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = mul <2 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[MUL175_3635:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
+; CHECK-NEXT: [[MUL175_1:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
; CHECK-NEXT: [[ADD300:%.*]] = or i32 [[MUL175_3635]], [[MUL175_1]]
; CHECK-NEXT: ret i32 [[ADD300]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
index 6fea312b99b25..194898be786ca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shl-to-add-transformation5.ll
@@ -7,30 +7,28 @@ define i32 @test(i32 %0, i32 %1) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[DOTNEG_NEG:%.*]] = shl i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = shl <2 x i32> [[TMP3]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @st, i64 12), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[DOTNEG_NEG:%.*]] = shl i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP9]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @st, i64 12), align 4
; CHECK-NEXT: [[SUB120_3:%.*]] = or i32 [[TMP5]], [[DOTNEG_NEG]]
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> <i32 1, i32 poison, i32 1, i32 1>, i32 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP10]], <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[ADD110]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[DOTNEG_NEG]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = sub <2 x i32> zeroinitializer, [[TMP13]]
-; CHECK-NEXT: store <2 x i32> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 32), align 16
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[SUB120_3]], i32 3
-; CHECK-NEXT: [[TMP17:%.*]] = shl <4 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 16), align 16
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 0, i32 0>, <8 x i32> <i32 0, i32 1, i32 poison, i32 11, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[SUB120_3]], i32 5
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> poison, <8 x i32> <i32 2, i32 poison, i32 2, i32 poison, i32 2, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[DOTNEG_NEG]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 11, i32 7>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP18:%.*]] = sub <8 x i32> [[TMP12]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP12]], [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP19]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT: store <8 x i32> [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 8), align 8
; CHECK-NEXT: ret i32 0
;
entry:
@@ -65,11 +63,11 @@ define i32 @test1(ptr %0, ptr %1, i32 %2) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[ADD53_1:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ADD53_1]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD53_1:%.*]] = add i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP7]], splat (i32 1)
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ADD53_1]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP8]], <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
@@ -119,25 +117,18 @@ define i32 @test2(i32 %0) {
; CHECK-LABEL: define i32 @test2(
; CHECK-SAME: i32 [[TMP0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> splat (i32 1), [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i32> [[TMP8]], <i32 1, i32 0>
-; CHECK-NEXT: [[TMP10:%.*]] = shl <2 x i32> [[TMP9]], splat (i32 1)
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> <i32 0, i32 1, i32 0, i32 0>, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = shl i32 [[TMP0]], 1
; CHECK-NEXT: store i32 [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @st, i64 20), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[ADD110_3]], i32 3
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 4, i32 1, i32 6, i32 1>
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP9]], [[TMP3]]
; CHECK-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[TMP15]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP15]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
>From 8b472645b98b1c842717c1b9df670a45aac63ca2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 13 Feb 2026 04:42:24 -0800
Subject: [PATCH 2/3] Added assertion
Created using spr 1.3.7
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 58c5451010852..21b5ec09a4848 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1437,6 +1437,8 @@ class InstructionsState {
return false;
};
Instruction *Op = getMatchingMainOpOrAltOp(I);
+ assert(Op &&
+ "The instruction should be compatible with either main or alt op.");
return CheckForTransformedOpcode(Op, I);
}
>From e225bc35e105e1fb4e877137fea3911b66028096 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sun, 22 Feb 2026 07:10:10 -0800
Subject: [PATCH 3/3] Fix formatting
Created using spr 1.3.7
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 92df9ab1058e3..2ae640d33ed17 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5819,7 +5819,8 @@ class slpvectorizer::BoUpSLP {
auto DecrUnschedForInst =
[&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
- &Checked, bool IsExpandedOperand = false) {
+ &Checked,
+ bool IsExpandedOperand = false) {
if (!ScheduleCopyableDataMap.empty()) {
const EdgeInfo EI = {UserTE, OpIdx};
if (ScheduleCopyableData *CD =
More information about the llvm-commits mailing list