[llvm] e858b10 - Revert "[SLP]Reduce number of alternate instruction, where possible"
Hans Wennborg via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 07:04:49 PDT 2025
Author: Hans Wennborg
Date: 2025-03-11T15:04:36+01:00
New Revision: e858b10917046b83234bf1931485df414fcded3c
URL: https://github.com/llvm/llvm-project/commit/e858b10917046b83234bf1931485df414fcded3c
DIFF: https://github.com/llvm/llvm-project/commit/e858b10917046b83234bf1931485df414fcded3c.diff
LOG: Revert "[SLP]Reduce number of alternate instruction, where possible"
This caused failures such as:
Instruction does not dominate all uses!
%29 = insertelement <8 x i64> %28, i64 %xor6.i.5, i64 6
%17 = shufflevector <8 x i64> %29, <8 x i64> poison, <6 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
Note that the use (%17) is numbered before its operand's definition (%29), i.e. the shufflevector consumes a value that is only inserted later in the function. See the comment on https://github.com/llvm/llvm-project/pull/123360
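For reference, the verifier emits that diagnostic for any use that precedes its definition. A minimal standalone reproducer (hypothetical IR, not taken from the failing test) that `opt -passes=verify` rejects with the same message:

```
define <2 x i64> @use_before_def(<2 x i64> %v, i64 %x) {
  ; Invalid on purpose: %s reads %w one line before %w is defined,
  ; which is exactly the "Instruction does not dominate all uses!" case.
  %s = shufflevector <2 x i64> %w, <2 x i64> poison, <2 x i32> <i32 0, i32 1>
  %w = insertelement <2 x i64> %v, i64 %x, i64 0
  ret <2 x i64> %s
}
```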
> Previous version was reviewed here https://github.com/llvm/llvm-project/pull/123360
> It is mostly the same, adjusted after graph-to-tree transformation
>
> The patch tries to remove wide alternate operations.
> Currently the SLP vectorizer emits something like this:
> ```
> %0 = add i32
> %1 = sub i32
> %2 = add i32
> %3 = sub i32
> %4 = add i32
> %5 = sub i32
> %6 = add i32
> %7 = sub i32
>
> transforms to
>
> %v1 = add <8 x i32>
> %v2 = sub <8 x i32>
> %res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15>
> ```
> i.e. half of the results are just unused. This leads to increased
> register pressure and potentially doubles the number of operations.
>
> The patch introduces a SplitVectorize mode, which splits the operations by
> opcode and instead produces something like this:
> ```
> %v1 = add <4 x i32>
> %v2 = sub <4 x i32>
> %res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7>
> ```
> This improves performance by reducing the number of ops. It also
> enables other improvements, such as better graph reordering.
>
> [...]
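To make the quoted sketches concrete, here is the same contrast as self-contained IR using the exact blend masks from the commit message (hypothetical functions, not from the patch's tests; the operand-regrouping shuffles in @alt_split stand in for the half-width operand subtrees the real transform builds directly):

```
define <8 x i32> @alt_wide(<8 x i32> %x, <8 x i32> %y) {
  ; Pre-patch shape: both wide ops compute all 8 lanes; half of each
  ; result is thrown away by the blend.
  %va = add <8 x i32> %x, %y
  %vb = sub <8 x i32> %x, %y
  %r = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x i32> %r
}

define <8 x i32> @alt_split(<8 x i32> %x, <8 x i32> %y) {
  ; SplitVectorize shape: regroup operands so the add lanes and sub lanes
  ; are contiguous, run half-width ops, then restore the original lane
  ; order with one interleaving shuffle. Computes the same result as
  ; @alt_wide.
  %xa = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %ya = shufflevector <8 x i32> %y, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %xb = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %yb = shufflevector <8 x i32> %y, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %va = add <4 x i32> %xa, %ya
  %vb = sub <4 x i32> %xb, %yb
  %r = shufflevector <4 x i32> %va, <4 x i32> %vb, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %r
}
```

The split form executes two 4-wide ops plus shuffles instead of two 8-wide ops plus a blend, which is where the op-count and register-pressure win comes from.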
This reverts commit 9d37e61fc77d3d6de891c30630f1c0227522031d as well as
the follow-up commit 72bb0a9a9c6fdde43e1e191f2dc0d5d2d46aff4e.
Added:
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
llvm/test/Transforms/SLPVectorizer/X86/phi.ll
llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
llvm/test/Transforms/SLPVectorizer/addsub.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 4d311e7e9fd6a..3081379bafd06 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1771,10 +1771,6 @@ class TargetTransformInfo {
/// scalable version of the vectorized loop.
bool preferFixedOverScalableIfEqualCost() const;
- /// \returns True if target prefers SLP vectorizer with alternate opcode
- /// vectorization, false - otherwise.
- bool preferAlternateOpcodeVectorization() const;
-
/// \returns True if the target prefers reductions in loop.
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const;
@@ -2329,7 +2325,6 @@ class TargetTransformInfo::Concept {
virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty) const = 0;
virtual bool preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const = 0;
- virtual bool preferAlternateOpcodeVectorization() const = 0;
virtual bool preferEpilogueVectorization() const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
@@ -3140,9 +3135,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const override {
return Impl.preferInLoopReduction(Opcode, Ty);
}
- bool preferAlternateOpcodeVectorization() const override {
- return Impl.preferAlternateOpcodeVectorization();
- }
bool preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const override {
return Impl.preferPredicatedReductionSelect(Opcode, Ty);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c15694916a732..63fe7debfb8c7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1007,7 +1007,6 @@ class TargetTransformInfoImplBase {
bool preferFixedOverScalableIfEqualCost() const { return false; }
bool preferInLoopReduction(unsigned Opcode, Type *Ty) const { return false; }
- bool preferAlternateOpcodeVectorization() const { return true; }
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const {
return false;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 36f2983390a48..3d43f39439625 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1384,10 +1384,6 @@ bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode,
return TTIImpl->preferInLoopReduction(Opcode, Ty);
}
-bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
- return TTIImpl->preferAlternateOpcodeVectorization();
-}
-
bool TargetTransformInfo::preferPredicatedReductionSelect(unsigned Opcode,
Type *Ty) const {
return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ac46db5faf28d..020a2b8d4edfb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -125,8 +125,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
- bool preferAlternateOpcodeVectorization() const { return false; }
-
bool preferEpilogueVectorization() const {
// Epilogue vectorization is usually unprofitable - tail folding or
// a smaller VF would have been better. This a blunt hammer - we
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c916da7f275d7..8fcaee0c7017f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -292,7 +292,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
- bool preferAlternateOpcodeVectorization() const { return false; }
bool prefersVectorizedAddressing() const;
bool supportsEfficientVectorElementLoadStore() const;
bool enableInterleavedAccessVectorization();
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9f61d7a9798a..4decf5bec9514 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -141,10 +141,6 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore(
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
-static cl::opt<bool> SplitAlternateInstructions(
- "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
- cl::desc("Improve the code quality by splitting alternate instructions"));
-
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
@@ -844,35 +840,6 @@ class InstructionsState {
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
}
- /// Checks if main/alt instructions are shift operations.
- bool isShiftOp() const {
- return getMainOp()->isShift() && getAltOp()->isShift();
- }
-
- /// Checks if main/alt instructions are bitwise logic operations.
- bool isBitwiseLogicOp() const {
- return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
- }
-
- /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
- bool isMulDivLikeOp() const {
- constexpr std::array<unsigned, 8> MulDiv = {
- Instruction::Mul, Instruction::FMul, Instruction::SDiv,
- Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
- Instruction::URem, Instruction::FRem};
- return is_contained(MulDiv, getOpcode()) &&
- is_contained(MulDiv, getAltOpcode());
- }
-
- /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
- bool isAddSubLikeOp() const {
- constexpr std::array<unsigned, 4> AddSub = {
- Instruction::Add, Instruction::Sub, Instruction::FAdd,
- Instruction::FSub};
- return is_contained(AddSub, getOpcode()) &&
- is_contained(AddSub, getAltOpcode());
- }
-
/// Checks if the current state is valid, i.e. has non-null MainOp
bool valid() const { return MainOp && AltOp; }
@@ -1505,7 +1472,6 @@ class BoUpSLP {
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntries.clear();
- ScalarsInSplitNodes.clear();
MustGather.clear();
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
@@ -1541,7 +1507,7 @@ class BoUpSLP {
/// should be represented as an empty order, so this is used to
/// decide if we can canonicalize a computed order. Undef elements
/// (represented as size) are ignored.
- static bool isIdentityOrder(ArrayRef<unsigned> Order) {
+ bool isIdentityOrder(ArrayRef<unsigned> Order) const {
assert(!Order.empty() && "expected non-empty order");
const unsigned Sz = Order.size();
return all_of(enumerate(Order), [&](const auto &P) {
@@ -3263,35 +3229,12 @@ class BoUpSLP {
/// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const {
- if (State == TreeEntry::SplitVectorize)
- return {};
SmallVector<int> Mask;
inversePermutation(ReorderIndices, Mask);
::addMask(Mask, ReuseShuffleIndices);
return Mask;
}
- /// \returns The mask for split nodes.
- SmallVector<int> getSplitMask() const {
- assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
- "Expected only split vectorize node.");
- SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
- unsigned CommonVF = std::max<unsigned>(
- CombinedEntriesWithIndices.back().second,
- Scalars.size() - CombinedEntriesWithIndices.back().second);
- for (auto [Idx, I] : enumerate(ReorderIndices))
- Mask[I] =
- Idx + (Idx >= CombinedEntriesWithIndices.back().second
- ? CommonVF - CombinedEntriesWithIndices.back().second
- : 0);
- return Mask;
- }
-
- /// Updates (reorders) SplitVectorize node according to the given mask \p
- /// Mask and order \p MaskOrder.
- void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
- ArrayRef<int> MaskOrder);
-
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -3379,8 +3322,6 @@ class BoUpSLP {
///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in
///< the pattern, not the very first one.
- SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
- ///< independently and then combines back.
};
EntryState State;
@@ -3411,7 +3352,7 @@ class BoUpSLP {
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
- /// For gather/buildvector/alt opcode nodes, which are combined from
+ /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
/// other nodes as a series of insertvector instructions.
SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
@@ -3606,9 +3547,6 @@ class BoUpSLP {
case CombinedVectorize:
dbgs() << "CombinedVectorize\n";
break;
- case SplitVectorize:
- dbgs() << "SplitVectorize\n";
- break;
}
if (S) {
dbgs() << "MainOp: " << *S.getMainOp() << "\n";
@@ -3689,10 +3627,8 @@ class BoUpSLP {
const EdgeInfo &UserTreeIdx,
ArrayRef<int> ReuseShuffleIndices = {},
ArrayRef<unsigned> ReorderIndices = {}) {
- assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
- EntryState == TreeEntry::SplitVectorize)) ||
- (Bundle && EntryState != TreeEntry::NeedToGather &&
- EntryState != TreeEntry::SplitVectorize)) &&
+ assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
+ (Bundle && EntryState != TreeEntry::NeedToGather)) &&
"Need to vectorize gather entry?");
// Gathered loads still gathered? Do not create entry, use the original one.
if (GatheredLoadsEntriesFirst.has_value() &&
@@ -3726,38 +3662,11 @@ class BoUpSLP {
return VL[Idx];
});
InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
- if (S) {
+ if (S)
Last->setOperations(S);
- } else if (EntryState == TreeEntry::SplitVectorize) {
- auto *MainOp =
- cast<Instruction>(*find_if(Last->Scalars, IsaPred<Instruction>));
- auto *AltOp = cast<Instruction>(*find_if(Last->Scalars, [=](Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- return I && I->getOpcode() != MainOp->getOpcode();
- }));
- Last->setOperations(InstructionsState(MainOp, AltOp));
- }
- if (EntryState == TreeEntry::SplitVectorize) {
- SmallPtrSet<Value *, 4> Processed;
- for (Value *V : VL) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
- auto It = ScalarsInSplitNodes.find(V);
- if (It == ScalarsInSplitNodes.end()) {
- ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
- Last);
- (void)Processed.insert(V);
- } else if (Processed.insert(V).second) {
- assert(!is_contained(It->getSecond(), Last) &&
- "Value already associated with the node.");
- It->getSecond().push_back(Last);
- }
- }
- }
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
}
- if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) {
+ if (!Last->isGather()) {
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
if (isa<PoisonValue>(V))
@@ -3794,7 +3703,7 @@ class BoUpSLP {
}
}
assert(!BundleMember && "Bundle and VL out of sync");
- } else if (Last->isGather()) {
+ } else {
// Build a map for gathered scalars to the nodes where they are used.
bool AllConstsOrCasts = true;
for (Value *V : VL)
@@ -3839,15 +3748,6 @@ class BoUpSLP {
return It->getSecond();
}
- /// Get list of split vector entries, associated with the value \p V.
- ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
- assert(V && "V cannot be nullptr.");
- auto It = ScalarsInSplitNodes.find(V);
- if (It == ScalarsInSplitNodes.end())
- return {};
- return It->getSecond();
- }
-
/// Returns first vector node for value \p V, matching values \p VL.
TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
bool SameVF = false) const {
@@ -3878,9 +3778,6 @@ class BoUpSLP {
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
- /// Scalars, used in split vectorize nodes.
- SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
-
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -5867,14 +5764,12 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
!Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
- if (TE.State == TreeEntry::SplitVectorize ||
- ((TE.State == TreeEntry::Vectorize ||
- TE.State == TreeEntry::StridedVectorize) &&
- (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
- (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
- assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
- "Alternate instructions are only supported by "
- "BinaryOperator and CastInst.");
+ if ((TE.State == TreeEntry::Vectorize ||
+ TE.State == TreeEntry::StridedVectorize) &&
+ (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+ (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
+ assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
+ "BinaryOperator and CastInst.");
return TE.ReorderIndices;
}
if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
@@ -5985,9 +5880,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::nullopt; // No need to reorder.
return std::move(Phis);
}
- if (TE.isGather() &&
- (!TE.hasState() || !TE.isAltShuffle() ||
- ScalarsInSplitNodes.contains(TE.getMainOp())) &&
+ if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
allSameType(TE.Scalars)) {
// TODO: add analysis of other gather nodes with extractelement
// instructions and other values/instructions, not only undefs.
@@ -6195,30 +6088,6 @@ bool BoUpSLP::isProfitableToReorder() const {
return true;
}
-void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
- ArrayRef<int> MaskOrder) {
- assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
- SmallVector<int> NewMask(getVectorFactor());
- SmallVector<int> NewMaskOrder(getVectorFactor());
- std::iota(NewMask.begin(), NewMask.end(), 0);
- std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
- if (Idx == 0) {
- copy(Mask, NewMask.begin());
- copy(MaskOrder, NewMaskOrder.begin());
- } else {
- assert(Idx == 1 && "Expected either 0 or 1 index.");
- unsigned Offset = CombinedEntriesWithIndices.back().second;
- for (unsigned I : seq<unsigned>(Mask.size())) {
- NewMask[I + Offset] = Mask[I] + Offset;
- NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
- }
- }
- reorderScalars(Scalars, NewMask);
- reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
- if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
- ReorderIndices.clear();
-}
-
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -6253,8 +6122,7 @@ void BoUpSLP::reorderTopToBottom() {
// Patterns like [fadd,fsub] can be combined into a single instruction in
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
// to take into account their order when looking for the most used order.
- if (TE->hasState() && TE->isAltShuffle() &&
- TE->State != TreeEntry::SplitVectorize) {
+ if (TE->hasState() && TE->isAltShuffle()) {
VectorType *VecTy =
getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
@@ -6295,8 +6163,7 @@ void BoUpSLP::reorderTopToBottom() {
}
VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
if (TE->State == TreeEntry::Vectorize &&
@@ -6327,8 +6194,7 @@ void BoUpSLP::reorderTopToBottom() {
for (const TreeEntry *OpTE : OrderedEntries) {
// No need to reorder this nodes, still need to extend and to use shuffle,
// just need to merge reordering shuffle and the reuse shuffle.
- if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
- OpTE->State != TreeEntry::SplitVectorize)
+ if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
continue;
// Count number of orders uses.
const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
@@ -6435,17 +6301,14 @@ void BoUpSLP::reorderTopToBottom() {
// Just do the reordering for the nodes with the given VF.
if (TE->Scalars.size() != VF) {
if (TE->ReuseShuffleIndices.size() == VF) {
- assert(TE->State != TreeEntry::SplitVectorize &&
- "Split vectorized not expected.");
// Need to reorder the reuses masks of the operands with smaller VF to
// be able to find the match between the graph nodes and scalar
// operands of the given node during vectorization/cost estimation.
- assert(
- (!TE->UserTreeIndex ||
- TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
- TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
- TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
- "All users must be of VF size.");
+ assert((!TE->UserTreeIndex ||
+ TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
+ TE->UserTreeIndex.UserTE->Scalars.size() ==
+ TE->Scalars.size()) &&
+ "All users must be of VF size.");
if (SLPReVec) {
assert(SLPReVec && "Only supported by REVEC.");
// ShuffleVectorInst does not do reorderOperands (and it should not
@@ -6462,28 +6325,19 @@ void BoUpSLP::reorderTopToBottom() {
// Update ordering of the operands with the smaller VF than the given
// one.
reorderNodeWithReuses(*TE, Mask);
- // Update orders in user split vectorize nodes.
- if (TE->UserTreeIndex &&
- TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
- TE->UserTreeIndex.UserTE->reorderSplitNode(
- TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
}
continue;
}
- if ((TE->State == TreeEntry::SplitVectorize &&
- TE->ReuseShuffleIndices.empty()) ||
- ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize) &&
- (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
- InsertElementInst>(TE->getMainOp()) ||
- (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
- assert(
- (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
- TE->ReuseShuffleIndices.empty())) &&
- "Alternate instructions are only supported by BinaryOperator "
- "and CastInst.");
- // Build correct orders for extract{element,value}, loads,
- // stores and alternate (split) nodes.
+ if ((TE->State == TreeEntry::Vectorize ||
+ TE->State == TreeEntry::StridedVectorize) &&
+ (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+ InsertElementInst>(TE->getMainOp()) ||
+ (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
+ assert(!TE->isAltShuffle() &&
+ "Alternate instructions are only supported by BinaryOperator "
+ "and CastInst.");
+ // Build correct orders for extract{element,value}, loads and
+ // stores.
reorderOrder(TE->ReorderIndices, Mask);
if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
TE->reorderOperands(Mask);
@@ -6504,11 +6358,6 @@ void BoUpSLP::reorderTopToBottom() {
addMask(NewReuses, TE->ReuseShuffleIndices);
TE->ReuseShuffleIndices.swap(NewReuses);
}
- // Update orders in user split vectorize nodes.
- if (TE->UserTreeIndex &&
- TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
- TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
- Mask, MaskOrder);
}
}
}
@@ -6521,8 +6370,7 @@ bool BoUpSLP::canReorderOperands(
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
(OpData.second->State == TreeEntry::Vectorize ||
- OpData.second->State == TreeEntry::StridedVectorize ||
- OpData.second->State == TreeEntry::SplitVectorize);
+ OpData.second->State == TreeEntry::StridedVectorize);
}))
continue;
if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -6536,7 +6384,6 @@ bool BoUpSLP::canReorderOperands(
// node, just reorder reuses mask.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
- TE->State != TreeEntry::SplitVectorize &&
TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
GatherOps.push_back(TE);
continue;
@@ -6546,7 +6393,6 @@ bool BoUpSLP::canReorderOperands(
[&Gather, UserTE, I](TreeEntry *TE) {
assert(TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
- TE->State != TreeEntry::SplitVectorize &&
"Only non-vectorized nodes are expected.");
if (TE->UserTreeIndex.UserTE == UserTE &&
TE->UserTreeIndex.EdgeIdx == I) {
@@ -6566,14 +6412,7 @@ bool BoUpSLP::canReorderOperands(
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
- struct TreeEntryCompare {
- bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
- if (LHS->UserTreeIndex && RHS->UserTreeIndex)
- return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
- return LHS->Idx < RHS->Idx;
- }
- };
- PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
+ SetVector<TreeEntry *> OrderedEntries;
DenseSet<const TreeEntry *> GathersToOrders;
// Find all reorderable leaf nodes with the given VF.
// Currently these are vectorized loads, extracts without alternate operands +
@@ -6581,15 +6420,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
SmallVector<TreeEntry *> NonVectorized;
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE->State != TreeEntry::Vectorize &&
- TE->State != TreeEntry::StridedVectorize &&
- TE->State != TreeEntry::SplitVectorize)
+ TE->State != TreeEntry::StridedVectorize)
NonVectorized.push_back(TE.get());
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
- Queue.push(TE.get());
+ OrderedEntries.insert(TE.get());
if (!(TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize) ||
+ TE->State == TreeEntry::StridedVectorize) ||
!TE->ReuseShuffleIndices.empty())
GathersToOrders.insert(TE.get());
}
@@ -6600,88 +6437,40 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// one operand order in the natural order and reorder others + reorder the
// user node itself.
SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
- while (!Queue.empty()) {
+ while (!OrderedEntries.empty()) {
// 1. Filter out only reordered nodes.
- std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
- TreeEntry *TE = Queue.top();
- const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
- Queue.pop();
- SmallVector<TreeEntry *> OrderedOps(1, TE);
- while (!Queue.empty()) {
- TE = Queue.top();
- if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
- break;
- Queue.pop();
- OrderedOps.push_back(TE);
- }
- for (TreeEntry *TE : OrderedOps) {
+ DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
+ SmallVector<TreeEntry *> Filtered;
+ for (TreeEntry *TE : OrderedEntries) {
if (!(TE->State == TreeEntry::Vectorize ||
TE->State == TreeEntry::StridedVectorize ||
- TE->State == TreeEntry::SplitVectorize ||
(TE->isGather() && GathersToOrders.contains(TE))) ||
!TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
- !Visited.insert(TE).second)
+ !Visited.insert(TE).second) {
+ Filtered.push_back(TE);
continue;
+ }
// Build a map between user nodes and their operands order to speedup
// search. The graph currently does not provide this dependency directly.
- Users.first = TE->UserTreeIndex.UserTE;
- Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
- }
- if (Users.first) {
- auto &Data = Users;
- if (Data.first->State == TreeEntry::SplitVectorize) {
- assert(
- Data.second.size() <= 2 &&
- "Expected not greater than 2 operands for split vectorize node.");
- if (any_of(Data.second,
- [](const auto &Op) { return !Op.second->UserTreeIndex; }))
- continue;
- // Update orders in user split vectorize nodes.
- assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
- "Expected exactly 2 entries.");
- for (const auto &P : Data.first->CombinedEntriesWithIndices) {
- TreeEntry &OpTE = *VectorizableTree[P.first].get();
- OrdersType Order = OpTE.ReorderIndices;
- if (Order.empty()) {
- if (!OpTE.isGather())
- continue;
- const auto BestOrder =
- getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
- if (!BestOrder || BestOrder->empty() || isIdentityOrder(*BestOrder))
- continue;
- Order = *BestOrder;
- }
- fixupOrderingIndices(Order);
- SmallVector<int> Mask;
- inversePermutation(Order, Mask);
- const unsigned E = Order.size();
- SmallVector<int> MaskOrder(E, PoisonMaskElem);
- transform(Order, MaskOrder.begin(), [E](unsigned I) {
- return I < E ? static_cast<int>(I) : PoisonMaskElem;
- });
- Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
- // Clear ordering of the operand.
- if (!OpTE.ReorderIndices.empty()) {
- OpTE.ReorderIndices.clear();
- } else {
- assert(OpTE.isGather() && "Expected only gather/buildvector node.");
- reorderScalars(OpTE.Scalars, Mask);
- }
- }
- if (Data.first->ReuseShuffleIndices.empty() &&
- !Data.first->ReorderIndices.empty()) {
- // Insert user node to the list to try to sink reordering deeper in
- // the graph.
- Queue.push(Data.first);
- }
- continue;
- }
+ Users[TE->UserTreeIndex.UserTE].emplace_back(TE->UserTreeIndex.EdgeIdx,
+ TE);
+ }
+ // Erase filtered entries.
+ for (TreeEntry *TE : Filtered)
+ OrderedEntries.remove(TE);
+ SmallVector<
+ std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
+ UsersVec(Users.begin(), Users.end());
+ sort(UsersVec, [](const auto &Data1, const auto &Data2) {
+ return Data1.first->Idx > Data2.first->Idx;
+ });
+ for (auto &Data : UsersVec) {
// Check that operands are used only in the User node.
SmallVector<TreeEntry *> GatherOps;
if (!canReorderOperands(Data.first, Data.second, NonVectorized,
GatherOps)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- Visited.insert(Op.second);
+ OrderedEntries.remove(Op.second);
continue;
}
// All operands are reordered and used only in this node - propagate the
@@ -6774,8 +6563,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
UTE->UserTreeIndex.UserTE == Data.first) ||
(Data.first->UserTreeIndex &&
Data.first->UserTreeIndex.UserTE == UTE) ||
- (IgnoreReorder && UTE->UserTreeIndex &&
- UTE->UserTreeIndex.UserTE->Idx == 0) ||
NodeShouldBeReorderedWithOperands(UTE);
}))
continue;
@@ -6789,7 +6576,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue;
const TreeEntry *Op = getOperandEntry(UTE, Idx);
Visited.erase(Op);
- Queue.push(const_cast<TreeEntry *>(Op));
+ OrderedEntries.insert(const_cast<TreeEntry *>(Op));
}
}
}
@@ -6846,7 +6633,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// the compile time.
// Profitable to reorder if definitely more operands allow
// reordering rather than those with natural order.
- ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
+ ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
if (static_cast<unsigned>(count_if(
Ops, [UserTE, &AllowsReordering](
const std::pair<unsigned, TreeEntry *> &Op) {
@@ -6858,7 +6645,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (OrdersUses.empty()) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- Visited.insert(Op.second);
+ OrderedEntries.remove(Op.second);
continue;
}
// Choose the most used order.
@@ -6888,7 +6675,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Set order of the user node.
if (isIdentityOrder(BestOrder)) {
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
- Visited.insert(Op.second);
+ OrderedEntries.remove(Op.second);
continue;
}
fixupOrderingIndices(BestOrder);
@@ -6903,6 +6690,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
});
for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
TreeEntry *TE = Op.second;
+ OrderedEntries.remove(TE);
if (!VisitedOps.insert(TE).second)
continue;
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
@@ -6912,7 +6700,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Gathers are processed separately.
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
- TE->State != TreeEntry::SplitVectorize &&
(TE->State != TreeEntry::ScatterVectorize ||
TE->ReorderIndices.empty()))
continue;
@@ -6933,7 +6720,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
continue;
}
reorderScalars(Gather->Scalars, Mask);
- Visited.insert(Gather);
+ OrderedEntries.remove(Gather);
}
// Reorder operands of the user node and set the ordering for the user
// node itself.
@@ -6953,7 +6740,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
!Data.first->isAltShuffle()) {
// Insert user node to the list to try to sink reordering deeper in
// the graph.
- Queue.push(Data.first);
+ OrderedEntries.insert(Data.first);
}
} else {
reorderOrder(Data.first->ReorderIndices, Mask);
@@ -6983,7 +6770,7 @@ void BoUpSLP::buildExternalUses(
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+ if (Entry->isGather())
continue;
// For each lane:
@@ -8592,48 +8379,6 @@ class PHIHandler {
};
} // namespace
-/// Returns main/alternate instructions for the given \p VL. Unlike
-/// getSameOpcode supports non-compatible instructions for better SplitVectorize
-/// node support.
-/// \returns first main/alt instructions, if only poisons and instruction with
-/// only 2 opcodes exists. Returns pair of nullptr otherwise.
-static std::pair<Instruction *, Instruction *>
-getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
- Instruction *MainOp = nullptr;
- Instruction *AltOp = nullptr;
- for (Value *V : VL) {
- if (isa<PoisonValue>(V))
- continue;
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return {};
- if (!MainOp) {
- MainOp = I;
- continue;
- }
- if (MainOp->getOpcode() == I->getOpcode()) {
- if (I->getParent() != MainOp->getParent())
- return {};
- continue;
- }
- if (!AltOp) {
- AltOp = I;
- continue;
- }
- if (AltOp->getOpcode() == I->getOpcode()) {
- if (I->getParent() != AltOp->getParent())
- return {};
- continue;
- }
- return {};
- }
- if (!AltOp)
- return {};
- assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
- "Expected
diff erent main and alt instructions.");
- return std::make_pair(MainOp, AltOp);
-}
-
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx,
unsigned InterleaveFactor) {
@@ -8784,146 +8529,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
- // Tries to build split node.
- constexpr unsigned SmallNodeSize = 4;
- auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
- const InstructionsState &LocalState) {
- if (VL.size() <= SmallNodeSize ||
- TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
- return false;
-
- // Any value is used in split node already - just gather.
- if (any_of(VL, [&](Value *V) {
- return ScalarsInSplitNodes.contains(V) || isVectorized(V);
- })) {
- if (TryToFindDuplicates(S))
- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- return true;
- }
- SmallVector<Value *> Op1, Op2;
- OrdersType ReorderIndices(VL.size(), VL.size());
- SmallBitVector Op1Indices(VL.size());
- for (auto [Idx, V] : enumerate(VL)) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- Op1.push_back(V);
- Op1Indices.set(Idx);
- continue;
- }
- InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
- if (NewS && !NewS.isAltShuffle()) {
- Op1.push_back(V);
- Op1Indices.set(Idx);
- continue;
- }
- Op2.push_back(V);
- }
- Type *ScalarTy = getValueType(VL.front());
- VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
- unsigned Opcode0 = LocalState.getOpcode();
- unsigned Opcode1 = LocalState.getAltOpcode();
- SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
- // Enable split node, only if all nodes do not form legal alternate
- // instruction (like X86 addsub).
- SmallPtrSet<Value *, 4> UOp1(Op1.begin(), Op1.end());
- SmallPtrSet<Value *, 4> UOp2(Op2.begin(), Op2.end());
- if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
- TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
- !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
- !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
- return false;
- // Enable split node, only if all nodes are power-of-2/full registers.
- unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
- for (unsigned Idx : seq<unsigned>(VL.size())) {
- if (Op1Indices.test(Idx)) {
- ReorderIndices[Op1Cnt] = Idx;
- ++Op1Cnt;
- } else {
- ReorderIndices[Op2Cnt] = Idx;
- ++Op2Cnt;
- }
- }
- if (isIdentityOrder(ReorderIndices))
- ReorderIndices.clear();
- SmallVector<int> Mask;
- if (!ReorderIndices.empty())
- inversePermutation(ReorderIndices, Mask);
- unsigned NumParts = TTI.getNumberOfParts(VecTy);
- VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
- VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
- // Check non-profitable single register ops, which better to be represented
- // as alternate ops.
- if (NumParts >= VL.size())
- return false;
- if ((LocalState.getMainOp()->isBinaryOp() &&
- LocalState.getAltOp()->isBinaryOp() &&
- (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
- LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
- (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
- (LocalState.getMainOp()->isUnaryOp() &&
- LocalState.getAltOp()->isUnaryOp())) {
- constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
- InstructionCost InsertCost = ::getShuffleCost(
- TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
- FixedVectorType *SubVecTy =
- getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
- InstructionCost NewShuffleCost =
- ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
- if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
- return false;
- InstructionCost OriginalVecOpsCost =
- TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
- TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
- SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
- for (unsigned Idx : seq<unsigned>(VL.size())) {
- if (isa<PoisonValue>(VL[Idx]))
- continue;
- OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
- }
- InstructionCost OriginalCost =
- OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
- VecTy, OriginalMask, Kind);
- InstructionCost NewVecOpsCost =
- TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
- TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
- InstructionCost NewCost =
- NewVecOpsCost + InsertCost +
- (VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::Store
- ? NewShuffleCost
- : 0);
- // If not profitable to split - exit.
- if (NewCost >= OriginalCost)
- return false;
- }
-
- SmallVector<Value *> NewVL(VL.size());
- copy(Op1, NewVL.begin());
- copy(Op2, std::next(NewVL.begin(), Op1.size()));
- auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt,
- LocalState, UserTreeIdx, {}, ReorderIndices);
- LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
- auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
- InstructionsState S = getSameOpcode(Op, *TLI);
- if (S && (isa<LoadInst>(S.getMainOp()) ||
- getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
- // Build gather node for loads, they will be gathered later.
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
- (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt, S,
- {TE, Idx});
- } else {
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
- buildTree_rec(Op, Depth, {TE, Idx});
- }
- };
- AddNode(Op1, 0);
- AddNode(Op2, 1);
- return true;
- };
-
// If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize.
@@ -9009,13 +8614,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
S.getMainOp()) &&
!all_of(VL, isVectorLikeInstWithConstOps)) ||
NotProfitableForVectorization(VL)) {
- if (!S) {
- auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
- // Last chance to try to vectorize alternate node.
- if (MainOp && AltOp &&
- TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
- return;
- }
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -9095,10 +8693,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
- // FIXME: investigate if there are profitable cases for VL.size() <= 4.
- if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S))
- return;
-
// Check that every instruction appears once in this bundle.
if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
return;
@@ -9131,10 +8725,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- // Last chance to try to vectorize alternate node.
- if (S.isAltShuffle() && ReuseShuffleIndices.empty() &&
- TrySplitNode(SmallNodeSize, S))
- return;
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
NonScheduledFirst.insert(VL.front());
@@ -9279,7 +8869,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->dump());
break;
case TreeEntry::CombinedVectorize:
- case TreeEntry::SplitVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
@@ -10457,69 +10046,6 @@ void BoUpSLP::transformNodes() {
reorderGatherNode(E);
}
- // Better to use full gathered loads analysis, if there are only 2 loads
- // gathered nodes each having less than 16 elements.
- constexpr unsigned VFLimit = 16;
- bool ForceLoadGather =
- count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Load &&
- TE->getVectorFactor() < VFLimit;
- }) == 2;
-
- // Checks if the scalars are used in other node.
- auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
- function_ref<bool(Value *)> CheckContainer) {
- return TE->isSame(VL) || all_of(VL, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return true;
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
- return is_contained(TE->Scalars, I) || CheckContainer(I);
- });
- };
- auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
- if (E.hasState()) {
- if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
- !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
- return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
- ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
- return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
- return is_contained(TEs, TE);
- });
- });
- }))
- return true;
- ;
- if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
- !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
- return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
- ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
- return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
- return is_contained(TEs, TE);
- });
- });
- }))
- return true;
- } else {
- // Check if the gather node full copy of split node.
- auto *It = find_if(E.Scalars, IsaPred<Instruction>);
- if (It != E.Scalars.end()) {
- if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
- !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
- return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
- ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
- return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
- return is_contained(TEs, TE);
- });
- });
- }))
- return true;
- }
- }
- return false;
- };
// The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
TreeEntry &E = *VectorizableTree[Idx];
@@ -10534,11 +10060,6 @@ void BoUpSLP::transformNodes() {
E.isAltShuffle() || !allSameBlock(VL)) ||
allConstant(VL) || isSplat(VL))
continue;
- if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
- continue;
- // Check if the node is a copy of other vector nodes.
- if (CheckForSameVectorNodes(E))
- continue;
// Try to find vectorizable sequences and transform them into a series of
// insertvector instructions.
unsigned StartIdx = 0;
@@ -11772,8 +11293,7 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
}
const auto *It = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
[&](const std::unique_ptr<TreeEntry> &TE) {
- return (TE->isGather() ||
- TE->State == TreeEntry::SplitVectorize) &&
+ return TE->isGather() &&
TE->UserTreeIndex.EdgeIdx == Idx &&
TE->UserTreeIndex.UserTE == E;
});
@@ -11831,32 +11351,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return processBuildVector<ShuffleCostEstimator, InstructionCost>(
E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- if (E->State == TreeEntry::SplitVectorize) {
- assert(E->CombinedEntriesWithIndices.size() == 2 &&
- "Expected exactly 2 combined entries.");
- assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
- InstructionCost VectorCost = 0;
- if (E->ReorderIndices.empty()) {
- VectorCost = ::getShuffleCost(
- *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
- E->CombinedEntriesWithIndices.back().second,
- getWidenedType(
- ScalarTy,
- VectorizableTree[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor()));
- } else {
- unsigned CommonVF =
- std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
- ->getVectorFactor(),
- VectorizableTree[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor());
- VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
- getWidenedType(ScalarTy, CommonVF),
- E->getSplitMask(), CostKind);
- }
- LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
- return VectorCost;
- }
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
@@ -11938,8 +11432,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
EI.EdgeIdx != 0) {
auto UserBWIt = MinBWs.find(EI.UserTE);
Type *UserScalarTy =
- (EI.UserTE->isGather() ||
- EI.UserTE->State == TreeEntry::SplitVectorize)
+ EI.UserTE->isGather()
? EI.UserTE->Scalars.front()->getType()
: EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
if (UserBWIt != MinBWs.end())
@@ -12442,7 +11935,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
break;
}
case TreeEntry::CombinedVectorize:
- case TreeEntry::SplitVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected vectorization state.");
}
@@ -12939,8 +12431,6 @@ bool BoUpSLP::isTreeNotExtendable() const {
bool Res = false;
for (unsigned Idx : seq<unsigned>(getTreeSize())) {
TreeEntry &E = *VectorizableTree[Idx];
- if (E.State == TreeEntry::SplitVectorize)
- return false;
if (!E.isGather())
continue;
if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
@@ -13366,8 +12856,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
continue;
}
- if (TE.hasState() &&
- (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
+ if (TE.isGather() && TE.hasState()) {
if (const TreeEntry *E =
getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
E && E->getVectorFactor() == TE.getVectorFactor()) {
@@ -14131,19 +13620,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
break;
VToTEs.insert(TEPtr);
}
- if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
- const TreeEntry *VTE = VTEs.front();
- if (none_of(TE->CombinedEntriesWithIndices,
- [&](const auto &P) { return P.first == VTE->Idx; })) {
- Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
- if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
- continue;
- }
- // The node is reused - exit.
- if (CheckAndUseSameNode(VTE))
- break;
- VToTEs.insert(VTE);
- }
if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
const TreeEntry *VTE = VTEs.front();
if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
@@ -14697,7 +14173,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
assert(((GatheredLoadsEntriesFirst.has_value() &&
E->getOpcode() == Instruction::Load && E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) ||
- E->State == TreeEntry::SplitVectorize ||
all_of(E->Scalars,
[=](Value *V) -> bool {
if (E->getOpcode() == Instruction::GetElementPtr &&
@@ -14723,7 +14198,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
}
assert(((E->getOpcode() == Instruction::GetElementPtr &&
!isa<GetElementPtrInst>(I)) ||
- E->State == TreeEntry::SplitVectorize ||
(isVectorLikeInstWithConstOps(LastInst) &&
isVectorLikeInstWithConstOps(I)) ||
(GatheredLoadsEntriesFirst.has_value() &&
@@ -14785,14 +14259,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return FirstInst;
};
- if (E->State == TreeEntry::SplitVectorize) {
- Res = FindLastInst();
- return *Res;
- }
-
// Set insertpoint for gathered loads to the very first load.
- if (E->State != TreeEntry::SplitVectorize &&
- GatheredLoadsEntriesFirst.has_value() &&
+ if (GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
E->getOpcode() == Instruction::Load) {
Res = FindFirstInst();
@@ -14871,10 +14339,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
bool IsPHI = isa<PHINode>(LastInst);
if (IsPHI)
LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
- if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars)) ||
- (GatheredLoadsEntriesFirst.has_value() &&
- E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
- E->getOpcode() == Instruction::Load)) {
+ if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
} else {
// Set the insertion point after the last instruction in the bundle. Set the
@@ -15680,9 +15145,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
// correctness of the transformations in many cases.
auto *I = find_if(ArrayRef(VectorizableTree).drop_front(E->Idx + 1),
[E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isOperandGatherNode({E, NodeIdx}) ||
- (TE->State == TreeEntry::SplitVectorize &&
- TE->UserTreeIndex == EdgeInfo(E, NodeIdx));
+ return TE->isOperandGatherNode({E, NodeIdx});
});
assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
assert(I->get()->UserTreeIndex &&
@@ -16220,83 +15683,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = Vec;
return Vec;
}
- if (E->State == TreeEntry::SplitVectorize) {
- assert(E->CombinedEntriesWithIndices.size() == 2 &&
- "Expected exactly 2 combined entries.");
- setInsertPointAfterBundle(E);
- TreeEntry &OpTE1 =
- *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get();
- assert(OpTE1.isSame(
- ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
- "Expected same first part of scalars.");
- Value *Op1 = vectorizeTree(&OpTE1);
- TreeEntry &OpTE2 =
- *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get();
- assert(
- OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
- "Expected same second part of scalars.");
- Value *Op2 = vectorizeTree(&OpTE2);
- auto GetOperandSignedness = [&](const TreeEntry *OpE) {
- bool IsSigned = false;
- auto It = MinBWs.find(OpE);
- if (It != MinBWs.end())
- IsSigned = It->second.second;
- else
- IsSigned = any_of(OpE->Scalars, [&](Value *R) {
- if (isa<PoisonValue>(V))
- return false;
- return !isKnownNonNegative(R, SimplifyQuery(*DL));
- });
- return IsSigned;
- };
- if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy) {
- assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
- Op1 = Builder.CreateIntCast(
- Op1,
- getWidenedType(
- ScalarTy,
- cast<FixedVectorType>(Op1->getType())->getNumElements()),
- GetOperandSignedness(&OpTE1));
- }
- if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy) {
- assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
- Op2 = Builder.CreateIntCast(
- Op2,
- getWidenedType(
- ScalarTy,
- cast<FixedVectorType>(Op2->getType())->getNumElements()),
- GetOperandSignedness(&OpTE2));
- }
- if (E->ReorderIndices.empty()) {
- SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
- std::iota(
- Mask.begin(),
- std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
- 0);
- Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
- Vec = createInsertVector(Builder, Vec, Op2,
- E->CombinedEntriesWithIndices.back().second);
- E->VectorizedValue = Vec;
- return Vec;
- }
- unsigned CommonVF =
- std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
- if (getNumElements(Op1->getType()) != CommonVF) {
- SmallVector<int> Mask(CommonVF, PoisonMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
- 0);
- Op1 = Builder.CreateShuffleVector(Op1, Mask);
- }
- if (getNumElements(Op2->getType()) != CommonVF) {
- SmallVector<int> Mask(CommonVF, PoisonMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
- 0);
- Op2 = Builder.CreateShuffleVector(Op2, Mask);
- }
- Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
- E->VectorizedValue = Vec;
- return Vec;
- }
bool IsReverseOrder =
!E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
@@ -17752,7 +17138,7 @@ Value *BoUpSLP::vectorizeTree(
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+ if (Entry->isGather())
continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value");
@@ -17805,9 +17191,6 @@ Value *BoUpSLP::vectorizeTree(
VectorizableTree.front().get()) ||
(IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
- !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
- IE->UserTreeIndex &&
- is_contained(VectorizableTree.front()->Scalars, I)) &&
!(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst &&
VectorizableTree.front()->isGather() &&
@@ -18845,13 +18228,6 @@ bool BoUpSLP::collectValuesToDemote(
ToDemote.push_back(E.Idx);
return IsProfitableToDemote;
};
-
- if (E.State == TreeEntry::SplitVectorize)
- return TryProcessInstruction(
- BitWidth,
- {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
- VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
-
switch (E.getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 4b5a7c207055a..4ca00f2daf97a 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -12,13 +12,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
@@ -26,28 +20,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_readfirstlane_b32 s55, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
@@ -55,27 +37,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_readfirstlane_b32 s55, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: v_writelane_b32 v1, s55, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s32, 0x4040
@@ -85,16 +57,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v1, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
@@ -104,13 +70,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v1, s55, 0
; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -120,54 +80,34 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s55, s0
+; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v1, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: s_movk_i32 s55, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s55, v0
-; GFX8-NEXT: v_readfirstlane_b32 s55, v0
+; GFX8-NEXT: s_movk_i32 s59, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v1, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -175,52 +115,34 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: v_readfirstlane_b32 s55, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v1, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s32, 0x4040
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v1, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
  %alloca1 = alloca i32, align 4, addrspace(5)
  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
-  call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
  ret void
}
@@ -230,65 +152,36 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_1-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT: s_addk_i32 s59, 0x4040
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55
+; GFX10_1-NEXT: ; use s59
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
-; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_3-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT: s_addk_i32 s59, 0x4040
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55
+; GFX10_3-NEXT: ; use s59
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
-; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v1, s55, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
@@ -296,16 +189,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55
+; GFX11-NEXT: ; use s59
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v1, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
-; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc:
@@ -315,110 +202,67 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v1, s55, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s55, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55
+; GFX12-NEXT: ; use s59
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v1, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
-; GFX8-NEXT: s_lshr_b32 s55, s32, 6
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: s_addk_i32 s55, 0x4040
+; GFX8-NEXT: s_lshr_b32 s59, s32, 6
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: s_addk_i32 s59, 0x4040
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55
+; GFX8-NEXT: ; use s59
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v1, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: s_lshr_b32 s55, s32, 6
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT: s_addk_i32 s55, 0x4040
+; GFX900-NEXT: s_lshr_b32 s59, s32, 6
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_addk_i32 s59, 0x4040
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55
+; GFX900-NEXT: ; use s59
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v1, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_dead_scc:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: s_add_i32 s0, s32, 0x4040
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55
+; GFX942-NEXT: ; use s59
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v1, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4044
-; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
  %alloca1 = alloca i32, align 4, addrspace(5)
  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
-  call void asm sideeffect "; use $0", "{s55}"(ptr addrspace(5) %alloca1)
+  call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca1)
  ret void
}
@@ -428,14 +272,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_1-NEXT: s_mov_b32 s5, s33
; GFX10_1-NEXT: s_mov_b32 s33, s32
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880
-; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: s_mov_b32 s32, s33
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
@@ -443,19 +281,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT: s_mov_b32 s33, s5
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_readfirstlane_b32 s55, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880
-; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_mov_b32 s33, s5
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -463,13 +294,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_3-NEXT: s_mov_b32 s5, s33
; GFX10_3-NEXT: s_mov_b32 s33, s32
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880
-; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: s_mov_b32 s32, s33
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
@@ -477,18 +303,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT: s_mov_b32 s33, s5
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_readfirstlane_b32 s55, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880
-; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_mov_b32 s33, s5
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -496,13 +316,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s1, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4044
-; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0x4080
; GFX11-NEXT: s_add_i32 s0, s33, 64
-; GFX11-NEXT: v_writelane_b32 v1, s55, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_addc_u32 s0, s33, 0x4040
@@ -511,18 +327,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_mov_b32 s33, s1
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v1, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4044
-; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_mov_b32 s33, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -534,13 +343,9 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, s33
; GFX12-NEXT: s_mov_b32 s33, s32
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v1, s55, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s33
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -550,18 +355,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s55, s0
+; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v1, 0
; GFX12-NEXT: s_mov_b32 s32, s33
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_mov_b32 s33, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -570,33 +369,22 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s7, s33, 0x101100
-; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
-; GFX8-NEXT: s_movk_i32 s55, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s55, v0
+; GFX8-NEXT: s_movk_i32 s59, 0x4040
; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT: v_readfirstlane_b32 s55, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v1, 0
; GFX8-NEXT: s_mov_b32 s32, s33
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s7, s33, 0x101100
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -604,32 +392,21 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_mov_b32 s6, s33
; GFX900-NEXT: s_mov_b32 s33, s32
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s7, s33, 0x101100
-; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
-; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: v_readfirstlane_b32 s55, v0
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v1, 0
; GFX900-NEXT: s_mov_b32 s32, s33
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s7, s33, 0x101100
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: s_mov_b32 s33, s6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -637,10 +414,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b32 s2, s33
; GFX942-NEXT: s_mov_b32 s33, s32
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s3, s33, 0x4044
-; GFX942-NEXT: scratch_store_dword off, v1, s3 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_addk_i32 s32, 0x4080
; GFX942-NEXT: s_add_i32 s0, s33, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
@@ -648,27 +421,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX942-NEXT: s_addc_u32 s0, s33, 0x4040
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; use alloca0 v0
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v1, 0
; GFX942-NEXT: s_mov_b32 s32, s33
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s3, s33, 0x4044
-; GFX942-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_mov_b32 s33, s2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
  %alloca1 = alloca i32, align 4, addrspace(5)
  call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
-  call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
  ret void
}
@@ -676,75 +442,39 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
-; GFX10_1-NEXT: v_readfirstlane_b32 s55, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
-; GFX10_3-NEXT: v_readfirstlane_b32 s55, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v0, s55, 0
; GFX11-NEXT: s_addc_u32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v0, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
@@ -754,97 +484,53 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
-; GFX12-NEXT: s_mov_b32 s55, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: s_mov_b32 s59, s32
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s55, v0, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v0, s55, 0
-; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
-; GFX8-NEXT: s_mov_b32 s55, 64
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s55, v1
-; GFX8-NEXT: v_readfirstlane_b32 s55, v1
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: s_mov_b32 s59, 64
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v0, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32
-; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
-; GFX900-NEXT: v_writelane_b32 v0, s55, 0
-; GFX900-NEXT: v_readfirstlane_b32 s55, v1
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v0, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s32, 64
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v0, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
-  call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
  ret void
}
@@ -852,67 +538,32 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_1-NEXT: s_add_i32 s55, s55, 64
+; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT: s_add_i32 s59, s59, 64
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55
+; GFX10_1-NEXT: ; use s59
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_3-NEXT: s_add_i32 s55, s55, 64
+; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT: s_add_i32 s59, s59, 64
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55
+; GFX10_3-NEXT: ; use s59
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
-; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v0, s55, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55
+; GFX11-NEXT: ; use s59
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v0, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
-; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
@@ -922,88 +573,44 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
-; GFX12-NEXT: s_mov_b32 s55, s32
+; GFX12-NEXT: s_mov_b32 s59, s32
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55
+; GFX12-NEXT: ; use s59
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readlane_b32 s55, v0, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v0, s55, 0
-; GFX8-NEXT: s_lshr_b32 s55, s32, 6
-; GFX8-NEXT: s_add_i32 s55, s55, 64
+; GFX8-NEXT: s_lshr_b32 s59, s32, 6
+; GFX8-NEXT: s_add_i32 s59, s59, 64
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55
+; GFX8-NEXT: ; use s59
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v0, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v0, s55, 0
-; GFX900-NEXT: s_lshr_b32 s55, s32, 6
-; GFX900-NEXT: s_add_i32 s55, s55, 64
+; GFX900-NEXT: s_lshr_b32 s59, s32, 6
+; GFX900-NEXT: s_add_i32 s59, s59, 64
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55
+; GFX900-NEXT: ; use s59
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v0, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_add_i32 s0, s32, 64
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55
+; GFX942-NEXT: ; use s59
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v0, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x4040
-; GFX942-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
-  call void asm sideeffect "; use $0", "{s55}"(ptr addrspace(5) %alloca0)
+  call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
  ret void
}
@@ -1013,29 +620,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_1-NEXT: s_mov_b32 s5, s33
; GFX10_1-NEXT: s_mov_b32 s33, s32
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: s_mov_b32 s32, s33
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
-; GFX10_1-NEXT: v_readfirstlane_b32 s55, v1
+; GFX10_1-NEXT: s_mov_b32 s33, s5
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_mov_b32 s33, s5
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1043,27 +637,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_3-NEXT: s_mov_b32 s5, s33
; GFX10_3-NEXT: s_mov_b32 s33, s32
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: s_mov_b32 s32, s33
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
-; GFX10_3-NEXT: v_readfirstlane_b32 s55, v1
+; GFX10_3-NEXT: s_mov_b32 s33, s5
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_mov_b32 s33, s5
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1071,29 +654,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s1, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: s_addk_i32 s32, 0x4040
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_writelane_b32 v0, s55, 0
; GFX11-NEXT: s_addc_u32 s0, s33, 64
; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
; GFX11-NEXT: s_bitset0_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s55, s0
+; GFX11-NEXT: s_mov_b32 s33, s1
+; GFX11-NEXT: s_mov_b32 s59, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v0, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_mov_b32 s33, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1105,25 +676,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, s33
; GFX12-NEXT: s_mov_b32 s33, s32
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
-; GFX12-NEXT: s_mov_b32 s55, s33
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s59, s33
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v0, 0
; GFX12-NEXT: s_mov_b32 s32, s33
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_mov_b32 s33, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -1132,28 +693,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s6, s33
; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s7, s33, 0x101000
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v0, s55, 0
-; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GFX8-NEXT: s_mov_b32 s55, 64
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s55, v1
-; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT: v_readfirstlane_b32 s55, v1
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX8-NEXT: s_mov_b32 s59, 64
+; GFX8-NEXT: s_add_i32 s32, s32, 0x101000
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v0, 0
; GFX8-NEXT: s_mov_b32 s32, s33
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s7, s33, 0x101000
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1161,27 +711,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_mov_b32 s6, s33
; GFX900-NEXT: s_mov_b32 s33, s32
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s7, s33, 0x101000
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
-; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT: v_writelane_b32 v0, s55, 0
-; GFX900-NEXT: v_readfirstlane_b32 s55, v1
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT: s_add_i32 s32, s32, 0x101000
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v0, 0
; GFX900-NEXT: s_mov_b32 s32, s33
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s7, s33, 0x101000
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: s_mov_b32 s33, s6
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1189,31 +728,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b32 s2, s33
; GFX942-NEXT: s_mov_b32 s33, s32
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s3, s33, 0x4040
-; GFX942-NEXT: scratch_store_dword off, v0, s3 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_addk_i32 s32, 0x4080
+; GFX942-NEXT: s_addk_i32 s32, 0x4040
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: s_addc_u32 s0, s33, 64
; GFX942-NEXT: s_bitcmp1_b32 s0, 0
; GFX942-NEXT: s_bitset0_b32 s0, 0
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s0
+; GFX942-NEXT: s_mov_b32 s59, s0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v0, 0
; GFX942-NEXT: s_mov_b32 s32, s33
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s3, s33, 0x4040
-; GFX942-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
; GFX942-NEXT: s_mov_b32 s33, s2
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
-  call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+  call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
  ret void
}
@@ -1223,27 +751,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_1-NEXT: s_mov_b32 s4, s33
; GFX10_1-NEXT: s_mov_b32 s33, s32
-; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s5
-; GFX10_1-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
-; GFX10_1-NEXT: s_lshr_b32 s55, s33, 5
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0x80800
+; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5
; GFX10_1-NEXT: s_mov_b32 s32, s33
-; GFX10_1-NEXT: s_add_i32 s55, s55, 64
+; GFX10_1-NEXT: s_add_i32 s59, s59, 64
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55
+; GFX10_1-NEXT: ; use s59
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1
-; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s5
; GFX10_1-NEXT: s_mov_b32 s33, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1251,25 +766,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_3-NEXT: s_mov_b32 s4, s33
; GFX10_3-NEXT: s_mov_b32 s33, s32
-; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s5
-; GFX10_3-NEXT: v_writelane_b32 v0, s55, 0
-; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
-; GFX10_3-NEXT: s_lshr_b32 s55, s33, 5
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0x80800
+; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5
; GFX10_3-NEXT: s_mov_b32 s32, s33
-; GFX10_3-NEXT: s_add_i32 s55, s55, 64
+; GFX10_3-NEXT: s_add_i32 s59, s59, 64
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55
+; GFX10_3-NEXT: ; use s59
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v0, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1
-; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
-; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s5
; GFX10_3-NEXT: s_mov_b32 s33, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1277,25 +781,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v0, s55, 0
-; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: s_addk_i32 s32, 0x4040
; GFX11-NEXT: s_add_i32 s1, s33, 64
; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: s_mov_b32 s55, s1
+; GFX11-NEXT: s_mov_b32 s59, s1
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55
+; GFX11-NEXT: ; use s59
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v0, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
-; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1307,24 +800,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_mov_b32 s33, s32
-; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_writelane_b32 v0, s55, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
-; GFX12-NEXT: s_mov_b32 s55, s33
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s59, s33
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55
+; GFX12-NEXT: ; use s59
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_mov_b32 s32, s33
-; GFX12-NEXT: v_readlane_b32 s55, v0, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -1333,25 +816,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, s33
; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; GFX8-NEXT: s_add_i32 s5, s33, 0x101000
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX8-NEXT: v_writelane_b32 v0, s55, 0
-; GFX8-NEXT: s_lshr_b32 s55, s33, 6
-; GFX8-NEXT: s_add_i32 s55, s55, 64
+; GFX8-NEXT: s_add_i32 s32, s32, 0x101000
+; GFX8-NEXT: s_lshr_b32 s59, s33, 6
+; GFX8-NEXT: s_add_i32 s59, s59, 64
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55
+; GFX8-NEXT: ; use s59
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v0, 0
; GFX8-NEXT: s_mov_b32 s32, s33
-; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; GFX8-NEXT: s_add_i32 s5, s33, 0x101000
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
; GFX8-NEXT: s_mov_b32 s33, s4
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1359,25 +831,14 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_mov_b32 s4, s33
; GFX900-NEXT: s_mov_b32 s33, s32
-; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; GFX900-NEXT: s_add_i32 s5, s33, 0x101000
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[6:7]
-; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
-; GFX900-NEXT: v_writelane_b32 v0, s55, 0
-; GFX900-NEXT: s_lshr_b32 s55, s33, 6
-; GFX900-NEXT: s_add_i32 s55, s55, 64
+; GFX900-NEXT: s_add_i32 s32, s32, 0x101000
+; GFX900-NEXT: s_lshr_b32 s59, s33, 6
+; GFX900-NEXT: s_add_i32 s59, s59, 64
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55
+; GFX900-NEXT: ; use s59
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v0, 0
; GFX900-NEXT: s_mov_b32 s32, s33
-; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; GFX900-NEXT: s_add_i32 s5, s33, 0x101000
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[6:7]
; GFX900-NEXT: s_mov_b32 s33, s4
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1385,28 +846,17 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_mov_b32 s0, s33
; GFX942-NEXT: s_mov_b32 s33, s32
-; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT: s_add_i32 s1, s33, 0x4040
-; GFX942-NEXT: scratch_store_dword off, v0, s1 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[2:3]
-; GFX942-NEXT: s_addk_i32 s32, 0x4080
+; GFX942-NEXT: s_addk_i32 s32, 0x4040
; GFX942-NEXT: s_add_i32 s1, s33, 64
-; GFX942-NEXT: v_writelane_b32 v0, s55, 0
-; GFX942-NEXT: s_mov_b32 s55, s1
+; GFX942-NEXT: s_mov_b32 s59, s1
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55
+; GFX942-NEXT: ; use s59
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v0, 0
; GFX942-NEXT: s_mov_b32 s32, s33
-; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT: s_add_i32 s1, s33, 0x4040
-; GFX942-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b32 s33, s0
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
- call void asm sideeffect "; use $0", "{s55}"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
ret void
}
@@ -1414,83 +864,48 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_1-NEXT: s_add_i32 s55, s4, 0x442c
+; GFX10_1-NEXT: s_add_i32 s59, s4, 0x442c
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_3-NEXT: s_add_i32 s55, s4, 0x442c
+; GFX10_3-NEXT: s_add_i32 s59, s4, 0x442c
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v1, s55, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_add_i32 s55, s32, 0x442c
+; GFX11-NEXT: s_add_i32 s59, s32, 0x442c
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v1, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
@@ -1500,38 +915,23 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: v_writelane_b32 v1, s55, 0
-; GFX12-NEXT: s_add_co_i32 s55, s32, 0x43ec
+; GFX12-NEXT: s_add_co_i32 s59, s32, 0x43ec
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v1, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_lshr_b32 s4, s32, 6
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
-; GFX8-NEXT: s_add_i32 s55, s4, 0x442c
+; GFX8-NEXT: s_add_i32 s59, s4, 0x442c
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: ;;#ASMSTART
@@ -1539,26 +939,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v1, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: s_lshr_b32 s4, s32, 6
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
-; GFX900-NEXT: s_add_i32 s55, s4, 0x442c
+; GFX900-NEXT: s_add_i32 s59, s4, 0x442c
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -1566,25 +955,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v1, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT: scratch_store_dword off, v1, s2 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_add_i32 s55, s32, 0x442c
+; GFX942-NEXT: s_add_i32 s59, s32, 0x442c
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: ;;#ASMSTART
@@ -1592,20 +970,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v1, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca [4096 x i32], align 4, addrspace(5)
%alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 251
call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
- call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
ret void
}
@@ -1613,89 +985,54 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX10_1: ; %bb.0:
; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_1-NEXT: s_add_i32 s55, s55, s4
+; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT: s_add_i32 s59, s59, s4
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_1-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_1-NEXT: s_addk_i32 s59, 0x4040
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s55, scc
+; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: s_waitcnt vmcnt(0)
; GFX10_1-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX10_3: ; %bb.0:
; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2
-; GFX10_3-NEXT: s_lshr_b32 s55, s32, 5
-; GFX10_3-NEXT: s_add_i32 s55, s55, s4
+; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT: s_add_i32 s59, s59, s4
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
-; GFX10_3-NEXT: s_addk_i32 s55, 0x4040
+; GFX10_3-NEXT: s_addk_i32 s59, 0x4040
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s55, scc
+; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_readlane_b32 s55, v1, 0
-; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
-; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
-; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: s_waitcnt vmcnt(0)
; GFX10_3-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX11-NEXT: s_add_i32 s2, s32, 0x8040
-; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_add_i32 s1, s32, 64
-; GFX11-NEXT: v_writelane_b32 v1, s55, 0
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-NEXT: s_add_i32 s55, s32, s0
+; GFX11-NEXT: s_add_i32 s59, s32, s0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_addk_i32 s55, 0x4040
+; GFX11-NEXT: s_addk_i32 s59, 0x4040
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s55, scc
+; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: v_readlane_b32 s55, v1, 0
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
-; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
@@ -1705,44 +1042,29 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
-; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_writelane_b32 v1, s55, 0
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_add_co_i32 s55, s32, s0
+; GFX12-NEXT: s_add_co_i32 s59, s32, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_addk_co_i32 s55, 0x4000
+; GFX12-NEXT: s_addk_co_i32 s59, 0x4000
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s55, scc
+; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: v_readlane_b32 s55, v1, 0
-; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v1, s55, 0
; GFX8-NEXT: s_lshl_b32 s4, s16, 2
-; GFX8-NEXT: s_lshr_b32 s55, s32, 6
-; GFX8-NEXT: s_add_i32 s55, s55, s4
-; GFX8-NEXT: s_addk_i32 s55, 0x4040
+; GFX8-NEXT: s_lshr_b32 s59, s32, 6
+; GFX8-NEXT: s_add_i32 s59, s59, s4
+; GFX8-NEXT: s_addk_i32 s59, 0x4040
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
; GFX8-NEXT: ;;#ASMSTART
@@ -1750,28 +1072,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s55, scc
+; GFX8-NEXT: ; use s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v1, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v1, s55, 0
; GFX900-NEXT: s_lshl_b32 s4, s16, 2
-; GFX900-NEXT: s_lshr_b32 s55, s32, 6
-; GFX900-NEXT: s_add_i32 s55, s55, s4
-; GFX900-NEXT: s_addk_i32 s55, 0x4040
+; GFX900-NEXT: s_lshr_b32 s59, s32, 6
+; GFX900-NEXT: s_add_i32 s59, s59, s4
+; GFX900-NEXT: s_addk_i32 s59, 0x4040
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
; GFX900-NEXT: ;;#ASMSTART
@@ -1779,27 +1090,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s55, scc
+; GFX900-NEXT: ; use s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v1, 0
-; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_xor_saveexec_b64 s[2:3], -1
-; GFX942-NEXT: s_add_i32 s1, s32, 0x8040
-; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill
-; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-NEXT: v_writelane_b32 v1, s55, 0
-; GFX942-NEXT: s_add_i32 s55, s32, s0
-; GFX942-NEXT: s_addk_i32 s55, 0x4040
+; GFX942-NEXT: s_add_i32 s59, s32, s0
+; GFX942-NEXT: s_addk_i32 s59, 0x4040
; GFX942-NEXT: s_add_i32 s0, s32, 64
; GFX942-NEXT: v_mov_b32_e32 v0, s0
; GFX942-NEXT: ;;#ASMSTART
@@ -1807,20 +1107,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_and_b64 s[0:1], 0, exec
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s55, scc
+; GFX942-NEXT: ; use s59, scc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_readlane_b32 s55, v1, 0
-; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX942-NEXT: s_add_i32 s2, s32, 0x8040
-; GFX942-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
-; GFX942-NEXT: s_mov_b64 exec, s[0:1]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca [4096 x i32], align 4, addrspace(5)
%alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 %soffset
call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
- call void asm sideeffect "; use $0, $1", "{s55},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 17581bcb61e99..e8dacc93a8f3c 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -67,11 +67,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX7-NEXT: v_mov_b32_e32 v0, 0x4040
; GFX7-NEXT: v_mad_u32_u24 v0, v0, 64, s32
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 6, v0
-; GFX7-NEXT: v_readfirstlane_b32 s54, v0
+; GFX7-NEXT: v_readfirstlane_b32 s59, v0
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: v_readlane_b32 s55, v23, 16
; GFX7-NEXT: v_readlane_b32 s54, v23, 15
@@ -133,13 +133,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_mov_b32_e32 v0, 0x4040
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 64, s32
-; GFX8-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s54, v0
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_readlane_b32 s55, v23, 16
; GFX8-NEXT: v_readlane_b32 s54, v23, 15
@@ -200,13 +199,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
-; GFX900-NEXT: v_readfirstlane_b32 s54, v0
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s55, v23, 16
; GFX900-NEXT: v_readlane_b32 s54, v23, 15
@@ -265,13 +263,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_addc_u32 s59, s32, 0x4040
-; GFX942-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX942-NEXT: s_bitcmp1_b32 s59, 0
-; GFX942-NEXT: s_bitset0_b32 s59, 0
-; GFX942-NEXT: s_mov_b32 s54, s59
+; GFX942-NEXT: s_addc_u32 s60, s32, 0x4040
+; GFX942-NEXT: s_bitcmp1_b32 s60, 0
+; GFX942-NEXT: s_bitset0_b32 s60, 0
+; GFX942-NEXT: s_mov_b32 s59, s60
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s55, v23, 16
; GFX942-NEXT: v_readlane_b32 s54, v23, 15
@@ -332,11 +329,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_lshrrev_b32_e64 v24, 5, s32
-; GFX10_1-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX10_1-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24
-; GFX10_1-NEXT: v_readfirstlane_b32 s54, v24
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v24
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_readlane_b32 s55, v23, 16
; GFX10_1-NEXT: v_readlane_b32 s54, v23, 15
@@ -397,11 +393,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_lshrrev_b32_e64 v24, 5, s32
-; GFX10_3-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX10_3-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24
-; GFX10_3-NEXT: v_readfirstlane_b32 s54, v24
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v24
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_readlane_b32 s55, v23, 16
; GFX10_3-NEXT: v_readlane_b32 s54, v23, 15
@@ -461,14 +456,13 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_addc_u32 s59, s32, 0x4040
-; GFX11-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX11-NEXT: s_addc_u32 s60, s32, 0x4040
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_bitcmp1_b32 s59, 0
-; GFX11-NEXT: s_bitset0_b32 s59, 0
-; GFX11-NEXT: s_mov_b32 s54, s59
+; GFX11-NEXT: s_bitcmp1_b32 s60, 0
+; GFX11-NEXT: s_bitset0_b32 s60, 0
+; GFX11-NEXT: s_mov_b32 s59, s60
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_readlane_b32 s55, v23, 16
; GFX11-NEXT: v_readlane_b32 s54, v23, 15
@@ -530,15 +524,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_add_co_ci_u32 s59, s32, 0x4000
-; GFX12-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX12-NEXT: s_add_co_ci_u32 s60, s32, 0x4000
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_bitcmp1_b32 s59, 0
-; GFX12-NEXT: s_bitset0_b32 s59, 0
+; GFX12-NEXT: s_bitcmp1_b32 s60, 0
+; GFX12-NEXT: s_bitset0_b32 s60, 0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s54, s59
+; GFX12-NEXT: s_mov_b32 s59, s60
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: v_readlane_b32 s55, v23, 16
; GFX12-NEXT: v_readlane_b32 s54, v23, 15
@@ -586,7 +579,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; scc is unavailable since it is live in
call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
- "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s54},{scc}"(
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
<16 x i32> %s0,
<16 x i32> %s1,
<16 x i32> %s2,
@@ -636,9 +629,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22
-; GFX7-NEXT: v_readfirstlane_b32 s54, v22
+; GFX7-NEXT: v_readfirstlane_b32 s59, v22
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: v_readlane_b32 s55, v21, 16
; GFX7-NEXT: v_readlane_b32 s54, v21, 15
@@ -693,11 +686,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32
-; GFX8-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22
-; GFX8-NEXT: v_readfirstlane_b32 s54, v22
+; GFX8-NEXT: v_readfirstlane_b32 s59, v22
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: v_readlane_b32 s55, v21, 16
; GFX8-NEXT: v_readlane_b32 s54, v21, 15
@@ -752,11 +744,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_lshrrev_b32_e64 v22, 6, s32
-; GFX900-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX900-NEXT: v_add_u32_e32 v22, 16, v22
-; GFX900-NEXT: v_readfirstlane_b32 s54, v22
+; GFX900-NEXT: v_readfirstlane_b32 s59, v22
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_readlane_b32 s55, v21, 16
; GFX900-NEXT: v_readlane_b32 s54, v21, 15
@@ -810,13 +801,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_addc_u32 s59, s32, 16
-; GFX942-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX942-NEXT: s_bitcmp1_b32 s59, 0
-; GFX942-NEXT: s_bitset0_b32 s59, 0
-; GFX942-NEXT: s_mov_b32 s54, s59
+; GFX942-NEXT: s_addc_u32 s60, s32, 16
+; GFX942-NEXT: s_bitcmp1_b32 s60, 0
+; GFX942-NEXT: s_bitset0_b32 s60, 0
+; GFX942-NEXT: s_mov_b32 s59, s60
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s55, v21, 16
; GFX942-NEXT: v_readlane_b32 s54, v21, 15
@@ -872,11 +862,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_lshrrev_b32_e64 v22, 5, s32
-; GFX10_1-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 16, v22
-; GFX10_1-NEXT: v_readfirstlane_b32 s54, v22
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v22
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_readlane_b32 s55, v21, 16
; GFX10_1-NEXT: v_readlane_b32 s54, v21, 15
@@ -932,11 +921,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_lshrrev_b32_e64 v22, 5, s32
-; GFX10_3-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 16, v22
-; GFX10_3-NEXT: v_readfirstlane_b32 s54, v22
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v22
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_readlane_b32 s55, v21, 16
; GFX10_3-NEXT: v_readlane_b32 s54, v21, 15
@@ -990,14 +978,13 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_addc_u32 s59, s32, 16
-; GFX11-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX11-NEXT: s_addc_u32 s60, s32, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_bitcmp1_b32 s59, 0
-; GFX11-NEXT: s_bitset0_b32 s59, 0
-; GFX11-NEXT: s_mov_b32 s54, s59
+; GFX11-NEXT: s_bitcmp1_b32 s60, 0
+; GFX11-NEXT: s_bitset0_b32 s60, 0
+; GFX11-NEXT: s_mov_b32 s59, s60
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_readlane_b32 s55, v21, 16
; GFX11-NEXT: v_readlane_b32 s54, v21, 15
@@ -1055,10 +1042,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX12-NEXT: s_mov_b32 s54, s32
+; GFX12-NEXT: s_mov_b32 s59, s32
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s55, v21, 16
@@ -1105,7 +1091,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; scc is unavailable since it is live in
call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
- "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:20]},{vcc},{s54},{scc}"(
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:20]},{vcc},{s59},{scc}"(
<16 x i32> %s0,
<16 x i32> %s1,
<16 x i32> %s2,
@@ -1165,9 +1151,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_readlane_b32 s54, v22, 0
+; GFX7-NEXT: v_readlane_b32 s59, v22, 0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: v_readlane_b32 s55, v23, 16
; GFX7-NEXT: v_readlane_b32 s54, v23, 15
@@ -1202,66 +1188,58 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201100
; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_writelane_b32 v23, s30, 0
-; GFX8-NEXT: v_writelane_b32 v23, s31, 1
-; GFX8-NEXT: v_writelane_b32 v23, s33, 2
-; GFX8-NEXT: v_writelane_b32 v23, s34, 3
-; GFX8-NEXT: v_writelane_b32 v23, s35, 4
-; GFX8-NEXT: v_writelane_b32 v23, s36, 5
-; GFX8-NEXT: v_writelane_b32 v23, s37, 6
-; GFX8-NEXT: v_writelane_b32 v23, s38, 7
-; GFX8-NEXT: v_writelane_b32 v23, s39, 8
-; GFX8-NEXT: v_writelane_b32 v23, s48, 9
-; GFX8-NEXT: v_writelane_b32 v23, s49, 10
-; GFX8-NEXT: v_writelane_b32 v23, s50, 11
-; GFX8-NEXT: v_writelane_b32 v23, s51, 12
-; GFX8-NEXT: v_writelane_b32 v23, s52, 13
-; GFX8-NEXT: s_lshr_b32 s5, s32, 6
-; GFX8-NEXT: v_writelane_b32 v23, s53, 14
+; GFX8-NEXT: v_writelane_b32 v22, s30, 0
+; GFX8-NEXT: v_writelane_b32 v22, s31, 1
+; GFX8-NEXT: v_writelane_b32 v22, s33, 2
+; GFX8-NEXT: v_writelane_b32 v22, s34, 3
+; GFX8-NEXT: v_writelane_b32 v22, s35, 4
+; GFX8-NEXT: v_writelane_b32 v22, s36, 5
+; GFX8-NEXT: v_writelane_b32 v22, s37, 6
+; GFX8-NEXT: v_writelane_b32 v22, s38, 7
+; GFX8-NEXT: v_writelane_b32 v22, s39, 8
+; GFX8-NEXT: v_writelane_b32 v22, s48, 9
+; GFX8-NEXT: v_writelane_b32 v22, s49, 10
+; GFX8-NEXT: v_writelane_b32 v22, s50, 11
+; GFX8-NEXT: v_writelane_b32 v22, s51, 12
+; GFX8-NEXT: v_writelane_b32 v22, s52, 13
+; GFX8-NEXT: s_lshr_b32 s4, s32, 6
+; GFX8-NEXT: v_writelane_b32 v22, s53, 14
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: s_add_i32 s4, s5, 0x4240
-; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_writelane_b32 v23, s54, 15
+; GFX8-NEXT: s_add_i32 s59, s4, 0x4240
+; GFX8-NEXT: v_writelane_b32 v22, s54, 15
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
-; GFX8-NEXT: v_writelane_b32 v22, s4, 0
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_writelane_b32 v23, s55, 16
+; GFX8-NEXT: v_writelane_b32 v22, s55, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use alloca0 v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX8-NEXT: v_readlane_b32 s54, v22, 0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_readlane_b32 s55, v23, 16
-; GFX8-NEXT: v_readlane_b32 s54, v23, 15
-; GFX8-NEXT: v_readlane_b32 s53, v23, 14
-; GFX8-NEXT: v_readlane_b32 s52, v23, 13
-; GFX8-NEXT: v_readlane_b32 s51, v23, 12
-; GFX8-NEXT: v_readlane_b32 s50, v23, 11
-; GFX8-NEXT: v_readlane_b32 s49, v23, 10
-; GFX8-NEXT: v_readlane_b32 s48, v23, 9
-; GFX8-NEXT: v_readlane_b32 s39, v23, 8
-; GFX8-NEXT: v_readlane_b32 s38, v23, 7
-; GFX8-NEXT: v_readlane_b32 s37, v23, 6
-; GFX8-NEXT: v_readlane_b32 s36, v23, 5
-; GFX8-NEXT: v_readlane_b32 s35, v23, 4
-; GFX8-NEXT: v_readlane_b32 s34, v23, 3
-; GFX8-NEXT: v_readlane_b32 s33, v23, 2
-; GFX8-NEXT: v_readlane_b32 s31, v23, 1
-; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: v_readlane_b32 s55, v22, 16
+; GFX8-NEXT: v_readlane_b32 s54, v22, 15
+; GFX8-NEXT: v_readlane_b32 s53, v22, 14
+; GFX8-NEXT: v_readlane_b32 s52, v22, 13
+; GFX8-NEXT: v_readlane_b32 s51, v22, 12
+; GFX8-NEXT: v_readlane_b32 s50, v22, 11
+; GFX8-NEXT: v_readlane_b32 s49, v22, 10
+; GFX8-NEXT: v_readlane_b32 s48, v22, 9
+; GFX8-NEXT: v_readlane_b32 s39, v22, 8
+; GFX8-NEXT: v_readlane_b32 s38, v22, 7
+; GFX8-NEXT: v_readlane_b32 s37, v22, 6
+; GFX8-NEXT: v_readlane_b32 s36, v22, 5
+; GFX8-NEXT: v_readlane_b32 s35, v22, 4
+; GFX8-NEXT: v_readlane_b32 s34, v22, 3
+; GFX8-NEXT: v_readlane_b32 s33, v22, 2
+; GFX8-NEXT: v_readlane_b32 s31, v22, 1
+; GFX8-NEXT: v_readlane_b32 s30, v22, 0
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX8-NEXT: s_add_i32 s6, s32, 0x201100
; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1272,66 +1250,58 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201100
; GFX900-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
-; GFX900-NEXT: v_writelane_b32 v23, s30, 0
-; GFX900-NEXT: v_writelane_b32 v23, s31, 1
-; GFX900-NEXT: v_writelane_b32 v23, s33, 2
-; GFX900-NEXT: v_writelane_b32 v23, s34, 3
-; GFX900-NEXT: v_writelane_b32 v23, s35, 4
-; GFX900-NEXT: v_writelane_b32 v23, s36, 5
-; GFX900-NEXT: v_writelane_b32 v23, s37, 6
-; GFX900-NEXT: v_writelane_b32 v23, s38, 7
-; GFX900-NEXT: v_writelane_b32 v23, s39, 8
-; GFX900-NEXT: v_writelane_b32 v23, s48, 9
-; GFX900-NEXT: v_writelane_b32 v23, s49, 10
-; GFX900-NEXT: v_writelane_b32 v23, s50, 11
-; GFX900-NEXT: v_writelane_b32 v23, s51, 12
-; GFX900-NEXT: v_writelane_b32 v23, s52, 13
-; GFX900-NEXT: s_lshr_b32 s5, s32, 6
-; GFX900-NEXT: v_writelane_b32 v23, s53, 14
+; GFX900-NEXT: v_writelane_b32 v22, s30, 0
+; GFX900-NEXT: v_writelane_b32 v22, s31, 1
+; GFX900-NEXT: v_writelane_b32 v22, s33, 2
+; GFX900-NEXT: v_writelane_b32 v22, s34, 3
+; GFX900-NEXT: v_writelane_b32 v22, s35, 4
+; GFX900-NEXT: v_writelane_b32 v22, s36, 5
+; GFX900-NEXT: v_writelane_b32 v22, s37, 6
+; GFX900-NEXT: v_writelane_b32 v22, s38, 7
+; GFX900-NEXT: v_writelane_b32 v22, s39, 8
+; GFX900-NEXT: v_writelane_b32 v22, s48, 9
+; GFX900-NEXT: v_writelane_b32 v22, s49, 10
+; GFX900-NEXT: v_writelane_b32 v22, s50, 11
+; GFX900-NEXT: v_writelane_b32 v22, s51, 12
+; GFX900-NEXT: v_writelane_b32 v22, s52, 13
+; GFX900-NEXT: s_lshr_b32 s4, s32, 6
+; GFX900-NEXT: v_writelane_b32 v22, s53, 14
; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX900-NEXT: s_add_i32 s4, s5, 0x4240
-; GFX900-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX900-NEXT: v_writelane_b32 v23, s54, 15
+; GFX900-NEXT: s_add_i32 s59, s4, 0x4240
+; GFX900-NEXT: v_writelane_b32 v22, s54, 15
; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
-; GFX900-NEXT: v_writelane_b32 v22, s4, 0
; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX900-NEXT: v_writelane_b32 v23, s55, 16
+; GFX900-NEXT: v_writelane_b32 v22, s55, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use alloca0 v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX900-NEXT: v_readlane_b32 s54, v22, 0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_readlane_b32 s55, v23, 16
-; GFX900-NEXT: v_readlane_b32 s54, v23, 15
-; GFX900-NEXT: v_readlane_b32 s53, v23, 14
-; GFX900-NEXT: v_readlane_b32 s52, v23, 13
-; GFX900-NEXT: v_readlane_b32 s51, v23, 12
-; GFX900-NEXT: v_readlane_b32 s50, v23, 11
-; GFX900-NEXT: v_readlane_b32 s49, v23, 10
-; GFX900-NEXT: v_readlane_b32 s48, v23, 9
-; GFX900-NEXT: v_readlane_b32 s39, v23, 8
-; GFX900-NEXT: v_readlane_b32 s38, v23, 7
-; GFX900-NEXT: v_readlane_b32 s37, v23, 6
-; GFX900-NEXT: v_readlane_b32 s36, v23, 5
-; GFX900-NEXT: v_readlane_b32 s35, v23, 4
-; GFX900-NEXT: v_readlane_b32 s34, v23, 3
-; GFX900-NEXT: v_readlane_b32 s33, v23, 2
-; GFX900-NEXT: v_readlane_b32 s31, v23, 1
-; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: v_readlane_b32 s55, v22, 16
+; GFX900-NEXT: v_readlane_b32 s54, v22, 15
+; GFX900-NEXT: v_readlane_b32 s53, v22, 14
+; GFX900-NEXT: v_readlane_b32 s52, v22, 13
+; GFX900-NEXT: v_readlane_b32 s51, v22, 12
+; GFX900-NEXT: v_readlane_b32 s50, v22, 11
+; GFX900-NEXT: v_readlane_b32 s49, v22, 10
+; GFX900-NEXT: v_readlane_b32 s48, v22, 9
+; GFX900-NEXT: v_readlane_b32 s39, v22, 8
+; GFX900-NEXT: v_readlane_b32 s38, v22, 7
+; GFX900-NEXT: v_readlane_b32 s37, v22, 6
+; GFX900-NEXT: v_readlane_b32 s36, v22, 5
+; GFX900-NEXT: v_readlane_b32 s35, v22, 4
+; GFX900-NEXT: v_readlane_b32 s34, v22, 3
+; GFX900-NEXT: v_readlane_b32 s33, v22, 2
+; GFX900-NEXT: v_readlane_b32 s31, v22, 1
+; GFX900-NEXT: v_readlane_b32 s30, v22, 0
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
-; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
-; GFX900-NEXT: s_add_i32 s6, s32, 0x201100
; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -1369,12 +1339,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_add_i32 s58, s32, 0x4240
-; GFX942-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
+; GFX942-NEXT: s_add_i32 s59, s32, 0x4240
; GFX942-NEXT: s_and_b64 s[60:61], 0, exec
-; GFX942-NEXT: s_mov_b32 s54, s58
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_readlane_b32 s55, v22, 16
; GFX942-NEXT: v_readlane_b32 s54, v22, 15
@@ -1411,7 +1379,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0
; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240
+; GFX10_1-NEXT: s_add_i32 s59, s4, 0x4240
; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1
; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
@@ -1436,10 +1404,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX10_1-NEXT: s_mov_b32 s54, s58
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_readlane_b32 s55, v22, 16
; GFX10_1-NEXT: v_readlane_b32 s54, v22, 15
@@ -1476,7 +1442,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0
; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5
-; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240
+; GFX10_3-NEXT: s_add_i32 s59, s4, 0x4240
; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1
; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
@@ -1501,10 +1467,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX10_3-NEXT: s_mov_b32 s54, s58
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_readlane_b32 s55, v22, 16
; GFX10_3-NEXT: v_readlane_b32 s54, v22, 15
@@ -1539,7 +1503,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v22, s30, 0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_add_i32 s58, s32, 0x4240
+; GFX11-NEXT: s_add_i32 s59, s32, 0x4240
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: v_writelane_b32 v22, s31, 1
@@ -1564,10 +1528,8 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX11-NEXT: s_mov_b32 s54, s58
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s55, v22, 16
@@ -1606,7 +1568,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v22, s30, 0
-; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200
+; GFX12-NEXT: s_add_co_i32 s59, s32, 0x4200
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
@@ -1631,12 +1593,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: ; kill: def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 killed $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 def $sgpr54
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s54, s58
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s55, v22, 16
; GFX12-NEXT: v_readlane_b32 s54, v22, 15
; GFX12-NEXT: v_readlane_b32 s53, v22, 14
@@ -1684,7 +1644,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; scc is unavailable since it is live in
call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9",
- "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{v[0:15]},{v[16:21]},{vcc},{s54},{scc}"(
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{v[0:15]},{v[16:21]},{vcc},{s59},{scc}"(
<16 x i32> %s0,
<16 x i32> %s1,
<16 x i32> %s2,
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
index f70cd6816a966..79187f51af0d2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll
@@ -44,7 +44,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; scc is unavailable since it is live in
call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
- "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s64},{scc}"(
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
<16 x i32> %s0,
<16 x i32> %s1,
<16 x i32> %s2,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index 92027d0043f76..c431b058f0d2d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,12 +17,15 @@
define void @s116_modified(ptr %a) {
; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4
+; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 0f56862446a9d..18acae5835724 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s
; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
@@ -17,122 +17,78 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; CHECK-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; CHECK-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
-; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; CHECK-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]]
-; CHECK-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]]
-; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0
-; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
-; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
-; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
-; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
-; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
-; CHECK-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]]
-; CHECK-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]]
-; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
-; CHECK-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]]
-; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
-; CHECK-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
-; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4)
-; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
-; CHECK-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
-; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4)
-; CHECK-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]]
-; CHECK-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]]
-; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 12, i32 8, i32 13, i32 9, i32 14, i32 10, i32 15, i32 11>
-; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
-; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
-; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
-; CHECK-NEXT: [[TMP109:%.*]] = lshr <16 x i32> [[TMP108]], splat (i32 15)
-; CHECK-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537)
-; CHECK-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535)
-; CHECK-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]]
-; CHECK-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]]
-; CHECK-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]])
-; CHECK-NEXT: ret i32 [[TMP114]]
+; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
+; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1
+; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0)
+; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4)
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8)
+; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12)
+; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1
+; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4)
+; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP92]], i64 8)
+; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP132]], i64 12)
+; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>
+; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> <i32 3, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 19, i32 poison, i32 4, i32 poison, i32 18, i32 poison, i32 8, i32 poison, i32 17, i32 poison, i32 12, i32 13, i32 16, i32 poison>
+; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 poison, i32 6, i32 18, i32 8, i32 poison, i32 10, i32 17, i32 12, i32 13, i32 14, i32 16>
+; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP3]], i32 5
+; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP52]], i32 9
+; CHECK-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32>
+; CHECK-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4)
+; CHECK-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP98]], i64 8)
+; CHECK-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP138]], i64 12)
+; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32>
+; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16)
+; CHECK-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]]
+; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP70]]
+; CHECK-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP70]]
+; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]]
+; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 21, i32 22, i32 7, i32 8, i32 25, i32 26, i32 11, i32 12, i32 29, i32 30, i32 15>
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; CHECK-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]]
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> <i32 0, i32 20, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 28, i32 29, i32 30, i32 11, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
+; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
+; CHECK-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535)
+; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]]
+; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]]
+; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; CHECK-NEXT: ret i32 [[ADD113_3]]
;
; THR15-LABEL: define i32 @test(
; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -148,122 +104,78 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
+; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; THR15-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; THR15-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; THR15-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; THR15-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; THR15-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; THR15-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; THR15-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; THR15-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; THR15-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP14]], [[TMP13]]
-; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT: [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; THR15-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; THR15-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; THR15-NEXT: [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; THR15-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; THR15-NEXT: [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; THR15-NEXT: [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; THR15-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; THR15-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP34]], [[TMP33]]
-; THR15-NEXT: [[TMP36:%.*]] = sub <4 x i32> [[TMP34]], [[TMP33]]
-; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT: [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT: [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; THR15-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; THR15-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; THR15-NEXT: [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
-; THR15-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; THR15-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
-; THR15-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; THR15-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; THR15-NEXT: [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; THR15-NEXT: [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; THR15-NEXT: [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP53]]
-; THR15-NEXT: [[TMP56:%.*]] = sub <4 x i32> [[TMP54]], [[TMP53]]
-; THR15-NEXT: [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 2, i32 7, i32 0, i32 5>
-; THR15-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
-; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; THR15-NEXT: [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2)
-; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
-; THR15-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
-; THR15-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
-; THR15-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
-; THR15-NEXT: [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
-; THR15-NEXT: [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT: [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]]
-; THR15-NEXT: [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]]
-; THR15-NEXT: [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; THR15-NEXT: [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
-; THR15-NEXT: [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]]
-; THR15-NEXT: [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
-; THR15-NEXT: [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
-; THR15-NEXT: [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4)
-; THR15-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
-; THR15-NEXT: [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
-; THR15-NEXT: [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4)
-; THR15-NEXT: [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]]
-; THR15-NEXT: [[TMP96:%.*]] = sub <8 x i32> [[TMP90]], [[TMP94]]
-; THR15-NEXT: [[TMP97:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP96]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 12, i32 8, i32 13, i32 9, i32 14, i32 10, i32 15, i32 11>
-; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP100:%.*]] = shufflevector <16 x i32> [[TMP98]], <16 x i32> [[TMP99]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
-; THR15-NEXT: [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
-; THR15-NEXT: [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT: [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
-; THR15-NEXT: [[TMP109:%.*]] = lshr <16 x i32> [[TMP108]], splat (i32 15)
-; THR15-NEXT: [[TMP110:%.*]] = and <16 x i32> [[TMP109]], splat (i32 65537)
-; THR15-NEXT: [[TMP111:%.*]] = mul <16 x i32> [[TMP110]], splat (i32 65535)
-; THR15-NEXT: [[TMP112:%.*]] = add <16 x i32> [[TMP111]], [[TMP97]]
-; THR15-NEXT: [[TMP113:%.*]] = xor <16 x i32> [[TMP112]], [[TMP108]]
-; THR15-NEXT: [[TMP114:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP113]])
-; THR15-NEXT: ret i32 [[TMP114]]
+; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; THR15-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
+; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; THR15-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; THR15-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1
+; THR15-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0)
+; THR15-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4)
+; THR15-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8)
+; THR15-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12)
+; THR15-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; THR15-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1
+; THR15-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0)
+; THR15-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4)
+; THR15-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP143]], i64 8)
+; THR15-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP148]], i64 12)
+; THR15-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
+; THR15-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]]
+; THR15-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>
+; THR15-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; THR15-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> <i32 3, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 5, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 19, i32 poison, i32 4, i32 poison, i32 18, i32 poison, i32 8, i32 poison, i32 17, i32 poison, i32 12, i32 13, i32 16, i32 poison>
+; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 poison, i32 6, i32 18, i32 8, i32 poison, i32 10, i32 17, i32 12, i32 13, i32 14, i32 16>
+; THR15-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP1]], i32 5
+; THR15-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP48]], i32 9
+; THR15-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32>
+; THR15-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0)
+; THR15-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4)
+; THR15-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP147]], i64 8)
+; THR15-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP153]], i64 12)
+; THR15-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32>
+; THR15-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 2, i32 6, i32 14, i32 10, i32 1, i32 5, i32 13, i32 9, i32 0, i32 4, i32 12, i32 8>
+; THR15-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]]
+; THR15-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16)
+; THR15-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]]
+; THR15-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
+; THR15-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP70]]
+; THR15-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP70]]
+; THR15-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+; THR15-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; THR15-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP52]]
+; THR15-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP52]]
+; THR15-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; THR15-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; THR15-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]]
+; THR15-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]]
+; THR15-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 21, i32 22, i32 7, i32 8, i32 25, i32 26, i32 11, i32 12, i32 29, i32 30, i32 15>
+; THR15-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+; THR15-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]]
+; THR15-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]]
+; THR15-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; THR15-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> <i32 0, i32 20, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 28, i32 29, i32 30, i32 11, i32 24, i32 25, i32 26, i32 27>
+; THR15-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
+; THR15-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
+; THR15-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535)
+; THR15-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]]
+; THR15-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]]
+; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
+; THR15-NEXT: ret i32 [[ADD113_3]]
;
entry:
%0 = load i8, ptr %pix1, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 5d9975b25c381..7723746dda301 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -1022,8 +1022,10 @@ define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 3e2c305dbed65..e24c52ba81ddf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,45 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s
define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; SSE2-LABEL: @sitofp_uitofp(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @sitofp_uitofp(
-; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX2-LABEL: @sitofp_uitofp(
-; AVX2-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @sitofp_uitofp(
-; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @sitofp_uitofp(
+; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -69,39 +41,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
}
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE2-LABEL: @fptosi_fptoui(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @fptosi_fptoui(
-; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -131,39 +75,11 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
}
define <8 x float> @fneg_fabs(<8 x float> %a) {
-; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]]
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -209,39 +125,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
}
define <8 x i32> @sext_zext(<8 x i16> %a) {
-; SSE2-LABEL: @sext_zext(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @sext_zext(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX-LABEL: @sext_zext(
-; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @sext_zext(
-; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @sext_zext(
-; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @sext_zext(
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 880523d6474ac..0f8751a6da7f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -1,45 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s
define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; SSE2-LABEL: @sitofp_uitofp(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @sitofp_uitofp(
-; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX2-LABEL: @sitofp_uitofp(
-; AVX2-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX2-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @sitofp_uitofp(
-; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @sitofp_uitofp(
+; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -69,39 +41,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
}
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE2-LABEL: @fptosi_fptoui(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @fptosi_fptoui(
-; AVX2-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -131,39 +75,11 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
}
define <8 x float> @fneg_fabs(<8 x float> %a) {
-; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x float> [[DOTUNCASTED]]
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; CHECK-NEXT: [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x float> [[DOTUNCASTED]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -209,39 +125,11 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
}
define <8 x i32> @sext_zext(<8 x i16> %a) {
-; SSE2-LABEL: @sext_zext(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @sext_zext(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX-LABEL: @sext_zext(
-; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @sext_zext(
-; AVX2-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX2-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @sext_zext(
-; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @sext_zext(
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
index 5cee6984df04f..5a1de4f3e3d7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -1,47 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fadd_fsub_v8f32(
-; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX2-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @fadd_fsub_v8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -79,43 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX2-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -172,10 +110,6 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX-NEXT: ret <4 x float> [[TMP1]]
;
-; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; AVX2-NEXT: ret <4 x float> [[TMP1]]
-;
; AVX512-LABEL: @fmul_fdiv_v4f32_const(
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX512-NEXT: ret <4 x float> [[TMP1]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index 9a2f959ac63bc..046ed781f4c8d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -1,47 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fadd_fsub_v8f32(
-; AVX2-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX2-NEXT: ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @fadd_fsub_v8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -79,43 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT: ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT: [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX2-NEXT: ret <8 x float> [[TMP5]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT: ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
;
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
@@ -172,10 +110,6 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX-NEXT: ret <4 x float> [[TMP1]]
;
-; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; AVX2-NEXT: ret <4 x float> [[TMP1]]
-;
; AVX512-LABEL: @fmul_fdiv_v4f32_const(
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX512-NEXT: ret <4 x float> [[TMP1]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index f8c5df9944538..8839fc2281788 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -7,39 +7,11 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX1-LABEL: @add_sub_v8i32(
-; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @add_sub_v8i32(
-; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @add_sub_v8i32(
-; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -134,16 +106,14 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT: ret <8 x i32> [[R71]]
;
; SLM-LABEL: @ashr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT: ret <8 x i32> [[TMP3]]
;
; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -204,16 +174,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R71]]
;
; SLM-LABEL: @ashr_shl_v8i32_const(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
+; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[R71]]
;
; AVX1-LABEL: @ashr_shl_v8i32_const(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -531,49 +501,13 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
}
define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: @add_sub_v8i32_splat(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[TMP7]]
-;
-; SLM-LABEL: @add_sub_v8i32_splat(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP7]]
-;
-; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP5]]
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP5]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index b84ef027f67c5..dfa918a6ea453 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -7,39 +7,11 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX1-LABEL: @add_sub_v8i32(
-; AVX1-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX1-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @add_sub_v8i32(
-; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @add_sub_v8i32(
-; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -134,16 +106,14 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT: ret <8 x i32> [[R71]]
;
; SLM-LABEL: @ashr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT: ret <8 x i32> [[TMP3]]
;
; AVX1-LABEL: @ashr_shl_v8i32(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -204,16 +174,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[TMP5]]
+; SSE-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R71]]
;
; SLM-LABEL: @ashr_shl_v8i32_const(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP5]]
+; SLM-NEXT: [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[R71]]
;
; AVX1-LABEL: @ashr_shl_v8i32_const(
; AVX1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -531,49 +501,13 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
}
define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: @add_sub_v8i32_splat(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT: ret <8 x i32> [[TMP7]]
-;
-; SLM-LABEL: @add_sub_v8i32_splat(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SLM-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[TMP7]]
-;
-; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX1-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX1-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT: ret <8 x i32> [[TMP5]]
-;
-; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX512-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: ret <8 x i32> [[TMP5]]
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[TMP5]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index 7ed5f33c9dc6c..b659c10bb2fbf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -7,7 +7,7 @@ define void @test() {
; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
+; CHECK-NEXT: [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]]
@@ -16,8 +16,6 @@ define void @test() {
; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4)
; CHECK-NEXT: ret void
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
index e42e6183b8cae..48b04201d1acc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
@@ -15,8 +15,8 @@ define ptr @test(ptr %0, ptr %args_gep) {
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[ARG26]], i64 17
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
index 70c67ff251d6d..9fc2b7d6e7865 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index faaac0c7614f6..c3122d991da20 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -286,8 +286,8 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index ea497c95d4114..cfbfd0ebc37bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,24 +10,22 @@ define i32 @bar() local_unnamed_addr {
; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef
; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 2
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef>, i32 [[SUB86_1]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[SUB102_3]], i32 6
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 5, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 14>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
-; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8)
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 7, i32 6, i32 5, i32 4, i32 24, i32 25, i32 26, i32 27, i32 poison, i32 29, i32 30, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12>
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15)
; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537)
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535)
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]]
; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP17]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
index e9a65bf6d6f0d..2f49a2e6a212e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
@@ -6,11 +6,11 @@ define i1 @foo() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0
; CHECK-NEXT: [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[STOREMERGE_2333_I_I]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> <i1 false, i1 false, i1 undef, i1 undef>, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <4 x i1> [[TMP6]], i64 4)
+; CHECK-NEXT: [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 [[STOREMERGE_2333_I_I]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6)
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 17ae33652b6d8..df85656800aac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -153,8 +153,8 @@ define float @foo3(ptr nocapture readonly %A) #0 {
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
@@ -283,7 +283,7 @@ define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) {
; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16
; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1
; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16
-; CHECK-NEXT: br i1 [[ARG:%.*]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]]
; CHECK: then:
; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16
; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
index b4e66138578df..787bd39759dc7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
@@ -103,10 +103,10 @@ define void @test2(ptr %p1, ptr %p2) {
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> <double 4.000000e+00, double 4.100000e+00>, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], <double 2.000000e+00, double 2.100000e+00>
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], <double 3.000000e+00, double 3.100000e+00>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15:%.*]], [[BB6:%.*]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x double> [ [[TMP11]], [[BB1]] ], [ [[TMP16:%.*]], [[BB6:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], align 8
; CHECK-NEXT: br i1 poison, label [[BB3:%.*]], label [[BB6]]
@@ -117,7 +117,8 @@ define void @test2(ptr %p1, ptr %p2) {
; CHECK: bb5:
; CHECK-NEXT: br label [[BB6]]
; CHECK: bb6:
-; CHECK-NEXT: [[TMP15]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
+; CHECK-NEXT: [[TMP16]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: br label [[BB2]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
index cda88620ab88a..9682567b173c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -11,9 +11,9 @@ define void @test() {
; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i16> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[TMP6]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
index 3b9222b7d5ed1..8a017a397cff9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-4 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-10 < %s | FileCheck %s
define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-LABEL: define i32 @test(
@@ -13,7 +13,7 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: br i1 false, label %[[D_EXIT_3]], label %[[D_EXIT_6:.*]]
; CHECK: [[D_EXIT_3]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[ENTRY]] ], [ poison, %[[IF_END_I_1]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[RETVAL_0_I_219]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 0, i32 0>, i32 [[RETVAL_0_I_219]], i32 0
; CHECK-NEXT: br i1 [[TOBOOL_I_4]], label %[[D_EXIT_4:.*]], label %[[D_EXIT_6]]
; CHECK: [[D_EXIT_4]]:
; CHECK-NEXT: br label %[[D_EXIT_6]]
@@ -21,29 +21,25 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
; CHECK-NEXT: br i1 false, label %[[D_EXIT_6]], label %[[D_EXIT_7:.*]]
; CHECK: [[D_EXIT_6]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
; CHECK-NEXT: br label %[[D_EXIT_7]]
; CHECK: [[D_EXIT_7]]:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 1, i32 poison, i32 poison, i32 1, i32 poison, i32 1, i32 1, i32 poison>, <8 x i32> <i32 8, i32 1, i32 2, i32 11, i32 poison, i32 13, i32 14, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP0]], i32 4
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[RETVAL_0_I_219]], i32 7
+; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i32> [[TMP11]], [[TMP7]]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>, i32 [[RETVAL_0_I_219]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP21]], <4 x i32> [[TMP10]], i64 4)
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> <i32 poison, i32 poison, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 1>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 5, i32 6, i32 15>
-; CHECK-NEXT: [[TMP19:%.*]] = add <8 x i32> [[TMP18]], [[TMP22]]
-; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP19]], i64 0)
-; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> [[TMP20]], [[TMP16]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP19]], <4 x i32> [[RDX_OP]], i64 0)
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP12]])
-; CHECK-NEXT: ret i32 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP16]])
+; CHECK-NEXT: [[OP_RDX4:%.*]] = or i32 [[TMP18]], [[TMP17]]
+; CHECK-NEXT: ret i32 [[OP_RDX4]]
;
entry:
%0 = load i32, ptr %f, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
index 1294a87ff6967..c01c44ff03c15 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -7,14 +7,20 @@ define void @test(i1 %c, ptr %arg) {
; CHECK: if:
; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2)
; CHECK-NEXT: br label [[JOIN:%.*]]
; CHECK: else:
; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2)
; CHECK-NEXT: br label [[JOIN]]
; CHECK: join:
; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 38e9ba7ce7028..33fa00c1881da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,19 +6,23 @@ define i32 @a() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 2, i32 3, i32 12, i32 3, i32 12, i32 13, i32 14>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4
+; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4
+; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label %[[BB1]]
;
br label %1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
index e3a6020a542fb..0ed12760b563f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
index cea98bf55b6ff..f47373747e578 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -35,8 +35,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 7060288d739bd..d650a972ad8ca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -29,8 +29,8 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
diff --git a/llvm/test/Transforms/SLPVectorizer/addsub.ll b/llvm/test/Transforms/SLPVectorizer/addsub.ll
index 6814bc0f566f6..3961250d56451 100644
--- a/llvm/test/Transforms/SLPVectorizer/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/addsub.ll
@@ -387,10 +387,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>