[llvm] ef98e24 - [SLP]Initial support for copyable elements (non-schedulable only)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 25 10:55:24 PDT 2025
Author: Alexey Bataev
Date: 2025-07-25T10:55:07-07:00
New Revision: ef98e248c7740fb882b256dd325d22a057de1951
URL: https://github.com/llvm/llvm-project/commit/ef98e248c7740fb882b256dd325d22a057de1951
DIFF: https://github.com/llvm/llvm-project/commit/ef98e248c7740fb882b256dd325d22a057de1951.diff
LOG: [SLP]Initial support for copyable elements (non-schedulable only)
Adds initial support for copyable elements. This patch only models add
instructions and models a copyable element as add <element>, 0, i.e. it
uses identity constants for the missing lanes.
Only support for elements that do not require scheduling is added, to
reduce the size of the patch.
Fixed compile-time regressions and reported crashes, and updated the
release notes.
Reviewers: RKSimon, hiraditya
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/140279
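For illustration, a minimal sketch of the modeling in LLVM IR (the function
and value names below are hypothetical, not taken from the patch): in a
two-lane bundle where lane 0 is a real add and lane 1 is a plain value %x1,
the copyable lane is treated as "add i32 %x1, 0", so the whole bundle becomes
a vector add whose missing operand lane is filled with the identity constant:

    ; scalar bundle: { add i32 %x0, %y0 ; %x1 } -- %x1 is the copyable lane
    define <2 x i32> @model_copyable(i32 %x0, i32 %y0, i32 %x1) {
      %lhs.tmp = insertelement <2 x i32> poison, i32 %x0, i32 0
      %lhs = insertelement <2 x i32> %lhs.tmp, i32 %x1, i32 1
      ; identity constant 0 fills the operand of the copyable lane
      %rhs = insertelement <2 x i32> <i32 poison, i32 0>, i32 %y0, i32 0
      %v = add <2 x i32> %lhs, %rhs
      ret <2 x i32> %v
    }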
Added:
llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll
Modified:
llvm/docs/ReleaseNotes.md
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
Removed:
################################################################################
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 48d2ef1b4d1c5..021f321bd9dc2 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -68,6 +68,12 @@ Changes to TableGen
Changes to Interprocedural Optimizations
----------------------------------------
+Changes to Vectorizers
+----------------------------------------
+
+* Added initial support for copyable elements in SLP, which models copyable
+ elements as add <element>, 0, i.e. uses identity constants for missing lanes.
+
Changes to the AArch64 Backend
------------------------------
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0d0b342505214..593868fb8811a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -206,6 +206,12 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+/// Enables vectorization of copyable elements.
+static cl::opt<bool> VectorizeCopyableElements(
+ "slp-copyable-elements", cl::init(true), cl::Hidden,
+ cl::desc("Try to replace values with the idempotent instructions for "
+ "better vectorization."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -855,6 +861,13 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
return *EI->idx_begin();
}
+namespace llvm {
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V);
+} // namespace llvm
+
namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
@@ -957,6 +970,33 @@ class BinOpSameOpcodeHelper {
return Instruction::Xor;
llvm_unreachable("Cannot find interchangeable instruction.");
}
+
+ /// Return true if the instruction can be converted to \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ MaskType Candidate = Mask & SeenBefore;
+ switch (Opcode) {
+ case Instruction::Shl:
+ return Candidate & ShlBIT;
+ case Instruction::AShr:
+ return Candidate & AShrBIT;
+ case Instruction::Mul:
+ return Candidate & MulBIT;
+ case Instruction::Add:
+ return Candidate & AddBIT;
+ case Instruction::Sub:
+ return Candidate & SubBIT;
+ case Instruction::And:
+ return Candidate & AndBIT;
+ case Instruction::Or:
+ return Candidate & OrBIT;
+ case Instruction::Xor:
+ return Candidate & XorBIT;
+ default:
+ break;
+ }
+ llvm_unreachable("Cannot find interchangeable instruction.");
+ }
+
SmallVector<Value *> getOperand(const Instruction *To) const {
unsigned ToOpcode = To->getOpcode();
unsigned FromOpcode = I->getOpcode();
@@ -1117,6 +1157,10 @@ class BinOpSameOpcodeHelper {
AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
}
unsigned getMainOpcode() const { return MainOp.getOpcode(); }
+ /// Checks if the list of potential opcodes includes \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ return MainOp.hasCandidateOpcode(Opcode);
+ }
bool hasAltOp() const { return AltOp.I; }
unsigned getAltOpcode() const {
return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
@@ -1152,6 +1196,8 @@ class InstructionsState {
/// GetVectorCost.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+ /// Whether the instruction state represents copyable instructions.
+ bool HasCopyables = false;
public:
Instruction *getMainOp() const {
@@ -1190,9 +1236,11 @@ class InstructionsState {
if (!I->isBinaryOp())
return nullptr;
BinOpSameOpcodeHelper Converter(MainOp);
- if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
- return MainOp;
- return AltOp;
+ if (!Converter.add(I) || !Converter.add(MainOp))
+ return nullptr;
+ if (Converter.hasAltOp() && !isAltShuffle())
+ return nullptr;
+ return Converter.hasAltOp() ? AltOp : MainOp;
}
/// Checks if main/alt instructions are shift operations.
@@ -1237,9 +1285,63 @@ class InstructionsState {
explicit operator bool() const { return valid(); }
InstructionsState() = delete;
- InstructionsState(Instruction *MainOp, Instruction *AltOp)
- : MainOp(MainOp), AltOp(AltOp) {}
+ InstructionsState(Instruction *MainOp, Instruction *AltOp,
+ bool HasCopyables = false)
+ : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+
+ bool isCopyableElement(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (!HasCopyables)
+ return false;
+ if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return !isa<PoisonValue>(V);
+ if (I->getParent() != MainOp->getParent() &&
+ (!isVectorLikeInstWithConstOps(I) ||
+ !isVectorLikeInstWithConstOps(MainOp)))
+ return true;
+ if (I->getOpcode() == MainOp->getOpcode())
+ return false;
+ if (!I->isBinaryOp())
+ return true;
+ BinOpSameOpcodeHelper Converter(MainOp);
+ return !Converter.add(I) || !Converter.add(MainOp) ||
+ Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
+ }
+
+ /// Checks if the value is non-schedulable.
+ bool isNonSchedulable(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ auto *I = dyn_cast<Instruction>(V);
+ if (!HasCopyables)
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ // MainOp for copyables is always schedulable, to correctly identify
+ // non-schedulable copyables.
+ if (isCopyableElement(V)) {
+ auto IsNonSchedulableCopyableElement = [this](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
+ (doesNotNeedToBeScheduled(I) &&
+ // If the copyable instruction comes after MainOp
+ // (non-schedulable, but used in the block), we cannot vectorize
+ // it, as it may generate a use before def.
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I)));
+ };
+
+ return IsNonSchedulableCopyableElement(V);
+ }
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ }
+
+ bool areInstructionsWithCopyableElements() const {
+ assert(valid() && "InstructionsState is invalid.");
+ return HasCopyables;
+ }
};
std::pair<Instruction *, SmallVector<Value *>>
@@ -1917,6 +2019,7 @@ class BoUpSLP {
CompressEntryToData.clear();
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
+ ExternalUsesWithNonUsers.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -2899,9 +3002,6 @@ class BoUpSLP {
for (OperandDataVec &Ops : OpsVec)
Ops.resize(NumLanes);
for (unsigned Lane : seq<unsigned>(NumLanes)) {
- Value *V = VL[Lane];
- assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
- "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2912,17 +3012,24 @@ class BoUpSLP {
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely tell
// the inverse operations by checking commutativity.
- if (isa<PoisonValue>(V)) {
+ auto *I = dyn_cast<Instruction>(VL[Lane]);
+ if (!I && isa<PoisonValue>(VL[Lane])) {
for (unsigned OpIdx : seq<unsigned>(NumOperands))
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
continue;
}
- auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
- // We cannot check commutativity by the converted instruction
- // (SelectedOp) because isCommutative also examines def-use
- // relationships.
- bool IsInverseOperation =
- !isCommutative(SelectedOp, cast<Instruction>(V));
+ bool IsInverseOperation = false;
+ if (S.isCopyableElement(VL[Lane])) {
+ // The value is a copyable element.
+ IsInverseOperation = !isCommutative(MainOp);
+ } else {
+ assert(I && "Expected instruction");
+ auto [SelectedOp, Ops] = convertTo(I, S);
+ // We cannot check commutativity by the converted instruction
+ // (SelectedOp) because isCommutative also examines def-use
+ // relationships.
+ IsInverseOperation = !isCommutative(SelectedOp, I);
+ }
for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
@@ -3792,6 +3899,9 @@ class BoUpSLP {
/// reordering of operands during buildTreeRec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
+ /// Copyable elements of the entry node.
+ SmallPtrSet<const Value *, 4> CopyableElements;
+
/// MainOp and AltOp are recorded inside. S should be obtained from
/// newTreeEntry.
InstructionsState S = InstructionsState::invalid();
@@ -3820,11 +3930,7 @@ class BoUpSLP {
void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
/// Marks the node as one that does not require scheduling.
- void setDoesNotNeedToSchedule() {
- assert(::doesNotNeedToSchedule(Scalars) &&
- "Expected to not need scheduling");
- DoesNotNeedToSchedule = true;
- }
+ void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
/// Returns true if the node is marked as one that does not require
/// scheduling.
bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
@@ -3896,6 +4002,20 @@ class BoUpSLP {
bool hasState() const { return S.valid(); }
+ /// Add \p V to the list of copyable elements.
+ void addCopyableElement(Value *V) {
+ assert(S.isCopyableElement(V) && "Not a copyable element.");
+ CopyableElements.insert(V);
+ }
+
+ /// Returns true if \p V is a copyable element.
+ bool isCopyableElement(Value *V) const {
+ return CopyableElements.contains(V);
+ }
+
+ /// Returns true if any scalar in the list is a copyable element.
+ bool hasCopyableElements() const { return !CopyableElements.empty(); }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -3968,6 +4088,8 @@ class BoUpSLP {
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "State: ";
+ if (S && hasCopyableElements())
+ dbgs() << "[[Copyable]] ";
switch (State) {
case Vectorize:
if (InterleaveFactor > 0) {
@@ -4145,12 +4267,20 @@ class BoUpSLP {
}
}
} else if (!Last->isGather()) {
- if (doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!S.areInstructionsWithCopyableElements() &&
+ doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
Last->setDoesNotNeedToSchedule();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
if (isa<PoisonValue>(V))
continue;
+ if (S.isCopyableElement(V)) {
+ Last->addCopyableElement(V);
+ continue;
+ }
auto It = ScalarToTreeEntries.find(V);
if (It == ScalarToTreeEntries.end()) {
ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
@@ -4162,16 +4292,14 @@ class BoUpSLP {
}
}
// Update the scheduler bundle to point to this TreeEntry.
- assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) ||
- Last->doesNotNeedToSchedule()) &&
+ assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
"Bundle and VL out of sync");
if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
auto *BundleMember = Bundle.getBundle().begin();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
+ if (S.isNonSchedulable(V) || !Processed.insert(V).second)
continue;
++BundleMember;
}
@@ -4280,7 +4408,8 @@ class BoUpSLP {
/// in general.
ScalarsVectorizationLegality
getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const;
+ const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const;
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
@@ -4420,6 +4549,10 @@ class BoUpSLP {
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
+ /// A list of scalars to be extracted without a specific user because of too
+ /// many uses.
+ SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
+
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
@@ -4996,7 +5129,8 @@ class BoUpSLP {
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
+ ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -6727,7 +6861,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- (!TE.UserTreeIndex ||
+ (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
!Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
@@ -7038,10 +7172,11 @@ bool BoUpSLP::isProfitableToReorder() const {
VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
VectorizableTree.front()->ReorderIndices.empty()) {
// Check if the tree has only single store and single (unordered) load node,
- // other nodes are phis or geps/binops, combined with phis, and/orsingle
+ // other nodes are phis or geps/binops, combined with phis, and/or single
// gather load node
bool HasPhis = false;
- if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ if (VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::PHI &&
VectorizableTree.front()->Scalars.size() == TinyVF &&
VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
return false;
@@ -7049,6 +7184,8 @@ bool BoUpSLP::isProfitableToReorder() const {
unsigned GatherLoads = 0;
for (const std::unique_ptr<TreeEntry> &TE :
ArrayRef(VectorizableTree).drop_front()) {
+ if (TE->State == TreeEntry::SplitVectorize)
+ continue;
if (!TE->hasState()) {
if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
@@ -7072,7 +7209,10 @@ bool BoUpSLP::isProfitableToReorder() const {
if (TE->getOpcode() == Instruction::GetElementPtr ||
Instruction::isBinaryOp(TE->getOpcode()))
continue;
- if (TE->getOpcode() != Instruction::PHI)
+ if (TE->getOpcode() != Instruction::PHI &&
+ (!TE->hasCopyableElements() ||
+ static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
+ TE->Scalars.size() / 2))
return true;
if (VectorizableTree.front()->Scalars.size() == TinyVF &&
TE->getNumOperands() > PhiOpsLimit)
@@ -7860,7 +8000,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
- if ((Entry.getOpcode() == Instruction::Store ||
+ if (Entry.hasState() &&
+ (Entry.getOpcode() == Instruction::Store ||
Entry.getOpcode() == Instruction::Load) &&
Entry.State == TreeEntry::StridedVectorize &&
!Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
@@ -7870,7 +8011,9 @@ Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
void BoUpSLP::buildExternalUses(
const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+ const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
DenseMap<Value *, unsigned> ScalarToExtUses;
+ SmallPtrSet<Value *, 4> ExternalUsers;
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -7882,13 +8025,24 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
+
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
continue;
+ if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
+ << " from " << *Scalar << "for many users.\n");
+ It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+ ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
+ continue;
+ }
+
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
@@ -7916,7 +8070,10 @@ void BoUpSLP::buildExternalUses(
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in FoundLane will
// be used.
- if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
+ if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
+ isa<LoadInst, StoreInst>(UserInst)) ||
+ isa<CallInst>(UserInst)) ||
+ all_of(UseEntries, [&](TreeEntry *UseEntry) {
return UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -7946,6 +8103,7 @@ void BoUpSLP::buildExternalUses(
<< ".\n");
It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
if (!U)
break;
}
@@ -9612,7 +9770,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons operations are still valid for
// vectorization (div/rem are not allowed).
- if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if (!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
@@ -9761,13 +9920,95 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
}
namespace {
-/// Class accepts incoming list of values and generates the list of values
-/// for scheduling and list of operands for the new nodes.
+/// Class accepts incoming list of values, checks if it is able to model
+/// "copyable" values as compatible operations, and generates the list of values
+/// for scheduling and the list of operands for the new nodes.
class InstructionsCompatibilityAnalysis {
DominatorTree &DT;
const DataLayout &DL;
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
+ unsigned MainOpcode = 0;
+ Instruction *MainOp = nullptr;
+
+ /// Identifies the best candidate value, which represents the main opcode
+ /// operation.
+ /// Currently the best candidate is the Add instruction in the parent
+ /// block with the highest DFS incoming number (the block that dominates the
+ /// others).
+ void findAndSetMainInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *Parent = nullptr;
+ // Checks if the instruction has supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add;
+ };
+ SmallDenseSet<Value *, 8> Operands;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (!DT.isReachableFromEntry(I->getParent()))
+ continue;
+ if (!MainOp) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ if (Parent == I->getParent()) {
+ if (!IsSupportedOpcode(MainOp))
+ MainOp = I;
+ if (MainOp->getOpcode() == I->getOpcode() &&
+ doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I))
+ MainOp = I;
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ auto *NodeA = DT.getNode(Parent);
+ auto *NodeB = DT.getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have
diff erent DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.clear();
+ Operands.insert(I->op_begin(), I->op_end());
+ }
+ }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
+ }
+ MainOpcode = MainOp->getOpcode();
+ }
+
+ /// Returns the idempotent value for the \p MainOp with the detected \p
+ /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
+ /// the operand itself, since V or V == V.
+ Value *selectBestIdempotentValue() const {
+ assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+ return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
+ !MainOp->isCommutative());
+ }
+
+ /// Returns the value and operands for \p V, considering whether it is an
+ /// original instruction, whose actual operands should be returned, or a
+ /// copyable element, which should be represented as an idempotent
+ /// instruction.
+ SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
+ if (isa<PoisonValue>(V))
+ return {V, V};
+ if (!S.isCopyableElement(V))
+ return convertTo(cast<Instruction>(V), S).second;
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
/// Builds operands for the original instructions.
void
@@ -9928,22 +10169,165 @@ class InstructionsCompatibilityAnalysis {
const TargetLibraryInfo &TLI)
: DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
+ InstructionsState
+ buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
+ bool TryCopyableElementsVectorization,
+ bool WithProfitabilityCheck = false,
+ bool SkipSameCodeCheck = false) {
+ InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
+ ? InstructionsState::invalid()
+ : getSameOpcode(VL, TLI);
+ if (S)
+ return S;
+ if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
+ return S;
+ findAndSetMainInstruction(VL);
+ if (!MainOp)
+ return InstructionsState::invalid();
+ S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
+ // TODO: Remove this check once support for schedulable copyables is landed.
+ if (any_of(VL, [&](Value *V) {
+ return S.isCopyableElement(V) && !S.isNonSchedulable(V);
+ }))
+ return InstructionsState::invalid();
+
+ if (!WithProfitabilityCheck)
+ return S;
+ // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
+ auto BuildCandidates =
+ [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
+ Value *V2) {
+ if (V1 != V2 && isa<PHINode>(V1))
+ return;
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
+ I1->getParent() != I2->getParent())
+ return;
+ Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
+ };
+ if (VL.size() == 2) {
+ // Check if the operands allow better vectorization.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
+ BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
+ bool Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ if (!Res && isCommutative(MainOp)) {
+ Candidates1.clear();
+ Candidates2.clear();
+ BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
+ Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ }
+ if (!Res)
+ return InstructionsState::invalid();
+ return S;
+ }
+ assert(Operands.size() == 2 && "Unexpected number of operands!");
+ unsigned CopyableNum =
+ count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
+ if (CopyableNum < VL.size() / 2)
+ return S;
+ // Too many phi copyables - exit.
+ const unsigned Limit = VL.size() / 24;
+ if ((CopyableNum >= VL.size() - Limit ||
+ (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
+ CopyableNum >= MaxPHINumOperands) &&
+ all_of(VL, [&](Value *V) {
+ return isa<PHINode>(V) || !S.isCopyableElement(V);
+ }))
+ return InstructionsState::invalid();
+ // Check profitability if number of copyables > VL.size() / 2.
+ // 1. Reorder operands for better matching.
+ if (isCommutative(MainOp)) {
+ for (auto &Ops : Operands) {
+ // Make instructions the first operands.
+ if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ // Make constants the second operands.
+ if (isa<Constant>(Ops.front())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ }
+ }
+ // 2. Check, if operands can be vectorized.
+ if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
+ return InstructionsState::invalid();
+ auto CheckOperand = [&](ArrayRef<Value *> Ops) {
+ if (allConstant(Ops) || isSplat(Ops))
+ return true;
+ // Check if it is "almost" splat, i.e. has >= 4 elements and only single
+ // one is
diff erent.
+ constexpr unsigned Limit = 4;
+ if (Operands.front().size() >= Limit) {
+ SmallDenseMap<const Value *, unsigned> Counters;
+ for (Value *V : Ops) {
+ if (isa<UndefValue>(V))
+ continue;
+ ++Counters[V];
+ }
+ if (Counters.size() == 2 &&
+ any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
+ return C.second == 1;
+ }))
+ return true;
+ }
+ // First operand not a constant or splat? Last attempt - check for
+ // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ InstructionsState OpS = Analysis.buildInstructionsState(
+ Ops, R, /*TryCopyableElementsVectorization=*/true);
+ if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
+ return false;
+ unsigned CopyableNum =
+ count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
+ return CopyableNum <= VL.size() / 2;
+ };
+ if (!CheckOperand(Operands.front()))
+ return InstructionsState::invalid();
+
+ return S;
+ }
+
SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
ArrayRef<Value *> VL) {
assert(S && "Invalid state!");
SmallVector<BoUpSLP::ValueList> Operands;
- buildOriginalOperands(S, VL, Operands);
+ if (S.areInstructionsWithCopyableElements()) {
+ MainOp = S.getMainOp();
+ MainOpcode = S.getOpcode();
+ Operands.assign(MainOp->getNumOperands(),
+ BoUpSLP::ValueList(VL.size(), nullptr));
+ for (auto [Idx, V] : enumerate(VL)) {
+ SmallVector<Value *> OperandsForValue = getOperands(S, V);
+ for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
+ Operands[OperandIdx][Idx] = Operand;
+ }
+ } else {
+ buildOriginalOperands(S, VL, Operands);
+ }
return Operands;
}
};
} // namespace
-BoUpSLP::ScalarsVectorizationLegality
-BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const {
+BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
+ ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ VL, *this, TryCopyableElementsVectorization,
+ /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -10066,7 +10450,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
- bool AreAllSameBlock = S && allSameBlock(VL);
+ bool AreAllSameBlock = S.valid();
bool AreScatterAllGEPSameBlock =
(IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
VL.size() > 2 &&
@@ -10091,12 +10475,18 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
NotProfitableForVectorization(VL)) {
if (!S) {
LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
- "C,S,B,O, small shuffle. \n");
+ "C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
/*TryToFindDuplicates=*/true,
/*TrySplitVectorize=*/true);
}
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
}
@@ -10242,9 +10632,29 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return true;
};
- ScalarsVectorizationLegality Legality =
- getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
- const InstructionsState &S = Legality.getInstructionsState();
+ auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
+ bool AreConsts = false;
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
+ if (isa<Constant>(V)) {
+ AreConsts = true;
+ continue;
+ }
+ if (!isa<PHINode>(V))
+ return false;
+ }
+ return AreConsts;
+ };
+ if (AreOnlyConstsWithPHIs(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
+ newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
+ return;
+ }
+
+ ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
if (Legality.trySplitVectorize()) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
@@ -10252,11 +10662,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
return;
}
- if (Legality.tryToFindDuplicates())
- tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx);
+ if (!S)
+ Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ if (!Legality.isLegal()) {
+ if (Legality.tryToFindDuplicates())
+ tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
+ UserTreeIdx);
- newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
- return;
+ newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+ return;
+ }
+ S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
@@ -13024,7 +13441,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
(E->getOpcode() == Instruction::GetElementPtr &&
- E->getMainOp()->getType()->isPointerTy())) &&
+ E->getMainOp()->getType()->isPointerTy()) ||
+ E->hasCopyableElements()) &&
"Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -13036,6 +13454,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallBitVector UsedScalars(Sz, false);
for (unsigned I = 0; I < Sz; ++I) {
if (isa<Instruction>(UniqueValues[I]) &&
+ !E->isCopyableElement(UniqueValues[I]) &&
getTreeEntries(UniqueValues[I]).front() == E)
continue;
UsedScalars.set(I);
@@ -14075,15 +14494,45 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// If the tree contains only phis, buildvectors, split nodes and
// small nodes with reuses, we can skip it.
+ SmallVector<const TreeEntry *> StoreLoadNodes;
+ unsigned NumGathers = 0;
+ constexpr int LimitTreeSize = 36;
if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
- all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::SplitVectorize ||
- (TE->isGather() &&
- none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
- (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
- (!TE->ReuseShuffleIndices.empty() &&
- TE->Scalars.size() == 2)));
- }))
+ all_of(VectorizableTree,
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ if (!TE->isGather() && TE->hasState() &&
+ (TE->getOpcode() == Instruction::Load ||
+ TE->getOpcode() == Instruction::Store)) {
+ StoreLoadNodes.push_back(TE.get());
+ return true;
+ }
+ if (TE->isGather())
+ ++NumGathers;
+ return TE->State == TreeEntry::SplitVectorize ||
+ (TE->Idx == 0 && TE->Scalars.size() == 2 &&
+ TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
+ VectorizableTree.size() > LimitTreeSize) ||
+ (TE->isGather() &&
+ none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
+ (TE->hasState() &&
+ (TE->getOpcode() == Instruction::PHI ||
+ (TE->hasCopyableElements() &&
+ static_cast<unsigned>(count_if(
+ TE->Scalars, IsaPred<PHINode, Constant>)) >=
+ TE->Scalars.size() / 2) ||
+ ((!TE->ReuseShuffleIndices.empty() ||
+ !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
+ TE->Scalars.size() == 2)));
+ }) &&
+ (StoreLoadNodes.empty() ||
+ (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
+ (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
+ return TE->getOpcode() == Instruction::Store ||
+ all_of(TE->Scalars, [&](Value *V) {
+ return !isa<LoadInst>(V) ||
+ areAllUsersVectorized(cast<Instruction>(V));
+ });
+ })))))
return true;
// We can vectorize the tree if its size is greater than or equal to the
@@ -14826,6 +15275,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
bool IsProfitablePHIUser =
(KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
VectorizableTree.front()->Scalars.size() > 2)) &&
+ VectorizableTree.front()->hasState() &&
VectorizableTree.front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
none_of(Inst->users(),
@@ -15276,7 +15726,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
// blocks.
- if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp());
+ if (auto *PHI = dyn_cast_or_null<PHINode>(
+ TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
TEInsertPt = TEInsertBlock->getTerminator();
@@ -15375,7 +15826,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
"Expected only single user of a gather node.");
const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
- PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
+ PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
+ UseEI.UserTE->hasState())
? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
: nullptr;
Instruction *InsertPt =
@@ -15388,7 +15840,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
TEUseEI.UserTE->isAltShuffle()) &&
all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
if (UseEI.UserTE->State != TreeEntry::Vectorize ||
- (UseEI.UserTE->getOpcode() == Instruction::PHI &&
+ (UseEI.UserTE->hasState() &&
+ UseEI.UserTE->getOpcode() == Instruction::PHI &&
!UseEI.UserTE->isAltShuffle()) ||
!all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
continue;
@@ -16009,25 +16462,32 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
Instruction *Res = nullptr;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
- // constant indices or gathered loads).
- auto *Front = E->getMainOp();
+ // constant indices or gathered loads or copyables).
+ Instruction *Front;
+ unsigned Opcode;
+ if (E->hasState()) {
+ Front = E->getMainOp();
+ Opcode = E->getOpcode();
+ } else {
+ Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
+ Opcode = Front->getOpcode();
+ }
auto *BB = Front->getParent();
- assert(((GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
- E->Idx < *GatheredLoadsEntriesFirst) ||
- E->State == TreeEntry::SplitVectorize ||
- all_of(E->Scalars,
- [=](Value *V) -> bool {
- if (E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(V))
- return true;
- auto *I = dyn_cast<Instruction>(V);
- return !I || !E->getMatchingMainOpOrAltOp(I) ||
- I->getParent() == BB ||
- isVectorLikeInstWithConstOps(I);
- })) &&
- "Expected gathered loads or GEPs or instructions from same basic "
- "block.");
+ assert(
+ ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
+ E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
+ E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
+ all_of(E->Scalars,
+ [=](Value *V) -> bool {
+ if (Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || !E->getMatchingMainOpOrAltOp(I) ||
+ I->getParent() == BB || isVectorLikeInstWithConstOps(I);
+ })) &&
+ "Expected gathered loads or GEPs or instructions from same basic "
+ "block.");
auto FindLastInst = [&]() {
Instruction *LastInst = Front;
@@ -16035,18 +16495,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (LastInst->getParent() == I->getParent()) {
if (LastInst->comesBefore(I))
LastInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ assert(((Opcode == Instruction::GetElementPtr &&
!isa<GetElementPtrInst>(I)) ||
E->State == TreeEntry::SplitVectorize ||
(isVectorLikeInstWithConstOps(LastInst) &&
isVectorLikeInstWithConstOps(I)) ||
(GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
+ Opcode == Instruction::Load && E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst)) &&
"Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
@@ -16075,16 +16537,18 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (FirstInst->getParent() == I->getParent()) {
if (I->comesBefore(FirstInst))
FirstInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(I)) ||
- (isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I))) &&
- "Expected vector-like or non-GEP in GEP node insts only.");
+ assert(((Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -16122,7 +16586,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set insertpoint for gathered loads to the very first load.
if (GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
- E->getOpcode() == Instruction::Load) {
+ Opcode == Instruction::Load) {
Res = FindFirstInst();
EntryToLastInstruction.try_emplace(E, Res);
return *Res;
@@ -16139,7 +16603,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return nullptr;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
- if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+ if (!I || isa<PHINode>(I) ||
+ (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
continue;
ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
if (Bundles.empty())
@@ -16153,13 +16618,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
};
const ScheduleBundle *Bundle = FindScheduleBundle(E);
if (!E->isGather() && !Bundle) {
- if ((E->getOpcode() == Instruction::GetElementPtr &&
+ if ((Opcode == Instruction::GetElementPtr &&
any_of(E->Scalars,
[](Value *V) {
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
})) ||
- all_of(E->Scalars, [](Value *V) {
- return isa<PoisonValue>(V) ||
+ all_of(E->Scalars, [&](Value *V) {
+ return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
(!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
}))
Res = FindLastInst();
@@ -18640,6 +19105,7 @@ Value *BoUpSLP::vectorizeTree(
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
(TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
TE->UserTreeIndex.UserTE->isAltShuffle()) &&
+ !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
all_of(TE->UserTreeIndex.UserTE->Scalars,
[](Value *V) { return isUsedOutsideBlock(V); })) {
Instruction &LastInst =
@@ -18903,7 +19369,7 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert(
(ExternallyUsedValues.count(Scalar) ||
- Scalar->hasNUsesOrMore(UsesLimit) ||
+ ExternalUsesWithNonUsers.count(Scalar) ||
ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(
Scalar->users(),
@@ -19182,7 +19648,7 @@ Value *BoUpSLP::vectorizeTree(
if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
EE && IgnoredExtracts.contains(EE))
continue;
- if (isa<PoisonValue>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
@@ -19424,12 +19890,15 @@ void BoUpSLP::optimizeGatherSequence() {
}
BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
auto &BundlePtr =
ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
continue;
+ if (S.isCopyableElement(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember && "no ScheduleData for bundle member "
"(maybe not in same basic block)");
@@ -19450,10 +19919,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
+ bool HasCopyables = S.areInstructionsWithCopyableElements();
if (isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!HasCopyables && doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
return nullptr;
+ // TODO: Remove once full support for copyables is landed.
+ assert(all_of(VL,
+ [&](Value *V) {
+ return !S.isCopyableElement(V) || S.isNonSchedulable(V);
+ }) &&
+ "Copyable elements should not be schedulable");
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
@@ -19499,7 +19977,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
@@ -19516,7 +19994,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
@@ -19541,7 +20019,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- ScheduleBundle &Bundle = buildBundle(VL);
+ ScheduleBundle &Bundle = buildBundle(VL, S);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle.isReady()) {
for (ScheduleData *BD : Bundle.getBundle()) {
@@ -19558,7 +20036,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
ScheduledBundlesList.pop_back();
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
}
@@ -20187,7 +20665,7 @@ bool BoUpSLP::collectValuesToDemote(
};
if (E.isGather() || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
- return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
+ return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !isVectorized(U);
});
}))
@@ -20555,9 +21033,10 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
SelectInst>(U) ||
isa<SIToFPInst, UIToFPInst>(U) ||
- !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
- SelectInst>(UserTE->getMainOp()) ||
- isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))
+ (UserTE->hasState() &&
+ (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
+ SelectInst>(UserTE->getMainOp()) ||
+ isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
return true;
unsigned UserTESz = DL->getTypeSizeInBits(
UserTE->Scalars.front()->getType());
@@ -20653,7 +21132,12 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!IsKnownPositive)
++BitWidth1;
- APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ auto *I = dyn_cast<Instruction>(Root);
+ if (!I) {
+ MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
+ continue;
+ }
+ APInt Mask = DB->getDemandedBits(I);
unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
MaxBitWidth =
std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
@@ -20802,6 +21286,7 @@ void BoUpSLP::computeMinimumValueSizes() {
NodeIdx < VectorizableTree.size() &&
VectorizableTree[NodeIdx]->UserTreeIndex &&
VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
Instruction::Trunc &&
!VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
@@ -20982,7 +21467,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
for (Value *V : Chain)
ValOps.insert(cast<StoreInst>(V)->getValueOperand());
// Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
- InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
bool IsAllowedSize =
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index 07fdc9d8dd2fa..7408ba10cc772 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -4,9 +4,6 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
@@ -17,8 +14,7 @@ define void @test() {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: ret void
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 514d5f974cb16..7a1cf7b573a99 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -6,15 +6,13 @@ define i64 @foo(i32 %tmp7) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP8:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP0]], <i32 0, i32 0, i32 poison, i32 0, i32 0, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0>, <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 poison, i32 poison, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
-; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP13]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0>, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0>, [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP8]]
; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
; CHECK-NEXT: ret i64 [[TMP64]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
index 15ba98f90f0b8..5e3d4715e99c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
@@ -7,17 +7,10 @@ define i32 @test() {
; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]]
; CHECK: [[FUNC_135_EXIT_I]]:
; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 23, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <12 x i32> [[TMP3]], <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP8]], <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 23, i32 24, i32 25, i32 26, i32 2, i32 2, i32 2, i32 2, i32 3>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
index 1c482e079bb0f..03d76ef571d64 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
@@ -4,11 +4,10 @@
define i64 @test() {
; CHECK-LABEL: define i64 @test() {
; CHECK-NEXT: [[BB:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: br label %[[BB5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
index 652abef14771d..6bb52e0fc43b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
@@ -7,19 +7,17 @@ define void @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[IF_THEN_I_I:.*]]:
-; CHECK-NEXT: br label %[[BB5:.*]]
+; CHECK-NEXT: br label %[[BB3:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <4 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]]
-; CHECK: [[BB5]]:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
; CHECK-NEXT: br label %[[BB2]]
; CHECK: [[BB2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ]
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index a4949bc67b0f1..782aada17acac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu"
define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[VECINIT51]]
;
%vecinit = insertelement <4 x i32> undef, i32 %f, i32 0
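The pr47642 change is the most direct illustration of the new modeling: the scalar code builds <f, f+1, f+2, f+3>, and lane 0 has no add at all. Treating %f as a copyable element, i.e. as if it were add i32 %f, 0, makes all four lanes isomorphic, so the buildvector collapses to one splat and one vector add. Roughly (a hand-written equivalent, not taken from the test):

define <4 x i32> @copyable_sketch(i32 %f) {
  ; splat %f across all four lanes
  %ins = insertelement <4 x i32> poison, i32 %f, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ; lane 0 uses the identity constant 0, standing in for the add
  ; that is missing in the scalar code
  %v = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %v
}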
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index 8f6a53c03ac68..f7811aba5ab5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -6,14 +6,6 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
-; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
-; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
@@ -28,8 +20,7 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
-; CHECK-NEXT: [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
-; CHECK-NEXT: ret <4 x i16> [[OP_RDX11]]
+; CHECK-NEXT: ret <4 x i16> [[OP_RDX9]]
;
entry:
%subi = add <4 x i16> zeroinitializer, zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll
new file mode 100644
index 0000000000000..237f308e55e42
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@g = global [128 x i8] zeroinitializer, align 16
+
+define i64 @test() {
+; CHECK-LABEL: define i64 @test() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @g, align 8
+; CHECK-NEXT: br label %[[FUNC_154_EXIT_FUNC_146_EXIT_CRIT_EDGE_I:.*]]
+; CHECK: [[FUNC_154_EXIT_FUNC_146_EXIT_CRIT_EDGE_I]]:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 80), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 88), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 32), align 16
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @g, align 16
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 8), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @g, align 16
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 24), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], [[TMP0]]
+; CHECK-NEXT: ret i64 [[TMP14]]
+;
+entry:
+ %0 = load i64, ptr @g, align 8
+ br label %func_154.exit.func_146.exit_crit_edge.i
+
+func_154.exit.func_146.exit_crit_edge.i:
+ %1 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 80), align 16
+ %2 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 88), align 8
+ %3 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 32), align 16
+ %4 = load i64, ptr @g, align 16
+ %5 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 8), align 8
+ %6 = load i64, ptr @g, align 16
+ %7 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 24), align 8
+ %8 = xor i64 %1, %2
+ %9 = xor i64 %8, %3
+ %10 = xor i64 %9, %4
+ %11 = xor i64 %10, %5
+ %12 = xor i64 %11, %6
+ %13 = xor i64 %12, %7
+ %14 = xor i64 %13, %0
+ ret i64 %14
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index ad4daeab003f5..125c2dce32663 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead)
define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) {
; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate(
; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5
-; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0
-; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], <i8 0, i8 5>
; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]]
; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0