[llvm] [SLP]Initial support for copyable elements (PR #147366)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 17 08:45:18 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/147366
>From 94d13aa34f611d3b584dd40b77eb363f1c6e922d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 7 Jul 2025 18:20:05 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
=?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 1496 +++++++++++++++--
.../X86/buildvector-schedule-for-subvector.ll | 8 +-
.../X86/cast-operand-extracted.ll | 8 +-
.../X86/full-match-with-poison-scalar.ll | 14 +-
.../X86/node-outside-used-only.ll | 3 +-
...dulable-instructions-become-schedulable.ll | 16 +-
.../Transforms/SLPVectorizer/X86/pr35497.ll | 20 +-
.../Transforms/SLPVectorizer/X86/pr47642.ll | 11 +-
.../X86/shuffle-mask-emission.ll | 4 +-
.../SLPVectorizer/alternate-non-profitable.ll | 6 +-
llvm/test/Transforms/SLPVectorizer/revec.ll | 12 +-
11 files changed, 1369 insertions(+), 229 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..c5a6de40fa72b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -206,6 +206,11 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+static cl::opt<bool> VectorizeCopyableElements(
+ "slp-copyable-elements", cl::init(true), cl::Hidden,
+ cl::desc("Try to replace values with the idempotent instructions for "
+ "better vectorization."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -519,17 +524,17 @@ static bool isSplat(ArrayRef<Value *> VL) {
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
-/// \param InstWithUses The instruction whose uses are analyzed for special
+/// \param ValWithUses The value whose uses are analyzed for special
/// patterns
-static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
+static bool isCommutative(Instruction *I, Value *ValWithUses) {
if (auto *Cmp = dyn_cast<CmpInst>(I))
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative() ||
(BO->getOpcode() == Instruction::Sub &&
- !InstWithUses->hasNUsesOrMore(UsesLimit) &&
+ !ValWithUses->hasNUsesOrMore(UsesLimit) &&
all_of(
- InstWithUses->uses(),
+ ValWithUses->uses(),
[](const Use &U) {
// Commutative, if icmp eq/ne sub, 0
CmpPredicate Pred;
@@ -546,8 +551,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
Flag->isOne());
})) ||
(BO->getOpcode() == Instruction::FSub &&
- !InstWithUses->hasNUsesOrMore(UsesLimit) &&
- all_of(InstWithUses->uses(), [](const Use &U) {
+ !ValWithUses->hasNUsesOrMore(UsesLimit) &&
+ all_of(ValWithUses->uses(), [](const Use &U) {
return match(U.getUser(),
m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
}));
@@ -564,6 +569,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
/// \returns true if the instruction is commutative, false otherwise
static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
+/// \returns number of operands of \p I, considering commutativity. Returns 2
+/// for commutative intrinsics.
+/// \param I The instruction to check for commutativity
+static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
+ if (isa<IntrinsicInst>(I) && isCommutative(I)) {
+ // IntrinsicInst::isCommutative returns true if swapping the first "two"
+ // arguments to the intrinsic produces the same result.
+ constexpr unsigned IntrinsicNumOperands = 2;
+ return IntrinsicNumOperands;
+ }
+ return I->getNumOperands();
+}
+
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
unsigned Offset) {
@@ -855,6 +873,23 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
return *EI->idx_begin();
}
+namespace llvm {
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all operands are either not instructions
+/// or phi nodes or instructions from different blocks.
+static bool areAllOperandsNonInsts(Value *V);
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all users are phi nodes or instructions
+/// from the different blocks.
+static bool isUsedOutsideBlock(Value *V);
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V);
+} // namespace llvm
+
namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
@@ -957,6 +992,31 @@ class BinOpSameOpcodeHelper {
return Instruction::Xor;
llvm_unreachable("Cannot find interchangeable instruction.");
}
+ /// Return true if the \p Opcode is a candidate for interchange.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ MaskType Candidate = Mask & SeenBefore;
+ switch (Opcode) {
+ case Instruction::Shl:
+ return Candidate & ShlBIT;
+ case Instruction::AShr:
+ return Candidate & AShrBIT;
+ case Instruction::Mul:
+ return Candidate & MulBIT;
+ case Instruction::Add:
+ return Candidate & AddBIT;
+ case Instruction::Sub:
+ return Candidate & SubBIT;
+ case Instruction::And:
+ return Candidate & AndBIT;
+ case Instruction::Or:
+ return Candidate & OrBIT;
+ case Instruction::Xor:
+ return Candidate & XorBIT;
+ default:
+ break;
+ }
+ llvm_unreachable("Cannot find interchangeable instruction.");
+ }
SmallVector<Value *> getOperand(const Instruction *To) const {
unsigned ToOpcode = To->getOpcode();
unsigned FromOpcode = I->getOpcode();
@@ -1117,6 +1177,10 @@ class BinOpSameOpcodeHelper {
AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
}
unsigned getMainOpcode() const { return MainOp.getOpcode(); }
+ /// Return true if the \p Opcode is a candidate for interchange.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ return MainOp.hasCandidateOpcode(Opcode);
+ }
bool hasAltOp() const { return AltOp.I; }
unsigned getAltOpcode() const {
return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
@@ -1152,6 +1216,8 @@ class InstructionsState {
/// GetVectorCost.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+ /// Whether the instruction state represents copyable instructions.
+ bool HasCopyables = false;
public:
Instruction *getMainOp() const {
@@ -1190,9 +1256,11 @@ class InstructionsState {
if (!I->isBinaryOp())
return nullptr;
BinOpSameOpcodeHelper Converter(MainOp);
- if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
- return MainOp;
- return AltOp;
+ if (!Converter.add(I) || !Converter.add(MainOp))
+ return nullptr;
+ if (Converter.hasAltOp() && !isAltShuffle())
+ return nullptr;
+ return Converter.hasAltOp() ? AltOp : MainOp;
}
/// Checks if main/alt instructions are shift operations.
@@ -1237,9 +1305,67 @@ class InstructionsState {
explicit operator bool() const { return valid(); }
InstructionsState() = delete;
- InstructionsState(Instruction *MainOp, Instruction *AltOp)
- : MainOp(MainOp), AltOp(AltOp) {}
+ InstructionsState(Instruction *MainOp, Instruction *AltOp,
+ bool HasCopyables = false)
+ : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+
+ /// Checks if the value is a copyable element.
+ bool isCopyableElement(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (!HasCopyables)
+ return false;
+ if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return !isa<PoisonValue>(V);
+ if (I->getParent() != MainOp->getParent() &&
+ (!isVectorLikeInstWithConstOps(I) ||
+ !isVectorLikeInstWithConstOps(MainOp)))
+ return true;
+ if (I->getOpcode() == MainOp->getOpcode())
+ return false;
+ if (!I->isBinaryOp())
+ return true;
+ BinOpSameOpcodeHelper Converter(MainOp);
+ return !Converter.add(I) || !Converter.add(MainOp) ||
+ Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
+ }
+
+ /// Checks if the value is non-schedulable.
+ bool isNonSchedulable(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ auto *I = dyn_cast<Instruction>(V);
+ if (!HasCopyables)
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ // MainOp for copyables is always schedulable to correctly identify
+ // non-schedulable copyables.
+ if (getMainOp() == V)
+ return false;
+ if (isCopyableElement(V)) {
+ auto IsNonSchedulableCopyableElement = [this](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
+ (doesNotNeedToBeScheduled(I) &&
+ // If the copyable instruction comes after MainOp
+ // (non-schedulable, but used in the block) - cannot vectorize
+ // it, will possibly generate use before def.
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I)));
+ };
+
+ return IsNonSchedulableCopyableElement(V);
+ }
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ }
+
+ /// Checks if the state represents copyable instructions.
+ bool areInstructionsWithCopyableElements() const {
+ assert(valid() && "InstructionsState is invalid.");
+ return HasCopyables;
+ }
};
std::pair<Instruction *, SmallVector<Value *>>
@@ -1767,6 +1893,7 @@ class BoUpSLP {
class TreeEntry;
class ScheduleEntity;
class ScheduleData;
+ class ScheduleCopyableData;
class ScheduleBundle;
class ShuffleCostEstimator;
class ShuffleInstructionBuilder;
@@ -2126,6 +2253,7 @@ class BoUpSLP {
operator bool() const { return UserTE != nullptr; }
};
+ friend struct DenseMapInfo<EdgeInfo>;
/// A helper class used for scoring candidates for two consecutive lanes.
class LookAheadHeuristics {
@@ -2890,18 +3018,14 @@ class BoUpSLP {
assert(S.valid() && "InstructionsState is invalid.");
// IntrinsicInst::isCommutative returns true if swapping the first "two"
// arguments to the intrinsic produces the same result.
- constexpr unsigned IntrinsicNumOperands = 2;
Instruction *MainOp = S.getMainOp();
unsigned NumOperands = MainOp->getNumOperands();
- ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
+ ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp);
OpsVec.resize(ArgSize);
unsigned NumLanes = VL.size();
for (OperandDataVec &Ops : OpsVec)
Ops.resize(NumLanes);
for (unsigned Lane : seq<unsigned>(NumLanes)) {
- Value *V = VL[Lane];
- assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
- "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2912,17 +3036,24 @@ class BoUpSLP {
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely tell
// the inverse operations by checking commutativity.
- if (isa<PoisonValue>(V)) {
+ auto *I = dyn_cast<Instruction>(VL[Lane]);
+ if (!I && isa<PoisonValue>(VL[Lane])) {
for (unsigned OpIdx : seq<unsigned>(NumOperands))
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
continue;
}
- auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
- // We cannot check commutativity by the converted instruction
- // (SelectedOp) because isCommutative also examines def-use
- // relationships.
- bool IsInverseOperation =
- !isCommutative(SelectedOp, cast<Instruction>(V));
+ bool IsInverseOperation = false;
+ if (S.isCopyableElement(VL[Lane])) {
+ // The value is a copyable element.
+ IsInverseOperation = !isCommutative(MainOp, VL[Lane]);
+ } else {
+ assert(I && "Expected instruction");
+ auto [SelectedOp, Ops] = convertTo(I, S);
+ // We cannot check commutativity by the converted instruction
+ // (SelectedOp) because isCommutative also examines def-use
+ // relationships.
+ IsInverseOperation = !isCommutative(SelectedOp, I);
+ }
for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
@@ -3792,6 +3923,9 @@ class BoUpSLP {
/// reordering of operands during buildTreeRec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
+ /// Copyable elements of the entry node.
+ SmallPtrSet<const Value *, 4> CopyableElements;
+
/// MainOp and AltOp are recorded inside. S should be obtained from
/// newTreeEntry.
InstructionsState S = InstructionsState::invalid();
@@ -3820,11 +3954,7 @@ class BoUpSLP {
void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
/// Marks the node as one that does not require scheduling.
- void setDoesNotNeedToSchedule() {
- assert(::doesNotNeedToSchedule(Scalars) &&
- "Expected to not need scheduling");
- DoesNotNeedToSchedule = true;
- }
+ void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
/// Returns true if the node is marked as one that does not require
/// scheduling.
bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
@@ -3896,6 +4026,20 @@ class BoUpSLP {
bool hasState() const { return S.valid(); }
+ /// Add \p V to the list of copyable elements.
+ void addCopyableElement(Value *V) {
+ assert(S.isCopyableElement(V) && "Not a copyable element.");
+ CopyableElements.insert(V);
+ }
+
+ /// Returns true if \p V is a copyable element.
+ bool isCopyableElement(Value *V) const {
+ return CopyableElements.contains(V);
+ }
+
+ /// Returns true if any scalar in the list is a copyable element.
+ bool hasCopyableElements() const { return !CopyableElements.empty(); }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
int findLaneForValue(Value *V) const {
@@ -3968,6 +4112,8 @@ class BoUpSLP {
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "State: ";
+ if (S && hasCopyableElements())
+ dbgs() << "[[Copyable]] ";
switch (State) {
case Vectorize:
if (InterleaveFactor > 0) {
@@ -4145,12 +4291,20 @@ class BoUpSLP {
}
}
} else if (!Last->isGather()) {
- if (doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!S.areInstructionsWithCopyableElements() &&
+ doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
Last->setDoesNotNeedToSchedule();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
if (isa<PoisonValue>(V))
continue;
+ if (S.isCopyableElement(V)) {
+ Last->addCopyableElement(V);
+ continue;
+ }
auto It = ScalarToTreeEntries.find(V);
if (It == ScalarToTreeEntries.end()) {
ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
@@ -4162,16 +4316,14 @@ class BoUpSLP {
}
}
// Update the scheduler bundle to point to this TreeEntry.
- assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) ||
- Last->doesNotNeedToSchedule()) &&
+ assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
"Bundle and VL out of sync");
if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
auto *BundleMember = Bundle.getBundle().begin();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
+ if (S.isNonSchedulable(V) || !Processed.insert(V).second)
continue;
++BundleMember;
}
@@ -4280,7 +4432,8 @@ class BoUpSLP {
/// in general.
ScalarsVectorizationLegality
getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const;
+ const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const;
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
@@ -4433,16 +4586,18 @@ class BoUpSLP {
/// List of hashes of vector of loads, which are known to be non vectorizable.
DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
- /// Represents a scheduling entity, either ScheduleData or ScheduleBundle.
- /// ScheduleData used to gather dependecies for a single instructions, while
- /// ScheduleBundle represents a batch of instructions, going to be groupped
- /// together.
+ /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
+ /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single
+ /// instruction, while ScheduleBundle represents a batch of instructions,
+ /// going to be grouped together. ScheduleCopyableData models an extra user for
+ /// "copyable" instructions.
class ScheduleEntity {
friend class ScheduleBundle;
friend class ScheduleData;
+ friend class ScheduleCopyableData;
protected:
- enum class Kind { ScheduleData, ScheduleBundle };
+ enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
Kind getKind() const { return K; }
ScheduleEntity(Kind K) : K(K) {}
@@ -4461,17 +4616,79 @@ class BoUpSLP {
void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
int getSchedulingPriority() const { return SchedulingPriority; }
bool isReady() const {
- if (auto *SD = dyn_cast<ScheduleData>(this))
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
return SD->isReady();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->isReady();
return cast<ScheduleBundle>(this)->isReady();
}
+ /// Returns true if the dependency information has been calculated.
+ /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
+ bool hasValidDependencies() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->hasValidDependencies();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->hasValidDependencies();
+ return cast<ScheduleBundle>(this)->hasValidDependencies();
+ }
+ /// Gets the number of unscheduled dependencies.
+ int getUnscheduledDeps() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getUnscheduledDeps();
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->getUnscheduledDeps();
+ return cast<ScheduleBundle>(this)->unscheduledDepsInBundle();
+ }
+ /// Increments the number of unscheduled dependencies.
+ int incrementUnscheduledDeps(int Incr) {
+ if (auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->incrementUnscheduledDeps(Incr);
+ return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
+ }
+ /// Gets the number of dependencies.
+ int getDependencies() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getDependencies();
+ return cast<ScheduleCopyableData>(this)->getDependencies();
+ }
+ /// Gets the instruction.
+ Instruction *getInst() const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->getInst();
+ return cast<ScheduleCopyableData>(this)->getInst();
+ }
+
/// Gets/sets if the bundle is scheduled.
bool isScheduled() const { return IsScheduled; }
void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
static bool classof(const ScheduleEntity *) { return true; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(raw_ostream &OS) const {
+ if (const auto *SD = dyn_cast<ScheduleData>(this))
+ return SD->dump(OS);
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
+ return CD->dump(OS);
+ return cast<ScheduleBundle>(this)->dump(OS);
+ }
+
+ LLVM_DUMP_METHOD void dump() const {
+ dump(dbgs());
+ dbgs() << '\n';
+ }
+#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::ScheduleEntity &SE) {
+ SE.dump(OS);
+ return OS;
+ }
+#endif
+
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
@@ -4534,10 +4751,18 @@ class BoUpSLP {
/// Clears all dependency information.
void clearDependencies() {
- Dependencies = InvalidDeps;
- resetUnscheduledDeps();
+ clearDirectDependencies();
MemoryDependencies.clear();
ControlDependencies.clear();
+ }
+
+ /// Clears all direct dependencies only, except for control and memory
+ /// dependencies.
+ /// Required for copyable elements to correctly handle control/memory deps
+ /// and avoid extra recalculation of such deps.
+ void clearDirectDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
IsScheduled = false;
}
@@ -4627,7 +4852,7 @@ class BoUpSLP {
class ScheduleBundle final : public ScheduleEntity {
/// The schedule data for the instructions in the bundle.
- SmallVector<ScheduleData *> Bundle;
+ SmallVector<ScheduleEntity *> Bundle;
/// True if this bundle is valid.
bool IsValid = true;
/// The TreeEntry that this instruction corresponds to.
@@ -4643,7 +4868,7 @@ class BoUpSLP {
/// Verify basic self consistency properties
void verify() const {
- for (const ScheduleData *SD : Bundle) {
+ for (const ScheduleEntity *SD : Bundle) {
if (SD->hasValidDependencies()) {
assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
"invariant");
@@ -4663,7 +4888,7 @@ class BoUpSLP {
int unscheduledDepsInBundle() const {
assert(*this && "bundle must not be empty");
int Sum = 0;
- for (const ScheduleData *BundleMember : Bundle) {
+ for (const ScheduleEntity *BundleMember : Bundle) {
if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
return ScheduleData::InvalidDeps;
Sum += BundleMember->getUnscheduledDeps();
@@ -4675,7 +4900,7 @@ class BoUpSLP {
/// Note that depenendency validity can vary between instructions within
/// a single bundle.
bool hasValidDependencies() const {
- return all_of(Bundle, [](const ScheduleData *SD) {
+ return all_of(Bundle, [](const ScheduleEntity *SD) {
return SD->hasValidDependencies();
});
}
@@ -4689,10 +4914,10 @@ class BoUpSLP {
/// Returns the bundle of scheduling data, associated with the current
/// instruction.
- ArrayRef<ScheduleData *> getBundle() { return Bundle; }
- ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
+ ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
+ ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
/// Adds an instruction to the bundle.
- void add(ScheduleData *SD) { Bundle.push_back(SD); }
+ void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
/// Gets/sets the associated tree entry.
void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
@@ -4709,8 +4934,11 @@ class BoUpSLP {
return;
}
OS << '[';
- interleaveComma(Bundle, OS,
- [&](const ScheduleData *SD) { OS << *SD->getInst(); });
+ interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
+ if (isa<ScheduleCopyableData>(SD))
+ OS << "<Copyable>";
+ OS << *SD->getInst();
+ });
OS << ']';
}
@@ -4729,6 +4957,131 @@ class BoUpSLP {
}
#endif
+ /// Contains all scheduling relevant data for the copyable instruction.
+ /// It models the virtual instructions, supposed to replace the original
+ /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
+ /// %1], where %1 = add, then the ScheduleCopyableData models virtual
+ /// instruction %virt = add %0, 0.
+ class ScheduleCopyableData final : public ScheduleEntity {
+ /// The source schedule data for the instruction.
+ Instruction *Inst = nullptr;
+ /// The edge information for the instruction.
+ const EdgeInfo EI;
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID = 0;
+ /// Bundle, this data is part of.
+ ScheduleBundle &Bundle;
+
+ public:
+ ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
+ const EdgeInfo &EI, ScheduleBundle &Bundle)
+ : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
+ SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
+ static bool classof(const ScheduleEntity *Entity) {
+ return Entity->getKind() == Kind::ScheduleCopyableData;
+ }
+
+ /// Verify basic self consistency properties
+ void verify() {
+ if (hasValidDependencies()) {
+ assert(UnscheduledDeps <= Dependencies && "invariant");
+ } else {
+ assert(UnscheduledDeps == Dependencies && "invariant");
+ }
+
+ if (IsScheduled) {
+ assert(hasValidDependencies() && UnscheduledDeps == 0 &&
+ "unexpected scheduled state");
+ }
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ /// Note that dependency validity can vary between instructions within
+ /// a single bundle.
+ bool hasValidDependencies() const {
+ return Dependencies != ScheduleData::InvalidDeps;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
+
+ /// Modifies the number of unscheduled dependencies for this instruction,
+ /// and returns the number of remaining dependencies for the containing
+ /// bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ assert(hasValidDependencies() &&
+ "increment of unscheduled deps would be meaningless");
+ UnscheduledDeps += Incr;
+ assert(UnscheduledDeps >= 0 && "invariant");
+ return UnscheduledDeps;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
+
+ /// Gets the number of unscheduled dependencies.
+ int getUnscheduledDeps() const { return UnscheduledDeps; }
+ /// Gets the number of dependencies.
+ int getDependencies() const { return Dependencies; }
+ /// Initializes the number of dependencies.
+ void initDependencies() { Dependencies = 0; }
+ /// Increments the number of dependencies.
+ void incDependencies() { Dependencies++; }
+
+ /// Gets scheduling region ID.
+ int getSchedulingRegionID() const { return SchedulingRegionID; }
+
+ /// Gets the instruction.
+ Instruction *getInst() const { return Inst; }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = ScheduleData::InvalidDeps;
+ UnscheduledDeps = ScheduleData::InvalidDeps;
+ IsScheduled = false;
+ }
+
+ /// Gets the edge information.
+ const EdgeInfo &getEdgeInfo() const { return EI; }
+
+ /// Gets the bundle.
+ ScheduleBundle &getBundle() { return Bundle; }
+ const ScheduleBundle &getBundle() const { return Bundle; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(raw_ostream &OS) const {
+ OS << "[Copyable]" << *getInst();
+ }
+
+ LLVM_DUMP_METHOD void dump() const {
+ dump(dbgs());
+ dbgs() << '\n';
+ }
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+ private:
+ /// The number of dependencies; InvalidDeps until calculated. These nodes
+ /// always have only a single dependency.
+ int Dependencies = ScheduleData::InvalidDeps;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = ScheduleData::InvalidDeps;
+ };
+
+#ifndef NDEBUG
+ friend inline raw_ostream &
+ operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
+ SD.dump(OS);
+ return OS;
+ }
+#endif
+
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
@@ -4755,6 +5108,10 @@ class BoUpSLP {
void clear() {
ScheduledBundles.clear();
ScheduledBundlesList.clear();
+ ScheduleCopyableDataMap.clear();
+ ScheduleCopyableDataMapByInst.clear();
+ ScheduleCopyableDataMapByInstUser.clear();
+ ScheduleCopyableDataMapByUsers.clear();
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
@@ -4781,7 +5138,7 @@ class BoUpSLP {
// Avoid lookup if can't possibly be in map.
return nullptr;
ScheduleData *SD = ScheduleDataMap.lookup(I);
- if (SD && isInSchedulingRegion(SD))
+ if (SD && isInSchedulingRegion(*SD))
return SD;
return nullptr;
}
@@ -4790,6 +5147,180 @@ class BoUpSLP {
return getScheduleData(dyn_cast<Instruction>(V));
}
+ /// Returns the ScheduleCopyableData for the given edge (user tree entry and
+ /// operand number) and value.
+ ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
+ const Value *V) const {
+ auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
+ if (It == ScheduleCopyableDataMap.end())
+ return nullptr;
+ ScheduleCopyableData *SD = It->getSecond().get();
+ if (!isInSchedulingRegion(*SD))
+ return nullptr;
+ return SD;
+ }
+
+ /// Returns the ScheduleCopyableData for the given user \p User, operand
+ /// number and operand \p V.
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableData(const Value *User, unsigned OperandIdx,
+ const Value *V) {
+ const auto It = ScheduleCopyableDataMapByInstUser.find(
+ std::make_pair(std::make_pair(User, OperandIdx), V));
+ if (It == ScheduleCopyableDataMapByInstUser.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ /// Returns true if all operands of the given instruction \p User are
+ /// replaced by copyable data.
+ /// \param User The user instruction.
+ /// \param Op The operand, which might be replaced by the copyable data.
+ /// \param SLP The SLP tree.
+ /// \param NumOps The number of operands used. If the instruction uses the
+ /// same operand several times, check for the first use, then the second,
+ /// etc.
+ bool areAllOperandsReplacedByCopyableData(Instruction *User,
+ Instruction *Op, BoUpSLP &SLP,
+ unsigned NumOps) const {
+ assert(NumOps > 0 && "No operands");
+ SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
+ SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
+ for (const Use &U : User->operands()) {
+ if (U.get() != Op)
+ continue;
+ ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
+ if (Entries.empty())
+ return false;
+ // Check all tree entries, if they have operands replaced by copyable
+ // data.
+ for (TreeEntry *TE : SLP.getTreeEntries(User)) {
+ // Check if the user is commutative.
+ // The commutatives are handled later, as their operands can be
+ // reordered.
+ // Same applies even for non-commutative cmps, because we can invert
+ // their predicate potentially and, thus, reorder the operands.
+ bool IsCommutativeUser =
+ ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
+ EdgeInfo EI(TE, U.getOperandNo());
+ if (!IsCommutativeUser && !isa<CmpInst>(User)) {
+ unsigned &OpCnt =
+ OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
+ if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps)
+ return false;
+ // Found copyable operand - continue.
+ ++OpCnt;
+ continue;
+ }
+ ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
+ .first->getSecond();
+ }
+ }
+ // Check the commutative/cmp entries.
+ if (!PotentiallyReorderedEntriesCount.empty()) {
+ for (auto &P : PotentiallyReorderedEntriesCount) {
+ auto *It = find(P.first->Scalars, User);
+ assert(It != P.first->Scalars.end() &&
+ "User is not in the tree entry");
+ int Lane = std::distance(P.first->Scalars.begin(), It);
+ assert(Lane >= 0 && "Lane is not found");
+ if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty())
+ Lane = P.first->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
+ "Couldn't find extract lane");
+ SmallVector<unsigned> OpIndices;
+ for (unsigned OpIdx :
+ seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
+ P.first->getMainOp()))) {
+ if (P.first->getOperand(OpIdx)[Lane] == Op &&
+ getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
+ --P.getSecond();
+ }
+ }
+ return all_of(PotentiallyReorderedEntriesCount,
+ [&](const std::pair<const TreeEntry *, unsigned> &P) {
+ return P.second == NumOps - 1;
+ });
+ }
+ return true;
+ }
+
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableData(const Instruction *I) const {
+ const auto It = ScheduleCopyableDataMapByInst.find(I);
+ if (It == ScheduleCopyableDataMapByInst.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ SmallVector<ScheduleCopyableData *>
+ getScheduleCopyableDataUsers(const Instruction *User) const {
+ const auto It = ScheduleCopyableDataMapByUsers.find(User);
+ if (It == ScheduleCopyableDataMapByUsers.end())
+ return {};
+ SmallVector<ScheduleCopyableData *> Res;
+ for (ScheduleCopyableData *SD : It->getSecond()) {
+ if (isInSchedulingRegion(*SD))
+ Res.push_back(SD);
+ }
+ return Res;
+ }
+
+ ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
+ Instruction *I,
+ int SchedulingRegionID,
+ ScheduleBundle &Bundle) {
+ assert(!getScheduleCopyableData(EI, I) && "already in the map");
+ ScheduleCopyableData *CD =
+ ScheduleCopyableDataMap
+ .try_emplace(std::make_pair(EI, I),
+ std::make_unique<ScheduleCopyableData>(
+ SchedulingRegionID, I, EI, Bundle))
+ .first->getSecond()
+ .get();
+ ScheduleCopyableDataMapByInst[I].push_back(CD);
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, I);
+ assert(It != Op.end() && "Lane not set");
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ ScheduleCopyableDataMapByInstUser
+ .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
+ .first->getSecond()
+ .push_back(CD);
+ ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(CD);
+ // Remove extra deps for users, becoming non-immediate users of the
+ // instruction. It may happen, if the chain of same copyable elements
+ // appears in the tree.
+ if (In == I) {
+ EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
+ if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, In))
+ ScheduleCopyableDataMapByUsers[I].remove(UserCD);
+ }
+ It = find(make_range(std::next(It), Op.end()), I);
+ } while (It != Op.end());
+ }
+ return *CD;
+ }
+
ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
auto *I = dyn_cast<Instruction>(V);
if (!I)
@@ -4800,34 +5331,44 @@ class BoUpSLP {
return It->getSecond();
}
- bool isInSchedulingRegion(ScheduleData *SD) const {
- return SD->getSchedulingRegionID() == SchedulingRegionID;
- }
-
- bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
- return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
- return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
- });
+ /// Returns true if the entity is in the scheduling region.
+ bool isInSchedulingRegion(const ScheduleEntity &SD) const {
+ if (const auto *Data = dyn_cast<ScheduleData>(&SD))
+ return Data->getSchedulingRegionID() == SchedulingRegionID;
+ if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
+ return CD->getSchedulingRegionID() == SchedulingRegionID;
+ return all_of(cast<ScheduleBundle>(SD).getBundle(),
+ [&](const ScheduleEntity *BundleMember) {
+ return isInSchedulingRegion(*BundleMember);
+ });
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
- void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
- auto ProcessBundleMember = [&](ScheduleData *BundleMember,
- ScheduleBundle *Bundle) {
+ void schedule(const BoUpSLP &R, const InstructionsState &S,
+ const EdgeInfo &EI, ScheduleEntity *Data,
+ ReadyListType &ReadyList) {
+ auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
+ ArrayRef<ScheduleBundle *> Bundles) {
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
- auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
+ auto DecrUnsched = [&](ScheduleEntity *Data, bool IsControl = false) {
if ((IsControl || Data->hasValidDependencies()) &&
Data->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
- if (ArrayRef<ScheduleBundle *> Bundles =
- getScheduleBundles(Data->getInst());
- !Bundles.empty()) {
+ SmallVector<ScheduleBundle *, 1> CopyableBundle;
+ ArrayRef<ScheduleBundle *> Bundles;
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
+ CopyableBundle.push_back(&CD->getBundle());
+ Bundles = CopyableBundle;
+ } else {
+ Bundles = getScheduleBundles(Data->getInst());
+ }
+ if (!Bundles.empty()) {
for (ScheduleBundle *Bundle : Bundles) {
if (Bundle->unscheduledDepsInBundle() == 0) {
assert(!Bundle->isScheduled() &&
@@ -4841,12 +5382,21 @@ class BoUpSLP {
}
assert(!Data->isScheduled() &&
"already scheduled bundle gets ready");
+ assert(!isa<ScheduleCopyableData>(Data) &&
+ "Expected non-copyable data");
ReadyList.insert(Data);
LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
}
};
- auto DecrUnschedForInst = [&](Instruction *I) {
+ auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
+ Instruction *I) {
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ getScheduleCopyableData(User, OpIdx, I);
+ for (ScheduleCopyableData *CD : CopyableData)
+ DecrUnsched(CD, /*IsControl=*/false);
+ if (!CopyableData.empty())
+ return;
if (ScheduleData *OpSD = getScheduleData(I))
DecrUnsched(OpSD, /*IsControl=*/false);
};
@@ -4854,45 +5404,91 @@ class BoUpSLP {
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
- if (Bundle) {
- // Need to search for the lane since the tree entry can be reordered.
+ if (!Bundles.empty()) {
auto *In = BundleMember->getInst();
- int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
- find(Bundle->getTreeEntry()->Scalars, In));
- assert(Lane >= 0 && "Lane not set");
-
- // Since vectorization tree is being built recursively this assertion
- // ensures that the tree entry has all operands set before reaching
- // this code. Couple of exceptions known at the moment are extracts
- // where their second (immediate) operand is not added. Since
- // immediates do not affect scheduler behavior this is considered
- // okay.
- assert(In &&
- (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
- In->getNumOperands() ==
- Bundle->getTreeEntry()->getNumOperands()) &&
- "Missed TreeEntry operands?");
-
- for (unsigned OpIdx :
- seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
- if (auto *I = dyn_cast<Instruction>(
- Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
- LLVM_DEBUG(dbgs()
- << "SLP: check for readiness (def): " << *I << "\n");
- DecrUnschedForInst(I);
+ // Count uses of each instruction operand.
+ SmallDenseMap<const Instruction *, unsigned> OperandsUses;
+ if (isa<ScheduleCopyableData>(BundleMember)) {
+ // Copyable data is used only once (uses itself).
+ OperandsUses[In] = 1;
+ } else {
+ for (const Use &U : In->operands()) {
+ if (auto *I = dyn_cast<Instruction>(U.get()))
+ ++OperandsUses[I];
+ }
+ }
+ // Decrement the unscheduled counter and insert to ready list if ready.
+ auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
+ unsigned OpIdx) {
+ const EdgeInfo EI = {UserTE, OpIdx};
+ if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) {
+ DecrUnsched(CD, /*IsControl=*/false);
+ return;
+ }
+ if (ScheduleData *OpSD = getScheduleData(I)) {
+ auto It = OperandsUses.find(I);
+ assert(It != OperandsUses.end() && "Operand not found");
+ if (It->second > 0) {
+ DecrUnsched(OpSD, /*IsControl=*/false);
+ --It->getSecond();
+ }
}
+ };
+
+ for (ScheduleBundle *Bundle : Bundles) {
+ // Need to search for the lane since the tree entry can be
+ // reordered.
+ int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
+ find(Bundle->getTreeEntry()->Scalars, In));
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(In) &&
+ !Bundle->getTreeEntry()->ReorderIndices.empty())
+ Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(
+ Bundle->getTreeEntry()->Scalars.size()) &&
+ "Couldn't find extract lane");
+
+ // Since vectorization tree is being built recursively this
+ // assertion ensures that the tree entry has all operands set before
+ // reaching this code. Couple of exceptions known at the moment are
+ // extracts where their second (immediate) operand is not added.
+ // Since immediates do not affect scheduler behavior this is
+ // considered okay.
+ assert(In &&
+ (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
+ In->getNumOperands() ==
+ Bundle->getTreeEntry()->getNumOperands() ||
+ Bundle->getTreeEntry()->isCopyableElement(In)) &&
+ "Missed TreeEntry operands?");
+
+ for (unsigned OpIdx :
+ seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
+ if (auto *I = dyn_cast<Instruction>(
+ Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
+ LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I
+ << "\n");
+ DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx);
+ }
+ }
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
// has taken place, so we directly access its operands.
- for (Use &U : BundleMember->getInst()->operands())
+ for (Use &U : BundleMember->getInst()->operands()) {
if (auto *I = dyn_cast<Instruction>(U.get())) {
LLVM_DEBUG(dbgs()
<< "SLP: check for readiness (def): " << *I << "\n");
- DecrUnschedForInst(I);
+ DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
}
+ }
}
// Handle the memory dependencies.
- for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
+ auto *SD = dyn_cast<ScheduleData>(BundleMember);
+ if (!SD)
+ return;
+ SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
+ for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
+ if (!VisitedMemory.insert(MemoryDep).second)
+ continue;
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
@@ -4900,7 +5496,10 @@ class BoUpSLP {
DecrUnsched(MemoryDep);
}
// Handle the control dependencies.
- for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
+ SmallPtrSet<const ScheduleData *, 4> VisitedControl;
+ for (ScheduleData *Dep : SD->getControlDependencies()) {
+ if (!VisitedControl.insert(Dep).second)
+ continue;
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
LLVM_DEBUG(dbgs()
@@ -4911,12 +5510,14 @@ class BoUpSLP {
if (auto *SD = dyn_cast<ScheduleData>(Data)) {
SD->setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ProcessBundleMember(SD, nullptr);
+ ProcessBundleMember(SD, {});
} else {
ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
Bundle.setScheduled(/*Scheduled=*/true);
LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
- auto AreAllBundlesScheduled = [&](const ScheduleData *SD) {
+ auto AreAllBundlesScheduled = [&](const ScheduleEntity *SD) {
+ if (isa<ScheduleCopyableData>(SD))
+ return true;
ArrayRef<ScheduleBundle *> SDBundles =
getScheduleBundles(SD->getInst());
return !SDBundles.empty() &&
@@ -4924,10 +5525,12 @@ class BoUpSLP {
return SDBundle->isScheduled();
});
};
- for (ScheduleData *SD : Bundle.getBundle()) {
+ for (ScheduleEntity *SD : Bundle.getBundle()) {
if (AreAllBundlesScheduled(SD)) {
SD->setScheduled(/*Scheduled=*/true);
- ProcessBundleMember(SD, &Bundle);
+ ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD)
+ ? &Bundle
+ : getScheduleBundles(SD->getInst()));
}
}
}
@@ -4955,7 +5558,7 @@ class BoUpSLP {
auto *SD = getScheduleData(I);
if (!SD)
continue;
- assert(isInSchedulingRegion(SD) &&
+ assert(isInSchedulingRegion(*SD) &&
"primary schedule data not in window?");
SD->verify();
}
@@ -4996,7 +5599,11 @@ class BoUpSLP {
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
+ /// \param VL The list of scalar instructions.
+ /// \param S The state of the instructions.
+ /// \param EI The edge in the SLP graph or the user node/operand number.
+ ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S, const EdgeInfo &EI);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -5005,7 +5612,7 @@ class BoUpSLP {
/// std::nullopt if \p VL is allowed to be scheduled.
std::optional<ScheduleBundle *>
tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
+ const InstructionsState &S, const EdgeInfo &EI);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
@@ -5045,6 +5652,48 @@ class BoUpSLP {
/// ScheduleData structures are recycled.
SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
+ /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
+ /// number) and the operand instruction, represented as copyable element.
+ SmallDenseMap<std::pair<EdgeInfo, const Value *>,
+ std::unique_ptr<ScheduleCopyableData>>
+ ScheduleCopyableDataMap;
+
+ /// Represents mapping between instruction and all related
+ /// ScheduleCopyableData (for all uses in the tree, represented as copyable
+ /// element). The SLP tree may contain several representations of the same
+ /// instruction.
+ SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
+ ScheduleCopyableDataMapByInst;
+
+ /// Represents mapping between user value and operand number, the operand
+ /// value and all related ScheduleCopyableData. The relation is 1:n, because
+ /// the same user may reference the same operand in different tree entries
+ /// and the operand may be modelled by the different copyable data element.
+ SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
+ SmallVector<ScheduleCopyableData *>>
+ ScheduleCopyableDataMapByInstUser;
+
+ /// Represents mapping between instruction and all related
+ /// ScheduleCopyableData. It represents the mapping between the actual
+ /// instruction and the last copyable data element in the chain. E.g., if
+ /// the graph models the following instructions:
+ /// %0 = non-add instruction ...
+ /// ...
+ /// %4 = add %3, 1
+ /// %5 = add %4, 1
+ /// %6 = insertelement poison, %0, 0
+ /// %7 = insertelement %6, %5, 1
+ /// And the graph is modeled as:
+ /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
+ /// -> [1, 0] -> [%1, 0]
+ ///
+ /// this map will map %0 only to the copyable element <1>, which is the last
+ /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
+ /// keep the map to <0>, not the %0.
+ SmallDenseMap<const Instruction *,
+ SmallSetVector<ScheduleCopyableData *, 4>>
+ ScheduleCopyableDataMapByUsers;
+
/// Attaches ScheduleBundle to Instruction.
SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
ScheduledBundles;
@@ -5091,7 +5740,7 @@ class BoUpSLP {
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
- void scheduleBlock(BlockScheduling *BS);
+ void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
@@ -5164,6 +5813,30 @@ class BoUpSLP {
} // end namespace slpvectorizer
+template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
+ using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
+ using SecondInfo = DenseMapInfo<unsigned>;
+ static BoUpSLP::EdgeInfo getEmptyKey() {
+ return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
+ SecondInfo::getEmptyKey());
+ }
+
+ static BoUpSLP::EdgeInfo getTombstoneKey() {
+ return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
+ SecondInfo::getTombstoneKey());
+ }
+
+ static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
+ return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
+ SecondInfo::getHashValue(Val.EdgeIdx));
+ }
+
+ static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
+ const BoUpSLP::EdgeInfo &RHS) {
+ return LHS == RHS;
+ }
+};
+
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
@@ -7891,7 +8564,7 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
@@ -9599,7 +10272,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
}))) {
if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
S.getMainOp()->isSafeToRemove() &&
- all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
+ (S.areInstructionsWithCopyableElements() ||
+ all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
// Find the number of elements, which forms full vectors.
unsigned PWSz = getFullVectorNumberOfElements(
TTI, UniqueValues.front()->getType(), UniqueValues.size());
@@ -9616,9 +10290,10 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PaddedUniqueValues.append(
PWSz - UniqueValues.size(),
PoisonValue::get(UniqueValues.front()->getType()));
- // Check that extended with poisons operations are still valid for
- // vectorization (div/rem are not allowed).
- if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ // Check that extended with poisons/copyable operations are still valid
+ // for vectorization (div/rem are not allowed).
+ if (!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
@@ -9767,13 +10442,98 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
}
namespace {
-/// Class accepts incoming list of values and generates the list of values
-/// for scheduling and list of operands for the new nodes.
+/// Class accepts incoming list of values, checks if it is able to model
+/// "copyable" values as compatible operations, and generates the list of values
+/// for scheduling and list of operands for the new nodes.
class InstructionsCompatibilityAnalysis {
DominatorTree &DT;
const DataLayout &DL;
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
+ unsigned MainOpcode = 0;
+ Instruction *MainOp = nullptr;
+
+ /// Identifies the best candidate value, which represents main opcode
+ /// operation.
+ /// Currently the best candidate is the Add instruction with the parent
+ /// block with the highest DFS incoming number (the block that dominates the others).
+ void findMainInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *Parent = nullptr;
+ // Checks if the instruction has supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add;
+ };
+ // Exclude operand instructions immediately to improve compile time; they
+ // will be unable to be scheduled anyway.
+ SmallDenseSet<Value *, 8> Operands;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (!DT.isReachableFromEntry(I->getParent()))
+ continue;
+ if (!MainOp) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ if (Parent == I->getParent()) {
+ if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
+ MainOp = I;
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ auto *NodeA = DT.getNode(Parent);
+ auto *NodeB = DT.getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.clear();
+ Operands.insert(I->op_begin(), I->op_end());
+ }
+ }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
+ }
+ MainOpcode = MainOp->getOpcode();
+ }
+
+ /// Returns the idempotent value for the \p MainOp with the detected \p
+ /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
+ /// the operand itself, since V or V == V.
+ Value *selectBestIdempotentValue() const {
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return ConstantInt::getNullValue(MainOp->getType());
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
+
+ /// Returns the value and operands for the \p V, considering if it is original
+ /// instruction and its actual operands should be returned, or it is a
+ /// copyable element and it should be represented as an idempotent instruction.
+ SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
+ if (isa<PoisonValue>(V))
+ return {V, V};
+ if (!S.isCopyableElement(V))
+ return convertTo(cast<Instruction>(V), S).second;
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
/// Builds operands for the original instructions.
void
@@ -9934,22 +10694,145 @@ class InstructionsCompatibilityAnalysis {
const TargetLibraryInfo &TLI)
: DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
+ InstructionsState
+ buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
+ bool TryCopyableElementsVectorization,
+ bool WithProfitabilityCheck = false) {
+ InstructionsState S = getSameOpcode(VL, TLI);
+ if (S)
+ return S;
+ if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
+ return S;
+ findMainInstruction(VL);
+ if (!MainOp)
+ return InstructionsState::invalid();
+ S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
+ if (!WithProfitabilityCheck)
+ return S;
+ // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
+ if (VL.size() == 2) {
+ // Check if the operands allow better vectorization.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Operands[0][0], Operands[0][1]);
+ Candidates.emplace_back(Operands[1][0], Operands[1][1]);
+ if (isCommutative(MainOp)) {
+ Candidates.emplace_back(Operands[0][0], Operands[1][1]);
+ Candidates.emplace_back(Operands[1][0], Operands[0][1]);
+ }
+ // No good candidates - not profitable.
+ if (!R.findBestRootPair(Candidates,
+ BoUpSLP::LookAheadHeuristics::ScoreSplat)) {
+ // Deeper analysis for 2 splats/constants.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
+ Candidates1.emplace_back(Operands[0][0], Operands[0][1]);
+ Candidates2.emplace_back(Operands[1][0], Operands[1][1]);
+ bool Res =
+ R.findBestRootPair(Candidates1) && R.findBestRootPair(Candidates2);
+ if (!Res && isCommutative(MainOp)) {
+ Candidates1.clear();
+ Candidates2.clear();
+ Candidates1.emplace_back(Operands[0][0], Operands[1][1]);
+ Candidates2.emplace_back(Operands[1][0], Operands[0][1]);
+ Res = R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ }
+ if (!Res)
+ return InstructionsState::invalid();
+ }
+ return S;
+ }
+ assert(Operands.size() == 2 && "Unexpected number of operands!");
+ unsigned CopyableNum =
+ count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
+ if (CopyableNum < VL.size() / 2)
+ return S;
+ // Check profitability if number of copyables > VL.size() / 2.
+ // 1. Reorder operands for better matching.
+ if (isCommutative(MainOp)) {
+ for (auto &Ops : Operands) {
+ // Make instructions the first operands.
+ if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ // Make constants the second operands.
+ if (isa<Constant>(Ops.front())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ }
+ }
+ // 2. Check, if operands can be vectorized.
+ if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
+ return InstructionsState::invalid();
+ auto CheckOperand = [&](ArrayRef<Value *> Ops) {
+ if (allConstant(Ops) || isSplat(Ops))
+ return true;
+ // Check if it is "almost" splat, i.e. has >= 4 elements and only a single
+ // one is different.
+ constexpr unsigned Limit = 4;
+ if (Operands.front().size() >= Limit) {
+ SmallDenseMap<const Value *, unsigned> Counters;
+ for (Value *V : Ops) {
+ if (isa<UndefValue>(V))
+ continue;
+ ++Counters[V];
+ }
+ if (Counters.size() == 2 &&
+ any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
+ return C.second == 1;
+ }))
+ return true;
+ }
+ // First operand not a constant or splat? Last attempt - check for
+ // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ InstructionsState OpS = Analysis.buildInstructionsState(
+ Ops, R, /*TryCopyableElementsVectorization=*/true);
+ if (!OpS)
+ return false;
+ unsigned CopyableNum =
+ count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
+ return CopyableNum <= VL.size() / 2;
+ };
+ if (!CheckOperand(Operands.front()))
+ return InstructionsState::invalid();
+
+ return S;
+ }
+
SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
ArrayRef<Value *> VL) {
assert(S && "Invalid state!");
SmallVector<BoUpSLP::ValueList> Operands;
- buildOriginalOperands(S, VL, Operands);
+ if (S.areInstructionsWithCopyableElements()) {
+ MainOp = S.getMainOp();
+ MainOpcode = S.getOpcode();
+ Operands.assign(MainOp->getNumOperands(),
+ BoUpSLP::ValueList(VL.size(), nullptr));
+ for (auto [Idx, V] : enumerate(VL)) {
+ SmallVector<Value *> OperandsForValue = getOperands(S, V);
+ for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
+ Operands[OperandIdx][Idx] = Operand;
+ }
+ } else {
+ buildOriginalOperands(S, VL, Operands);
+ }
return Operands;
}
};
} // namespace
-BoUpSLP::ScalarsVectorizationLegality
-BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const {
+BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
+ ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ VL, *this, TryCopyableElementsVectorization,
+ /*WithProfitabilityCheck=*/true);
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -10248,9 +11131,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return true;
};
- ScalarsVectorizationLegality Legality =
- getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
- const InstructionsState &S = Legality.getInstructionsState();
+ ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
if (Legality.trySplitVectorize()) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
@@ -10258,11 +11141,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
return;
}
- if (Legality.tryToFindDuplicates())
- tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx);
+ if (!S)
+ Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ if (!Legality.isLegal()) {
+ if (Legality.tryToFindDuplicates())
+ tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
+ UserTreeIdx);
- newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
- return;
+ newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+ return;
+ }
+ S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
@@ -10299,7 +11189,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
SetVector<Value *> UniqueValues(llvm::from_range, VL);
std::optional<ScheduleBundle *> BundlePtr =
- BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S);
+ BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
// Make sure we didn't break any internal invariants
BS.verify();
@@ -13021,7 +13911,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
(E->getOpcode() == Instruction::GetElementPtr &&
- E->getMainOp()->getType()->isPointerTy())) &&
+ E->getMainOp()->getType()->isPointerTy()) ||
+ E->hasCopyableElements()) &&
"Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -13033,6 +13924,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallBitVector UsedScalars(Sz, false);
for (unsigned I = 0; I < Sz; ++I) {
if (isa<Instruction>(UniqueValues[I]) &&
+ !E->isCopyableElement(UniqueValues[I]) &&
getTreeEntries(UniqueValues[I]).front() == E)
continue;
UsedScalars.set(I);
@@ -14083,6 +14975,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
}))
return true;
+ // If the tree contains only buildvector, 2 non-buildvectors (with root user
+ // tree node) and other buildvectors, we can skip it.
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+ VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+ VectorizableTree.size() >= Limit &&
+ count_if(ArrayRef(VectorizableTree).drop_front(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return !TE->isGather() && TE->UserTreeIndex.UserTE &&
+ TE->UserTreeIndex.UserTE->Idx == 0;
+ }) == 2)
+ return true;
+
+ // If the tree contains only vectorization of the phi node from the
+ // buildvector - skip it.
+ if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
+ VectorizableTree.size() > 2 &&
+ VectorizableTree.front()->State == TreeEntry::Vectorize &&
+ VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+ VectorizableTree[1]->State == TreeEntry::Vectorize &&
+ VectorizableTree[1]->getOpcode() == Instruction::PHI &&
+ all_of(
+ ArrayRef(VectorizableTree).drop_front(2),
+ [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
+ return true;
+
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -16063,6 +16980,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (FirstInst->getParent() == I->getParent()) {
if (I->comesBefore(FirstInst))
FirstInst = I;
@@ -16127,7 +17046,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return nullptr;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
- if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+ if (!I || isa<PHINode>(I) ||
+ (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
continue;
ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
if (Bundles.empty())
@@ -18575,7 +19495,7 @@ Value *BoUpSLP::vectorizeTree(
EntryToLastInstruction.clear();
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules)
- scheduleBlock(BSIter.second.get());
+ scheduleBlock(*this, BSIter.second.get());
// Cache last instructions for the nodes to avoid side effects, which may
// appear during vectorization, like extra uses, etc.
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
@@ -19140,7 +20060,7 @@ Value *BoUpSLP::vectorizeTree(
if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
EE && IgnoredExtracts.contains(EE))
continue;
- if (isa<PoisonValue>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
@@ -19381,21 +20301,29 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleExtractSeq.clear();
}
-BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
+ ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
auto &BundlePtr =
ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (S.isNonSchedulable(V))
continue;
+ auto *I = cast<Instruction>(V);
+ if (S.isCopyableElement(V)) {
+ // Add a copyable element model.
+ ScheduleCopyableData &SD =
+ addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
+ // Group the instructions to a bundle.
+ BundlePtr->add(&SD);
+ continue;
+ }
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember && "no ScheduleData for bundle member "
"(maybe not in same basic block)");
// Group the instructions to a bundle.
BundlePtr->add(BundleMember);
- ScheduledBundles.try_emplace(cast<Instruction>(V))
- .first->getSecond()
- .push_back(BundlePtr.get());
+ ScheduledBundles.try_emplace(I).first->getSecond().push_back(
+ BundlePtr.get());
}
assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
return *BundlePtr;
@@ -19405,11 +20333,15 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
// and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
+ const InstructionsState &S,
+ const EdgeInfo &EI) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
+ bool HasCopyables = S.areInstructionsWithCopyableElements();
if (isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!HasCopyables && doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
return nullptr;
// Initialize the instruction bundle.
@@ -19417,6 +20349,33 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
+ // Clear deps or recalculate the region, if the memory instruction is a
+ // copyable. It may have memory deps, which must be recalculated.
+ auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
+ SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
+ for (ScheduleEntity *SE : Bundle.getBundle()) {
+ if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) {
+ if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
+ BundleMember && BundleMember->hasValidDependencies())
+ BundleMember->clearDirectDependencies();
+ continue;
+ }
+ auto *SD = cast<ScheduleData>(SE);
+ for (const Use &U : SD->getInst()->operands()) {
+ unsigned &NumOps =
+ UserOpToNumOps
+ .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
+ .first->getSecond();
+ ++NumOps;
+ if (auto *Op = dyn_cast<Instruction>(U.get());
+ Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
+ *SLP, NumOps)) {
+ if (ScheduleData *OpSD = getScheduleData(Op))
+ OpSD->clearDirectDependencies();
+ }
+ }
+ }
+ };
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
@@ -19426,10 +20385,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
if (ScheduleData *SD = getScheduleData(I))
SD->clearDependencies();
+ if (SmallVector<ScheduleCopyableData *> SDs =
+ getScheduleCopyableData(I);
+ !SDs.empty()) {
+ for_each(SDs,
+ [](ScheduleCopyableData *SD) { SD->clearDependencies(); });
+ }
}
ReSchedule = true;
}
+ // Check if the bundle data has deps for copyable elements already. In
+ // this case need to reset deps and recalculate it.
if (Bundle && !Bundle.getBundle().empty()) {
+ CheckIfNeedToClearDeps(Bundle);
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
@@ -19448,7 +20416,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
!ReadyInsts.empty()) {
ScheduleEntity *Picked = ReadyInsts.pop_back_val();
assert(Picked->isReady() && "must be ready to schedule");
- schedule(Picked, ReadyInsts);
+ schedule(*SLP, S, EI, Picked, ReadyInsts);
if (Picked == &Bundle)
break;
}
@@ -19457,7 +20425,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (S.isNonSchedulable(V))
continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
@@ -19474,11 +20442,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (S.isNonSchedulable(V))
continue;
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ getScheduleCopyableData(cast<Instruction>(V));
+ if (!CopyableData.empty()) {
+ for (ScheduleCopyableData *SD : CopyableData)
+ ReadyInsts.remove(SD);
+ }
ScheduleData *BundleMember = getScheduleData(V);
- assert(BundleMember &&
+ assert((BundleMember || S.isCopyableElement(V)) &&
"no ScheduleData for bundle member (maybe not in same basic block)");
+ if (!BundleMember)
+ continue;
// Make sure we don't leave the pieces of the bundle in the ready list when
// whole bundle might not be ready.
@@ -19489,21 +20465,26 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReadyInsts.remove(B);
}
- if (!BundleMember->isScheduled())
+ if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
continue;
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
+ // A bundle member has deps calculated before it became a copyable element -
+ // need to reschedule.
LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
}
- ScheduleBundle &Bundle = buildBundle(VL);
+ ScheduleBundle &Bundle = buildBundle(VL, S, EI);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle.isReady()) {
- for (ScheduleData *BD : Bundle.getBundle()) {
- if (BD->isReady()) {
+ for (ScheduleEntity *BD : Bundle.getBundle()) {
+ // Copyable data scheduling is just removed.
+ if (isa<ScheduleCopyableData>(BD))
+ continue;
+ if (!BD->isReady()) {
ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
if (Bundles.empty()) {
ReadyInsts.insert(BD);
@@ -19516,9 +20497,49 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
ScheduledBundlesList.pop_back();
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (S.isNonSchedulable(V))
continue;
- ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
+ auto *I = cast<Instruction>(V);
+ if (S.isCopyableElement(I)) {
+ // Remove the copyable data from the scheduling region and restore
+ // previous mappings.
+ auto KV = std::make_pair(EI, I);
+ assert(ScheduleCopyableDataMap.contains(KV) &&
+ "no ScheduleCopyableData for copyable element");
+ ScheduleCopyableData *SD =
+ ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
+ ScheduleCopyableDataMapByUsers[I].remove(SD);
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, I);
+ assert(It != Op.end() && "Lane not set");
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ ScheduleCopyableDataMapByInstUser
+ [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
+ .pop_back();
+ It = find(make_range(std::next(It), Op.end()), I);
+ } while (It != Op.end());
+ EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
+ if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
+ ScheduleCopyableDataMapByUsers[I].insert(UserCD);
+ }
+ if (ScheduleCopyableDataMapByUsers[I].empty())
+ ScheduleCopyableDataMapByUsers.erase(I);
+ ScheduleCopyableDataMap.erase(KV);
+ // Need to recalculate dependencies for the actual schedule data.
+ if (ScheduleData *OpSD = getScheduleData(I))
+ OpSD->clearDirectDependencies();
+ continue;
+ }
+ ScheduledBundles.find(I)->getSecond().pop_back();
}
return std::nullopt;
}
@@ -19538,10 +20559,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V, const InstructionsState &S) {
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
- !doesNotNeedToBeScheduled(I) &&
- "phi nodes/insertelements/extractelements/extractvalues don't need to "
- "be scheduled");
if (getScheduleData(I))
return true;
if (!ScheduleStart) {
@@ -19611,14 +20628,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
// No need to allocate data for non-schedulable instructions.
- if (doesNotNeedToBeScheduled(I))
+ if (isa<PHINode>(I))
continue;
ScheduleData *SD = ScheduleDataMap.lookup(I);
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
}
- assert(!isInSchedulingRegion(SD) &&
+ assert(!isInSchedulingRegion(*SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID, I);
@@ -19651,24 +20668,101 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
BoUpSLP *SLP) {
- SmallVector<ScheduleData *> WorkList;
- auto ProcessNode = [&](ScheduleData *BundleMember) {
- if (BundleMember->hasValidDependencies())
+ SmallVector<ScheduleEntity *> WorkList;
+ auto ProcessNode = [&](ScheduleEntity *SE) {
+ if (SE->hasValidDependencies())
return;
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
+ CD->initDependencies();
+ CD->resetUnscheduledDeps();
+ const EdgeInfo &EI = CD->getEdgeInfo();
+ if (EI.UserTE) {
+ ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
+ const auto *It = find(Op, CD->getInst());
+ assert(It != Op.end() && "Lane not set");
+ do {
+ int Lane = std::distance(Op.begin(), It);
+ assert(Lane >= 0 && "Lane not set");
+ if (isa<StoreInst>(EI.UserTE->Scalars[Lane]) &&
+ !EI.UserTE->ReorderIndices.empty())
+ Lane = EI.UserTE->ReorderIndices[Lane];
+ assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
+ "Couldn't find extract lane");
+ auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
+ if (EI.UserTE->isCopyableElement(In)) {
+ // We may not have related copyable scheduling data, if the
+ // instruction is non-schedulable.
+ if (ScheduleCopyableData *UseSD =
+ getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ WorkList.push_back(UseSD);
+ }
+ } else if (ScheduleData *UseSD = getScheduleData(In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ WorkList.push_back(UseSD);
+ }
+ It = find(make_range(std::next(It), Op.end()), CD->getInst());
+ } while (It != Op.end());
+ if (CD->isReady() && CD->getDependencies() == 0 &&
+ (EI.UserTE->hasState() &&
+ (EI.UserTE->getMainOp()->getParent() !=
+ CD->getInst()->getParent() ||
+ (isa<PHINode>(EI.UserTE->getMainOp()) &&
+ (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
+ any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
+ auto *IU = dyn_cast<Instruction>(U);
+ if (!IU)
+ return true;
+ return IU->getParent() == EI.UserTE->getMainOp()->getParent();
+ })))))) {
+ // If no uses in the block - mark as having pseudo-use, which cannot
+ // be scheduled.
+ // Prevents incorrect def-use tracking between external user and
+ // actual instruction.
+ CD->incDependencies();
+ CD->incrementUnscheduledDeps(1);
+ }
+ }
+ return;
+ }
+ auto *BundleMember = cast<ScheduleData>(SE);
LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
BundleMember->initDependencies();
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
+ SmallDenseMap<Value *, unsigned> UserToNumOps;
for (User *U : BundleMember->getInst()->users()) {
if (ScheduleData *UseSD = getScheduleData(U)) {
+ // The operand is a copyable element - skip.
+ unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
+ ++NumOps;
+ if (areAllOperandsReplacedByCopyableData(
+ cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
+ continue;
BundleMember->incDependencies();
if (!UseSD->isScheduled())
BundleMember->incrementUnscheduledDeps(1);
WorkList.push_back(UseSD);
}
}
+ for (ScheduleCopyableData *UseSD :
+ getScheduleCopyableDataUsers(BundleMember->getInst())) {
+ BundleMember->incDependencies();
+ if (!UseSD->isScheduled())
+ BundleMember->incrementUnscheduledDeps(1);
+ WorkList.push_back(UseSD);
+ }
+ SmallPtrSet<const Instruction *, 4> Visited;
auto MakeControlDependent = [&](Instruction *I) {
+ // Do not mark control dependent twice.
+ if (!Visited.insert(I).second)
+ return;
auto *DepDest = getScheduleData(I);
assert(DepDest && "must be in schedule window");
DepDest->addControlDependency(BundleMember);
@@ -19754,7 +20848,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
for (ScheduleData *DepDest = NextLoadStore; DepDest;
DepDest = DepDest->getNextLoadStore()) {
- assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
+ assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -19802,8 +20896,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
WorkList.push_back(Bundle.getBundle().front());
SmallPtrSet<ScheduleBundle *, 16> Visited;
while (!WorkList.empty()) {
- ScheduleData *SD = WorkList.pop_back_val();
- ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(SD->getInst());
+ ScheduleEntity *SD = WorkList.pop_back_val();
+ SmallVector<ScheduleBundle *, 1> CopyableBundle;
+ ArrayRef<ScheduleBundle *> Bundles;
+ if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
+ CopyableBundle.push_back(&CD->getBundle());
+ Bundles = CopyableBundle;
+ } else {
+ Bundles = getScheduleBundles(SD->getInst());
+ }
if (Bundles.empty()) {
ProcessNode(SD);
if (InsertInReadyList && SD->isReady()) {
@@ -19838,21 +20939,37 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
if (ScheduleData *SD = getScheduleData(I)) {
- assert(isInSchedulingRegion(SD) &&
+ assert(isInSchedulingRegion(*SD) &&
"ScheduleData not in scheduling region");
SD->setScheduled(/*Scheduled=*/false);
SD->resetUnscheduledDeps();
}
+ if (SmallVector<ScheduleCopyableData *> SDs = getScheduleCopyableData(I);
+ !SDs.empty()) {
+ for_each(SDs, [&](ScheduleCopyableData *SD) {
+ assert(isInSchedulingRegion(*SD) &&
+ "ScheduleData not in scheduling region");
+ SD->setScheduled(/*Scheduled=*/false);
+ SD->resetUnscheduledDeps();
+ });
+ }
for (ScheduleBundle *Bundle : getScheduleBundles(I)) {
assert(isInSchedulingRegion(*Bundle) &&
"ScheduleBundle not in scheduling region");
Bundle->setScheduled(/*Scheduled=*/false);
}
}
+ // Reset schedule data for copyable elements.
+ for (auto &P : ScheduleCopyableDataMap) {
+ if (isInSchedulingRegion(*P.second.get())) {
+ P.second->setScheduled(/*Scheduled=*/false);
+ P.second->resetUnscheduledDeps();
+ }
+ }
ReadyInsts.clear();
}
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
@@ -19890,15 +21007,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!Bundle->hasValidDependencies())
BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this);
}
+ SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
+ for (ScheduleCopyableData *SD : reverse(SDs)) {
+ ScheduleBundle &Bundle = SD->getBundle();
+ Bundle.setSchedulingPriority(Idx++);
+ if (!Bundle.hasValidDependencies())
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ }
continue;
}
+ SmallVector<ScheduleCopyableData *> CopyableData =
+ BS->getScheduleCopyableDataUsers(I);
if (ScheduleData *SD = BS->getScheduleData(I)) {
[[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
- SDTEs.front()->doesNotNeedToSchedule()) &&
+ SDTEs.front()->doesNotNeedToSchedule() ||
+ doesNotNeedToBeScheduled(I)) &&
"scheduler and vectorizer bundle mismatch");
SD->setSchedulingPriority(Idx++);
- continue;
+ if (!SD->hasValidDependencies() &&
+ (!CopyableData.empty() ||
+ any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
+ assert(TE->isGather() && "expected gather node");
+ return TE->hasState() && TE->hasCopyableElements() &&
+ TE->isCopyableElement(I);
+ }))) {
+ // Need to calculate deps for these nodes to correctly handle copyable
+ // dependencies, even if they were cancelled.
+ // If copyables bundle was cancelled, the deps are cleared and need to
+ // recalculate them.
+ ScheduleBundle Bundle;
+ Bundle.add(SD);
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
+ }
+ }
+ for (ScheduleCopyableData *SD : reverse(CopyableData)) {
+ ScheduleBundle &Bundle = SD->getBundle();
+ Bundle.setSchedulingPriority(Idx++);
+ if (!Bundle.hasValidDependencies())
+ BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
}
}
BS->initialFillReadyList(ReadyInsts);
@@ -19914,9 +21061,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
- for (const ScheduleData *BundleMember : Bundle->getBundle()) {
+ for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
Instruction *PickedInst = BundleMember->getInst();
- if (!Scheduled.insert(PickedInst).second)
+ // If copyable must be scheduled as part of something else, skip it.
+ bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
+ if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
+ (!IsCopyable && !Scheduled.insert(PickedInst).second))
continue;
if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
@@ -19931,7 +21081,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
}
- BS->schedule(Picked, ReadyInsts);
+ auto Invalid = InstructionsState::invalid();
+ BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts);
}
// Check that we didn't break any of our invariants.
@@ -20145,7 +21296,7 @@ bool BoUpSLP::collectValuesToDemote(
};
if (E.isGather() || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
- return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
+ return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !isVectorized(U);
});
}))
@@ -20611,7 +21762,12 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!IsKnownPositive)
++BitWidth1;
- APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ auto *I = dyn_cast<Instruction>(Root);
+ if (!I) {
+ MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
+ continue;
+ }
+ APInt Mask = DB->getDemandedBits(I);
unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
MaxBitWidth =
std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
@@ -20940,7 +22096,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
for (Value *V : Chain)
ValOps.insert(cast<StoreInst>(V)->getValueOperand());
// Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
- InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
bool IsAllowedSize =
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index 7ed5f33c9dc6c..c791a07993440 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -4,11 +4,7 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, i32 0, i32 0
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]]
; CHECK-NEXT: store ptr addrspace(1) null, ptr addrspace(1) [[GETELEMENTPTR]], align 8
@@ -16,8 +12,6 @@ define void @test() {
; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4)
; CHECK-NEXT: ret void
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
index fa46bd3d83249..d46098e754136 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
@@ -19,13 +19,13 @@ define void @test(ptr %0, i32 %add651) {
; CHECK-NEXT: [[ARRAYIDX660:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7800
; CHECK-NEXT: [[ARRAYIDX689:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7816
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 1, i32 poison>, i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP13]], <2 x i32> [[TMP10]], i64 2)
-; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP14]], splat (i32 1)
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 1, i32 poison>, i32 [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP19]], splat (i32 1)
; CHECK-NEXT: [[SHR685:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[CONV686:%.*]] = trunc i32 [[SHR685]] to i16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
index 992909fb3e87f..5e3d4715e99c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
@@ -7,16 +7,10 @@ define i32 @test() {
; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]]
; CHECK: [[FUNC_135_EXIT_I]]:
; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 23, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 26, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
index 1c482e079bb0f..03d76ef571d64 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
@@ -4,11 +4,10 @@
define i64 @test() {
; CHECK-LABEL: define i64 @test() {
; CHECK-NEXT: [[BB:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: br label %[[BB5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
index 382d6ae0e0a6f..6bb52e0fc43b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
@@ -7,19 +7,17 @@ define void @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[IF_THEN_I_I:.*]]:
-; CHECK-NEXT: br label %[[BB5:.*]]
+; CHECK-NEXT: br label %[[BB3:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <2 x i64> [[TMP2]], i64 2)
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <2 x i64> [[TMP2]], i64 2)
-; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]]
-; CHECK: [[BB5]]:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
; CHECK-NEXT: br label %[[BB2]]
; CHECK: [[BB2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ]
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index 9fbe0a54b0688..64344342ffe3a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -59,8 +59,6 @@ define void @pr35497() local_unnamed_addr #0 {
; SSE-LABEL: @pr35497(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1
-; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef
-; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1
; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 undef>, i64 [[TMP0]], i32 0
; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2)
@@ -68,32 +66,34 @@ define void @pr35497() local_unnamed_addr #0 {
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1
-; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef
+; SSE-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], <i64 undef, i64 0>
+; SSE-NEXT: store i64 [[ADD]], ptr undef, align 1
; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], splat (i64 2)
; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], splat (i64 20)
+; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], splat (i64 6)
-; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
+; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP12]], [[TMP10]]
; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @pr35497(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr undef, align 1
-; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef
-; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1
; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr undef, i64 0, i64 4
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2)
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], splat (i64 20)
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
+; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef
+; AVX-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], <i64 undef, i64 0>
+; AVX-NEXT: store i64 [[ADD]], ptr undef, align 1
; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], splat (i64 2)
; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 20)
+; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], splat (i64 6)
-; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
+; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP11]], [[TMP9]]
; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1
; AVX-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index a4949bc67b0f1..782aada17acac 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu"
define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[VECINIT51]]
;
%vecinit = insertelement <4 x i32> undef, i32 %f, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
index a17ccb4b46ef9..a56c6b76ba39f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
@@ -5,9 +5,11 @@ define i1 @test() {
; CHECK-LABEL: define i1 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> zeroinitializer, [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, i32 [[H_PROMOTED118_I_FR]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> <i32 2, i32 2, i32 7, i32 2>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 0, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], <i32 1, i32 0, i32 0, i32 0>
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index ad4daeab003f5..125c2dce32663 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead)
define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) {
; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate(
; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5
-; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0
-; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], <i8 0, i8 5>
; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]]
; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll
index afe92f89ac0d1..11c4dc9f16880 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec.ll
@@ -332,12 +332,12 @@ define void @test11(<2 x i64> %0, i64 %1, <2 x i64> %2) {
; CHECK-LABEL: @test11(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> <i64 5, i64 0>, [[TMP2:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[TMP4]] to <2 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i16>
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP6]], <2 x i16> [[TMP7]], i64 2)
-; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i16> [[TMP8]] to <4 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> <i64 5, i64 0>, i64 0)
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> zeroinitializer, i64 2)
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP2:%.*]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP6]], <2 x i64> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i8>
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> poison, <2 x i8> zeroinitializer, i64 0)
; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP10]], <2 x i8> zeroinitializer, i64 2)
; CHECK-NEXT: [[TMP12:%.*]] = urem <4 x i8> [[TMP9]], [[TMP11]]
>From 8fbfef4a5e186d056aa30dd2de550e20184dcb56 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 7 Jul 2025 18:38:36 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 50 ++++++++++---------
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c5a6de40fa72b..6a1ef71000b25 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -996,23 +996,23 @@ class BinOpSameOpcodeHelper {
bool hasCandidateOpcode(unsigned Opcode) const {
MaskType Candidate = Mask & SeenBefore;
switch (Opcode) {
- case Instruction::Shl:
- return Candidate & ShlBIT;
- case Instruction::AShr:
- return Candidate & AShrBIT;
- case Instruction::Mul:
- return Candidate & MulBIT;
- case Instruction::Add:
- return Candidate & AddBIT;
- case Instruction::Sub:
- return Candidate & SubBIT;
- case Instruction::And:
- return Candidate & AndBIT;
- case Instruction::Or:
- return Candidate & OrBIT;
- case Instruction::Xor:
- return Candidate & XorBIT;
- default:
+ case Instruction::Shl:
+ return Candidate & ShlBIT;
+ case Instruction::AShr:
+ return Candidate & AShrBIT;
+ case Instruction::Mul:
+ return Candidate & MulBIT;
+ case Instruction::Add:
+ return Candidate & AddBIT;
+ case Instruction::Sub:
+ return Candidate & SubBIT;
+ case Instruction::And:
+ return Candidate & AndBIT;
+ case Instruction::Or:
+ return Candidate & OrBIT;
+ case Instruction::Xor:
+ return Candidate & XorBIT;
+ default:
break;
}
llvm_unreachable("Cannot find interchangeable instruction.");
@@ -5052,9 +5052,7 @@ class BoUpSLP {
const ScheduleBundle &getBundle() const { return Bundle; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(raw_ostream &OS) const {
- OS << "[Copyable]" << *getInst();
- }
+ void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
LLVM_DUMP_METHOD void dump() const {
dump(dbgs());
@@ -5306,13 +5304,16 @@ class BoUpSLP {
.try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
.first->getSecond()
.push_back(CD);
- ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(CD);
+ ScheduleCopyableDataMapByUsers.try_emplace(I)
+ .first->getSecond()
+ .insert(CD);
// Remove extra deps for users, becoming non-immediate users of the
// instruction. It may happen, if the chain of same copyable elements
// appears in the tree.
if (In == I) {
EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
- if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, In))
+ if (ScheduleCopyableData *UserCD =
+ getScheduleCopyableData(UserEI, In))
ScheduleCopyableDataMapByUsers[I].remove(UserCD);
}
It = find(make_range(std::next(It), Op.end()), I);
@@ -5417,7 +5418,8 @@ class BoUpSLP {
++OperandsUses[I];
}
}
- // Decrement the unscheduled counter and insert to ready list if ready.
+ // Decrement the unscheduled counter and insert to ready list if
+ // ready.
auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE,
unsigned OpIdx) {
const EdgeInfo EI = {UserTE, OpIdx};
@@ -10709,7 +10711,7 @@ class InstructionsCompatibilityAnalysis {
S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
if (!WithProfitabilityCheck)
return S;
- // Check if it is profitable to vectorize the instruction.
+ // Check if it is profitable to vectorize the instruction.
SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
if (VL.size() == 2) {
// Check if the operands allow better vectorization.
More information about the llvm-commits
mailing list