[llvm] [SLP]Initial support for copyable elements (non-schedulable only) (PR #140279)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 16 09:39:22 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-vectorizers
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
Adds initial support for copyable elements. This patch only models adds
and model copyable elements as add <element>, 0.
Only support for elements, which do not require scheduling is added to
reduce size of the patch.
---
Patch is 37.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140279.diff
6 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+351-44)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll (+1-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll (+4-10)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll (+7-9)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll (+2-7)
- (modified) llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll (+3-3)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c63f80675fef4..97d6068571918 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -206,6 +206,12 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+/// Enables vectorization of copyable elements.
+static cl::opt<bool> VectorizeCopyableElements(
+ "slp-copyable-elements", cl::init(true), cl::Hidden,
+ cl::desc("Try to replace values with the idempotent instructions for "
+ "better vectorization."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -835,6 +841,13 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
return *EI->idx_begin();
}
+namespace llvm {
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V);
+} // namespace llvm
+
namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
@@ -1170,9 +1183,11 @@ class InstructionsState {
if (!I->isBinaryOp())
return nullptr;
BinOpSameOpcodeHelper Converter(MainOp);
- if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
- return MainOp;
- return AltOp;
+ if (!Converter.add(I) || !Converter.add(MainOp))
+ return nullptr;
+ if (Converter.hasAltOp() && !isAltShuffle())
+ return nullptr;
+ return Converter.hasAltOp() ? AltOp : MainOp;
}
/// Checks if main/alt instructions are shift operations.
@@ -1220,6 +1235,48 @@ class InstructionsState {
InstructionsState(Instruction *MainOp, Instruction *AltOp)
: MainOp(MainOp), AltOp(AltOp) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+
+ bool isCopyableElement(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I && isa<PoisonValue>(V))
+ return false;
+ // FIXME: remove doesNotNeedToBeScheduled() and isa<PHINode>() check once
+ // scheduling is supported.
+ return !I ||
+ (I->getParent() != MainOp->getParent() &&
+ (!isVectorLikeInstWithConstOps(I) ||
+ !isVectorLikeInstWithConstOps(MainOp))) ||
+ (I->getOpcode() != MainOp->getOpcode() &&
+ (isa<PHINode>(I) || doesNotNeedToBeScheduled(I)) &&
+ (!I->isBinaryOp() || getMatchingMainOpOrAltOp(I) != MainOp));
+ }
+
+ bool areInstructionsWithCopyableElements(ArrayRef<Value *> VL) const {
+ assert(valid() && "InstructionsState is invalid.");
+ bool HasAtLeastOneCopyableElement = false;
+ auto IsCopyableElement = [&](Value *V) {
+ bool IsCopyable = isCopyableElement(V);
+ HasAtLeastOneCopyableElement |= IsCopyable;
+ return IsCopyable;
+ };
+ return !isAltShuffle() && all_of(VL, [&](Value *V) {
+ if (V == MainOp || isa<PoisonValue>(V))
+ return true;
+ if (IsCopyableElement(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ if (getOpcode() == Instruction::GetElementPtr && !I)
+ return true;
+ return I->getType() == MainOp->getType() &&
+ (I->getParent() == MainOp->getParent() ||
+ (isVectorLikeInstWithConstOps(I) &&
+ isVectorLikeInstWithConstOps(MainOp))) &&
+ getMatchingMainOpOrAltOp(cast<Instruction>(V)) == MainOp;
+ }) && HasAtLeastOneCopyableElement;
+ }
};
std::pair<Instruction *, SmallVector<Value *>>
@@ -2878,9 +2935,6 @@ class BoUpSLP {
for (OperandDataVec &Ops : OpsVec)
Ops.resize(NumLanes);
for (unsigned Lane : seq<unsigned>(NumLanes)) {
- Value *V = VL[Lane];
- assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
- "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2891,13 +2945,20 @@ class BoUpSLP {
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely tell
// the inverse operations by checking commutativity.
- if (isa<PoisonValue>(V)) {
+ auto *I = dyn_cast<Instruction>(VL[Lane]);
+ if (!I && isa<PoisonValue>(VL[Lane])) {
for (unsigned OpIdx : seq<unsigned>(NumOperands))
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
continue;
}
- auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
- bool IsInverseOperation = !isCommutative(SelectedOp);
+ bool IsInverseOperation = false;
+ if (S.isCopyableElement(VL[Lane])) {
+ // The value is a copyable element.
+ IsInverseOperation = !isCommutative(MainOp);
+ } else {
+ auto [SelectedOp, Ops] = convertTo(I, S);
+ IsInverseOperation = !isCommutative(SelectedOp);
+ }
for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
@@ -3905,6 +3966,14 @@ class BoUpSLP {
bool hasState() const { return S.valid(); }
+ /// Returns true if \p V is a copyable element.
+ bool isCopyableElement(Value *V) const { return S.isCopyableElement(V); }
+
+ /// Returns true if any scalar in the list is a copyable element.
+ bool hasCopyableElements() const {
+ return S.areInstructionsWithCopyableElements(Scalars);
+ }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
int findLaneForValue(Value *V) const {
@@ -4153,7 +4222,7 @@ class BoUpSLP {
} else if (!Last->isGather()) {
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (isa<PoisonValue>(V))
+ if (isa<PoisonValue>(V) || S.isCopyableElement(V))
continue;
auto It = ScalarToTreeEntries.find(V);
if (It == ScalarToTreeEntries.end()) {
@@ -4168,14 +4237,20 @@ class BoUpSLP {
// Update the scheduler bundle to point to this TreeEntry.
assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
isVectorLikeInstWithConstOps(S.getMainOp()) ||
- doesNotNeedToSchedule(VL)) &&
+ doesNotNeedToSchedule(VL) ||
+ all_of(VL,
+ [&](Value *V) {
+ return S.isCopyableElement(V) ||
+ doesNotNeedToBeScheduled(V);
+ })) &&
"Bundle and VL out of sync");
if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
auto *BundleMember = Bundle.getBundle().begin();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V) ||
+ !Processed.insert(V).second)
continue;
++BundleMember;
}
@@ -4284,7 +4359,8 @@ class BoUpSLP {
/// in general.
ScalarsVectorizationLegality
getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const;
+ const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const;
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
@@ -4996,7 +5072,8 @@ class BoUpSLP {
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
+ ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -7893,7 +7970,7 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
@@ -9617,7 +9694,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons operations are still valid for
// vectorization (div/rem are not allowed).
- if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if (!S.areInstructionsWithCopyableElements(PaddedUniqueValues) &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
@@ -9766,13 +9844,112 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
}
namespace {
-/// Class accepts incoming list of values and generates the list of values
-/// for scheduling and list of operands for the new nodes.
+/// Class accepts incoming list of values, checks if it is able to model
+/// "copyable" values as compatible operations, and generates the list of values
+/// for scheduling and list of operands doe the new nodes.
class InstructionsCompatibilityAnalysis {
DominatorTree &DT;
const DataLayout &DL;
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
+ unsigned MainOpcode = 0;
+ Instruction *MainOp = nullptr;
+
+ /// Identifies the best candidate value, which represents main opcode
+ /// operation.
+ /// Currently the best candidate is the Add instruction with the parent
+ /// block with the highest DFS incoming number (block, that dominates other).
+ void findAndSetMainInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *Parent = nullptr;
+ // Checks if the instruction has supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add;
+ };
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (!DT.isReachableFromEntry(I->getParent()))
+ continue;
+ if (!MainOp) {
+ MainOp = I;
+ Parent = I->getParent();
+ continue;
+ }
+ if (Parent == I->getParent()) {
+ if (!IsSupportedOpcode(MainOp))
+ MainOp = I;
+ if (MainOp->getOpcode() == I->getOpcode() &&
+ doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I))
+ MainOp = I;
+ continue;
+ }
+ auto *NodeA = DT.getNode(Parent);
+ auto *NodeB = DT.getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
+ MainOp = I;
+ Parent = I->getParent();
+ }
+ }
+ // FIXME: remove second part of the check, once the scheduling support
+ // for copyable instructions is landed.
+ if (!IsSupportedOpcode(MainOp) || any_of(VL, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && I->getOpcode() != MainOp->getOpcode() &&
+ I->getParent() == MainOp->getParent() && !isa<PHINode>(I) &&
+ !doesNotNeedToBeScheduled(I);
+ })) {
+ MainOp = nullptr;
+ return;
+ }
+ MainOpcode = MainOp->getOpcode();
+ }
+
+ /// Returns the idempotent value for the \p MainOp with the detected \p
+ /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
+ /// the operand itself, since V or V == V.
+ Value *selectBestIdempotentValue() const {
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return ConstantInt::getNullValue(MainOp->getType());
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
+
+ unsigned getNumberOfOperands() const {
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return 2;
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
+
+ /// Returns the value and operands for the \p V, considering if it is original
+ /// instruction and its actual operands should be returned, or it is a
+ /// copyable element and its should be represented as idempotent instruction.
+ SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
+ bool MatchesMainOp = !S.isCopyableElement(V);
+ switch (MainOpcode) {
+ case Instruction::Add:
+ if (isa<PoisonValue>(V))
+ return {V, V};
+ if (MatchesMainOp)
+ return SmallVector<Value *>(cast<Instruction>(V)->operands());
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
/// Builds operands for the original instructions.
void
@@ -9933,22 +10110,122 @@ class InstructionsCompatibilityAnalysis {
const TargetLibraryInfo &TLI)
: DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
+ InstructionsState
+ buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
+ bool TryCopyableElementsVectorization,
+ bool WithProfitabilityCheck = false) {
+ InstructionsState S = getSameOpcode(VL, TLI);
+ if (S)
+ return S;
+ if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
+ return S;
+ findAndSetMainInstruction(VL);
+ if (!MainOp)
+ return InstructionsState::invalid();
+ S = InstructionsState(MainOp, MainOp);
+ if (!WithProfitabilityCheck)
+ return S;
+ // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
+ if (VL.size() == 2) {
+ // Check if the operands allow better vectorization.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Operands[0][0], Operands[0][1]);
+ Candidates.emplace_back(Operands[1][0], Operands[1][1]);
+ if (isCommutative(MainOp)) {
+ Candidates.emplace_back(Operands[0][0], Operands[1][1]);
+ Candidates.emplace_back(Operands[1][0], Operands[0][1]);
+ }
+ // No good candidates - not profitable.
+ if (!R.findBestRootPair(Candidates,
+ BoUpSLP::LookAheadHeuristics::ScoreSplat)) {
+ // Deeper analysis for 2 splats/constants.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
+ Candidates1.emplace_back(Operands[0][0], Operands[0][1]);
+ Candidates2.emplace_back(Operands[1][0], Operands[1][1]);
+ bool Res = R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ if (!Res && isCommutative(MainOp)) {
+ Candidates1.clear();
+ Candidates2.clear();
+ Candidates1.emplace_back(Operands[0][0], Operands[1][1]);
+ Candidates2.emplace_back(Operands[1][0], Operands[0][1]);
+ Res = R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ }
+ if (!Res)
+ return InstructionsState::invalid();
+ }
+ }
+ assert(Operands.size() == 2 && "Unexpected number of operands!");
+ unsigned CopyableNum =
+ count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
+ if (CopyableNum <= VL.size() / 2)
+ return S;
+ // Check profitability if number of copyables > VL.size() / 2.
+ // 1. Reorder operands for better matching.
+ if (isCommutative(MainOp)) {
+ for (auto &Ops : Operands) {
+ // Make instructions the first operands.
+ if (isa<Instruction>(Ops.back())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ // Make constants the second operands.
+ if (isa<Constant>(Ops.front())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ }
+ }
+ // 2. Check, if operands can be vectorized.
+ if (!allConstant(Operands.back()))
+ return InstructionsState::invalid();
+ bool Res = allConstant(Operands.front()) || isSplat(Operands.front());
+ if (!Res) {
+ // First operand not a constant or splat? Last attempt - check for
+ // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ if (!Analysis.buildInstructionsState(
+ Operands.front(), R,
+ /*TryCopyableElementsVectorization=*/true))
+ return InstructionsState::invalid();
+ }
+
+ return S;
+ }
+
SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
ArrayRef<Value *> VL) {
assert(S && "Invalid state!");
SmallVector<BoUpSLP::ValueList> Operands;
- buildOriginalOperands(S, VL, Operands);
+ if (S.areInstructionsWithCopyableElements(VL)) {
+ MainOp = S.getMainOp();
+ MainOpcode = S.getOpcode();
+ Operands.assign(getNumberOfOperands(),
+ BoUpSLP::ValueList(VL.size(), nullptr));
+ for (auto [Idx, V] : enumerate(VL)) {
+ SmallVector<Value *> OperandsForValue = getOperands(S, V);
+ for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
+ Operands[OperandIdx][Idx] = Operand;
+ }
+ } else {
+ buildOriginalOperands(S, VL, Operands);
+ }
return Operands;
}
};
} // namespace
-BoUpSLP::ScalarsVectorizationLegality
-BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const {
+BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
+ ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ VL, *this, TryCopyableElementsVectorization,
+ /*WithProfitabilityCheck=*/true);
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -10247,9 +10524,9 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return true;
};
- ScalarsVectorizationLegality Legality =
- getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
- const InstructionsState &S = Legality.getInstructionsState();
+ ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
if (Legality.trySplitVectorize()) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
@@ -10257,11 +10534,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
return;
}
- if (Legality.tryToFindD...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/140279
More information about the llvm-commits
mailing list