[llvm] cd341f3 - [SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 18 05:55:08 PDT 2023
Author: Alexey Bataev
Date: 2023-04-18T05:51:23-07:00
New Revision: cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc
URL: https://github.com/llvm/llvm-project/commit/cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc
DIFF: https://github.com/llvm/llvm-project/commit/cd341f3f4878137d1c9e7a05c4c3a7bd8ff216dc.diff
LOG: [SLP]Add final resize to ShuffleCostEstimator::finalize member function and basic add member functions.
Implemented the reshuffling in finalize member function + add basic
support for add member functions, used during vector build.
Part of D110978
Differential Revision: https://reviews.llvm.org/D148279
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1fabec2a8d7c..163e3581ea7be 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -946,9 +946,14 @@ static bool isSimple(Instruction *I) {
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
-static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
+/// one but two input vectors.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
+ bool ExtendingManyInputs = false) {
if (SubMask.empty())
return;
+ assert((!ExtendingManyInputs || SubMask.size() > Mask.size()) &&
+ "SubMask with many inputs support must be larger than the mask.");
if (Mask.empty()) {
Mask.append(SubMask.begin(), SubMask.end());
return;
@@ -956,8 +961,9 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);
int TermValue = std::min(Mask.size(), SubMask.size());
for (int I = 0, E = SubMask.size(); I < E; ++I) {
- if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
- Mask[SubMask[I]] >= TermValue)
+ if ((!ExtendingManyInputs &&
+ (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)) ||
+ SubMask[I] == UndefMaskElem)
continue;
NewMask[I] = Mask[SubMask[I]];
}
@@ -6788,6 +6794,8 @@ class BaseShuffleAnalysis {
/// analysis/transformations.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
bool IsFinalized = false;
+ SmallVector<int> CommonMask;
+ SmallVector<Value *, 2> InVectors;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
ArrayRef<Value *> VectorizedVals;
@@ -7009,19 +7017,53 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VecTy, std::nullopt, CostKind, 0, EEVTy);
}
}
+ InVectors.assign(1, VecBase);
return VecBase;
}
+ void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(
+ 2, Constant::getNullValue(FixedVectorType::get(
+ E1->Scalars.front()->getType(),
+ std::max(E1->getVectorFactor(), E2->getVectorFactor()))));
+ }
+ void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+ CommonMask.assign(Mask.begin(), Mask.end());
+ InVectors.assign(
+ 1, Constant::getNullValue(FixedVectorType::get(
+ E1->Scalars.front()->getType(), E1->getVectorFactor())));
+ }
void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
+ if (!Root) {
+ assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
+ InVectors.assign(1, Constant::getNullValue(FixedVectorType::get(
+ VL.front()->getType(), VL.size())));
+ }
}
/// Finalize emission of the shuffles.
- InstructionCost finalize() {
+ InstructionCost finalize(ArrayRef<int> ExtMask) {
IsFinalized = true;
- return Cost;
+ ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
+ if (CommonMask.empty())
+ return Cost;
+ int Limit = CommonMask.size() * 2;
+ if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(CommonMask))
+ return Cost;
+ return Cost +
+ TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc
+ : TTI::SK_PermuteSingleSrc,
+ FixedVectorType::get(
+ cast<VectorType>(InVectors.front()->getType())
+ ->getElementType(),
+ CommonMask.size()),
+ CommonMask);
}
~ShuffleCostEstimator() {
- assert(IsFinalized && "Shuffle construction must be finalized.");
+ assert((IsFinalized || CommonMask.empty()) &&
+ "Shuffle construction must be finalized.");
}
};
@@ -7109,35 +7151,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (Mask[I] != UndefMaskElem)
GatheredScalars[I] = PoisonValue::get(ScalarTy);
}
- InstructionCost GatherCost = 0;
- int Limit = Mask.size() * 2;
- if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
- ShuffleVectorInst::isIdentityMask(Mask)) {
- // Perfect match in the graph, will reuse the previously vectorized
- // node. Cost is 0.
- LLVM_DEBUG(
- dbgs()
- << "SLP: perfect diamond match for gather bundle that starts with "
- << *VL.front() << ".\n");
- if (NeedToShuffleReuses)
- GatherCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- } else {
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle that starts with "
- << *VL.front() << ".\n");
- // Detected that instead of gather we can emit a shuffle of single/two
- // previously vectorized nodes. Add the cost of the permutation rather
- // than gather.
- ::addMask(Mask, E->ReuseShuffleIndices);
- GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
- }
+ LLVM_DEBUG(
+ int Limit = Mask.size() * 2;
+ if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+ all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+ ShuffleVectorInst::isIdentityMask(Mask)) {
+ // Perfect match in the graph, will reuse the previously
+ // vectorized node. Cost is 0.
+ dbgs() << "SLP: perfect diamond match for gather bundle "
+ "that starts with "
+ << *VL.front() << ".\n";
+ } else {
+ dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with " << *VL.front()
+ << ".\n";
+ });
+ if (Entries.size() == 1)
+ Estimator.add(Entries.front(), Mask);
+ else
+ Estimator.add(Entries.front(), Entries.back(), Mask);
Estimator.gather(
GatheredScalars,
Constant::getNullValue(FixedVectorType::get(
GatheredScalars.front()->getType(), GatheredScalars.size())));
- return GatherCost + Estimator.finalize();
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
// Check that gather of extractelements can be represented as just a
@@ -7147,17 +7184,15 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
- if (NeedToShuffleReuses)
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- FinalVecTy, E->ReuseShuffleIndices);
- return Cost + Estimator.finalize();
- }
- InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses)
- ReuseShuffleCost = TTI->getShuffleCost(
- TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
- Estimator.gather(GatheredScalars);
- return ReuseShuffleCost + Estimator.finalize();
+ return Cost + Estimator.finalize(E->ReuseShuffleIndices);
+ }
+ Estimator.gather(
+ GatheredScalars,
+ (ExtractShuffle || GatherShuffle)
+ ? Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(), GatheredScalars.size()))
+ : nullptr);
+ return Estimator.finalize(E->ReuseShuffleIndices);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index c5b6ac647aa32..f26d14c44db87 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,20 +93,36 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
}
define i1 @logical_and_icmp_
diff _preds(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_
diff _preds(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
-; CHECK-NEXT: ret i1 [[S3]]
+; SSE-LABEL: @logical_and_icmp_
diff _preds(
+; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
+; SSE-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[TMP2]], [[TMP3]]
+; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP5]], i1 false
+; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP6]], i1 false
+; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; SSE-NEXT: ret i1 [[S3]]
+;
+; AVX-LABEL: @logical_and_icmp_
diff _preds(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; AVX-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; AVX-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
More information about the llvm-commits
mailing list