[llvm] 8c48d77 - [SLP]Improve cost estimation/emission of externally used extractelements.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 3 10:28:04 PDT 2021
Author: Alexey Bataev
Date: 2021-06-03T10:26:59-07:00
New Revision: 8c48d77cdfe5c286dc98b9bf06bd2939d00c4bb4
URL: https://github.com/llvm/llvm-project/commit/8c48d77cdfe5c286dc98b9bf06bd2939d00c4bb4
DIFF: https://github.com/llvm/llvm-project/commit/8c48d77cdfe5c286dc98b9bf06bd2939d00c4bb4.diff
LOG: [SLP]Improve cost estimation/emission of externally used extractelements.
There is no need to recalculate the cost of extractelements. Instead, do
not compensate for the cost of every extractelement unconditionally:
first check whether the instruction is actually going to be removed
during vectorization. Also, there is no need to generate a new
extractelement instruction for an external use; we can simply regenerate
the original one. This may improve the final vectorization.
Differential Revision: https://reviews.llvm.org/D102933
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e628b122c9d8..b73a2377f1b0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -635,7 +635,7 @@ class BoUpSLP {
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- InstructionCost getTreeCost();
+ InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -1549,10 +1549,12 @@ class BoUpSLP {
private:
/// Checks if all users of \p I are the part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I) const;
+ bool areAllUsersVectorized(Instruction *I,
+ ArrayRef<Value *> VectorizedVals) const;
/// \returns the cost of the vectorizable entry.
- InstructionCost getEntryCost(const TreeEntry *E);
+ InstructionCost getEntryCost(const TreeEntry *E,
+ ArrayRef<Value *> VectorizedVals);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -3505,8 +3507,10 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
return ShouldKeepOrder;
}
-bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
- return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
+bool BoUpSLP::areAllUsersVectorized(Instruction *I,
+ ArrayRef<Value *> VectorizedVals) const {
+ return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
+ llvm::all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
@@ -3597,7 +3601,8 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
return Cost;
}
-InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
+InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
+ ArrayRef<Value *> VectorizedVals) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
@@ -3626,16 +3631,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
}
// FIXME: it tries to fix a problem with MSVC buildbots.
TargetTransformInfo &TTIRef = *TTI;
- auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL,
- VecTy](InstructionCost &Cost, bool IsGather) {
+ auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
+ VectorizedVals](InstructionCost &Cost,
+ bool IsGather) {
DenseMap<Value *, int> ExtractVectorsTys;
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
- if (IsGather && (!areAllUsersVectorized(cast<Instruction>(V)) ||
- ScalarToTreeEntry.count(V)))
+ if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ (IsGather && ScalarToTreeEntry.count(V)))
continue;
auto *EE = cast<ExtractElementInst>(V);
unsigned Idx = *getExtractIndex(EE);
@@ -4389,7 +4395,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
-InstructionCost BoUpSLP::getTreeCost() {
+InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -4399,7 +4405,7 @@ InstructionCost BoUpSLP::getTreeCost() {
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I].get();
- InstructionCost C = getEntryCost(&TE);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
@@ -4429,6 +4435,11 @@ InstructionCost BoUpSLP::getTreeCost() {
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
+ // Already counted the cost for external uses when tried to adjust the cost
+ // for extractelements, no need to add it again.
+ if (isa<ExtractElementInst>(EU.Scalar))
+ continue;
+
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
if (EU.User && isa<InsertElementInst>(EU.User)) {
@@ -5566,7 +5577,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Value *Lane = Builder.getInt32(ExternalUse.Lane);
auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
if (Scalar->getType() != Vec->getType()) {
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Value *Ex;
+ // "Reuse" the existing extract to improve final codegen.
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ Ex = Builder.CreateExtractElement(ES->getOperand(0),
+ ES->getOperand(1));
+ } else {
+ Ex = Builder.CreateExtractElement(Vec, Lane);
+ }
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
if (!MinBWs.count(ScalarRoot))
@@ -5574,12 +5592,11 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, Scalar->getType());
return Builder.CreateZExt(Ex, Scalar->getType());
- } else {
- assert(isa<FixedVectorType>(Scalar->getType()) &&
- isa<InsertElementInst>(Scalar) &&
- "In-tree scalar of vector type is not insertelement?");
- return Vec;
}
+ assert(isa<FixedVectorType>(Scalar->getType()) &&
+ isa<InsertElementInst>(Scalar) &&
+ "In-tree scalar of vector type is not insertelement?");
+ return Vec;
};
// If User == nullptr, the Scalar is used as extra arg. Generate
// ExtractElement instruction and update the record for this scalar in
@@ -7651,7 +7668,8 @@ class HorizontalReduction {
V.computeMinimumValueSizes();
// Estimate cost.
- InstructionCost TreeCost = V.getTreeCost();
+ InstructionCost TreeCost =
+ V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
InstructionCost ReductionCost =
getReductionCost(TTI, ReducedVals[i], ReduxWidth);
InstructionCost Cost = TreeCost + ReductionCost;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
index d1754c0bbc54..add973306f6b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
@@ -8,7 +8,7 @@ define void @f1(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0
; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
@@ -40,7 +40,7 @@ define void @f2(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0
; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
@@ -87,7 +87,7 @@ define void @f3(<2 x i16> %x, i16* %a) {
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1
; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
index 756676d7ee14..cfd6f59cb774 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -143,9 +143,9 @@ define void @extract_reverse_order(<2 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
; CHECK-NEXT: call void @use(double [[TMP3]])
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
; CHECK-NEXT: call void @use(double [[TMP4]])
; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
More information about the llvm-commits
mailing list