[llvm] [SLP]: Introduce and use getDataFlowCost (PR #112999)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 14:37:28 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/112999
>From bbda2b1f8662bc65ad6aeb9d199dc362cbcbafd1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 16 Oct 2024 10:54:39 -0700
Subject: [PATCH 1/2] [SLP] NFC: Introduce and use getDataFlowCost
Change-Id: I6a9155f4af3f8ccc943ab9d46c07dab07dc9b5c5
---
.../llvm/Analysis/TargetTransformInfo.h | 13 +++
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 7 ++
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 8 ++
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 +
.../Transforms/Vectorize/SLPVectorizer.cpp | 91 +++++++++++++++++--
7 files changed, 123 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
Function *F, Type *RetTy, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+ /// \returns The cost of propagating Type \p DataType through Basic Block /
+ /// function boundaries. If \p IsCallingConv is specified, then \p DataType is
+ /// associated with either a function argument or return. Otherwise, \p
+ /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+ /// (this is relevant because SelectionDAG builder may, for example, scalarize
+ /// illegal vectors across blocks, which introduces extract/insert code).
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
/// \returns The number of pieces into which the provided type must be
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
+ virtual InstructionCost getDataFlowCost(Type *DataType,
+ bool IsCallingConv) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) override {
return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+ return Impl.getDataFlowCost(DataType, IsCallingConv);
+ }
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+ return 0;
+ }
+
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 10;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+ return 0;
+ }
+
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
return Cost;
}
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) const {
+ InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..5d58cc62dbde09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) {
+ if (isTypeLegal(DataType) || IsCallingConv)
+ return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+ return getNumberOfParts(DataType);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..c195c860075eb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+ unsigned ScalarSize,
+ TargetTransformInfo *TTI) {
+ SetVector<Value *> ArgRoots;
+ for (unsigned I = 0; I < ScalarSize; I++) {
+ auto *Scalar = Scalars[I];
+ if (!Scalar)
+ continue;
+ auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ if (!EE)
+ continue;
+
+ auto *Vec = EE->getOperand(0);
+ if (!Vec->getType()->isVectorTy())
+ continue;
+
+ auto F = EE->getFunction();
+ auto FoundIt = find_if(
+ F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+ if (FoundIt == F->arg_end())
+ continue;
+
+ if (!ArgRoots.contains(Vec))
+ ArgRoots.insert(Vec);
+ }
+
+ if (!ArgRoots.size())
+ return 0;
+
+ unsigned Cost = 0;
+ for (auto ArgOp : ArgRoots) {
+ Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+ .getValue()
+ .value_or(0);
+ }
+ return Cost;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
- return 0;
+ return CommonCost;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+ return CommonCost +
+ processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- InstructionCost CommonCost = 0;
SmallVector<int> Mask;
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+
+ // Calculate the cost difference of propagating a vector vs series of scalars
+ // across blocks. This may be nonzero in the case of illegal vectors.
+ Instruction *VL0 = TE.getMainOp();
+ bool IsAPhi = VL0 && isa<PHINode>(VL0);
+ bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+ bool LiveThru = false;
+ if (HasNextEntry) {
+ Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+ LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+ }
+ if (IsAPhi || LiveThru) {
+ VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+ Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv*/ false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow =
+ TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
- !ExtractCostCalculated.insert(EU.Scalar).second)
+ !ExtractCostCalculated.insert(EU.Scalar).second) {
continue;
+ }
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
+ // Account for any additional costs required by CallingConvention for the
+ // type.
+ if (isa_and_nonnull<ReturnInst>(EU.User)) {
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+ continue;
+ }
+
// No extract cost for vector "scalar"
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
return Cost;
}
>From 13c369a0025f2e69212cec197504ca14474a7ccd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 21 Oct 2024 13:24:36 -0700
Subject: [PATCH 2/2] Defer getCCCostFromScalars / review comments
Change-Id: I5c7ee6604012880bd96d137c69d3d8f6fb6ff1f8
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 106 ++++++------------
1 file changed, 34 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 42617eb4cf2095..3a66dad1d10e64 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,51 +9044,6 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
-// The cost model may determine that vectorizing and eliminating a series of
-// ExtractElements is beneficial. However, if the input vector is a function
-// argument, the calling convention may require extractions in the generated
-// code. In this scenario, vectorization would then not eliminate the
-// ExtractElement sequence, but would add additional vectorization code.
-// getCCCostFromScalars does the proper accounting for this.
-static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
- unsigned ScalarSize,
- TargetTransformInfo *TTI) {
- SetVector<Value *> ArgRoots;
- for (unsigned I = 0; I < ScalarSize; I++) {
- auto *Scalar = Scalars[I];
- if (!Scalar)
- continue;
- auto *EE = dyn_cast<ExtractElementInst>(Scalar);
- if (!EE)
- continue;
-
- auto *Vec = EE->getOperand(0);
- if (!Vec->getType()->isVectorTy())
- continue;
-
- auto F = EE->getFunction();
- auto FoundIt = find_if(
- F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
-
- if (FoundIt == F->arg_end())
- continue;
-
- if (!ArgRoots.contains(Vec))
- ArgRoots.insert(Vec);
- }
-
- if (!ArgRoots.size())
- return 0;
-
- unsigned Cost = 0;
- for (auto ArgOp : ArgRoots) {
- Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
- .getValue()
- .value_or(0);
- }
- return Cost;
-}
-
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9120,7 +9075,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
+ InstructionCost CommonCost = 0;
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return CommonCost;
@@ -9268,6 +9223,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
OpTE->Scalars.size());
}
+ // Calculate the cost difference of propagating a vector vs series of
+ // scalars across blocks. This may be nonzero in the case of illegal
+ // vectors.
+ Type *ScalarTy = VL0->getType()->getScalarType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ ScalarCost += TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv=*/false) *
+ EntryVF;
+ CommonCost += TTI->getDataFlowCost(
+ FixedVectorType::get(ScalarTy, EntryVF), /*IsCallingConv=*/false);
+ }
+
return CommonCost - ScalarCost;
}
case Instruction::ExtractValue:
@@ -10291,24 +10258,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Calculate the cost difference of propagating a vector vs series of scalars
// across blocks. This may be nonzero in the case of illegal vectors.
Instruction *VL0 = TE.getMainOp();
- bool IsAPhi = VL0 && isa<PHINode>(VL0);
- bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
- bool LiveThru = false;
- if (HasNextEntry) {
+ if (VL0 && ((I + 1) < VectorizableTree.size())) {
Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
- LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
- }
- if (IsAPhi || LiveThru) {
- VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
- Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
- if (ScalarTy && isValidElementType(ScalarTy)) {
- InstructionCost ScalarDFlow =
- TTI->getDataFlowCost(ScalarTy,
- /*IsCallingConv*/ false) *
- TE.getVectorFactor();
- InstructionCost VectorDFlow =
- TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
- Cost += (VectorDFlow - ScalarDFlow);
+ if (VL1 && (VL0->getParent() != VL1->getParent())) {
+ Type *ScalarTy = VL0->getType()->getScalarType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv=*/false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow = TTI->getDataFlowCost(
+ FixedVectorType::get(ScalarTy, TE.getVectorFactor()),
+ /*IsCallingConv=*/false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
}
}
@@ -10338,17 +10301,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
- // Account for any additional costs required by CallingConvention for the
- // type.
- if (isa_and_nonnull<ReturnInst>(EU.User)) {
- Cost +=
- TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
- continue;
- }
-
// No extract cost for vector "scalar"
- if (isa<FixedVectorType>(EU.Scalar->getType()))
+ if (isa<FixedVectorType>(EU.Scalar->getType())) {
+ // Account for any additional costs required by CallingConvention for the
+ // type.
+ if (isa_and_nonnull<ReturnInst>(EU.User))
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv=*/true);
continue;
+ }
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
@@ -10646,6 +10607,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
+
return Cost;
}
More information about the llvm-commits
mailing list