[llvm] [SLP] NFC: Introduce and use getDataFlowCost (PR #112999)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 18 16:22:36 PDT 2024


https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/112999

This adds getDataFlowCost to the cost model. For certain vector types (e.g. vectors of illegal types), there may be costs which are not currently captured by the cost model. For example, SelectionDAGBuilder will likely scalarize vectors of illegal types that cross basic block boundaries. Similar scalarization may occur when handling illegal vector arguments or return values. This scalarization is ultimately a cost of vectorization, and it should be accounted for. That said, for legal types this legalization-driven scalarization will not occur; and when it does occur, the cost of the scalarized code is generally the same as the cost of the scalar version it replaces, so the base implementation returns zero. However, AMDGPU has code in place to reduce this type of scalarization; thus the target override.
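
To make the intended use concrete, here is a minimal sketch (not part of the patch, and assuming only the hook added here) of how a cost-model client can compare carrying a vector across a block boundary against carrying its scalar pieces. The helper name dataFlowDelta is hypothetical; the arithmetic mirrors the getTreeCost change below.

  // Illustrative sketch only; dataFlowDelta is a hypothetical helper.
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Compares carrying VF scalars of type ScalarTy across a block boundary
  // against carrying the equivalent (possibly illegal) vector.
  static InstructionCost dataFlowDelta(const TargetTransformInfo &TTI,
                                       Type *ScalarTy, unsigned VF) {
    InstructionCost ScalarCost =
        TTI.getDataFlowCost(ScalarTy, /*IsCallingConv=*/false) * VF;
    InstructionCost VectorCost = TTI.getDataFlowCost(
        FixedVectorType::get(ScalarTy, VF), /*IsCallingConv=*/false);
    // A positive delta means the vector form is more expensive to propagate.
    return VectorCost - ScalarCost;
  }

Passing /*IsCallingConv=*/true instead models the argument/return case, which is what the calling-convention handling in the SLP changes below relies on.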

From bbda2b1f8662bc65ad6aeb9d199dc362cbcbafd1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 16 Oct 2024 10:54:39 -0700
Subject: [PATCH] [SLP] NFC: Introduce and use getDataFlowCost

Change-Id: I6a9155f4af3f8ccc943ab9d46c07dab07dc9b5c5
---
 .../llvm/Analysis/TargetTransformInfo.h       | 13 +++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  4 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  4 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  7 ++
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  8 ++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  2 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 91 +++++++++++++++++--
 7 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
       Function *F, Type *RetTy, ArrayRef<Type *> Tys,
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
 
+  /// \returns The cost of propagating Type \p DataType through Basic Block /
+  /// function boundaries. If \p IsCallingConv is true, then \p DataType is
+  /// associated with either a function argument or return. Otherwise, \p
+  /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+  /// (this is relevant because SelectionDAGBuilder may, for example, scalarize
+  /// illegal vectors across blocks, which introduces extract/insert code).
+  InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
   /// \returns The number of pieces into which the provided type must be
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
   virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
                                            ArrayRef<Type *> Tys,
                                            TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost getDataFlowCost(Type *DataType,
+                                          bool IsCallingConv) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
   virtual InstructionCost
   getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                    TTI::TargetCostKind CostKind) override {
     return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
   }
+  InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+    return Impl.getDataFlowCost(DataType, IsCallingConv);
+  }
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+    return 0;
+  }
+
   // Assume that we have a register of the right size for the type.
   unsigned getNumberOfParts(Type *Tp) const { return 1; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return 10;
   }
 
+  InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+    return 0;
+  }
+
   unsigned getNumberOfParts(Type *Tp) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
     return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+                                                     bool IsCallingConv) const {
+  InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return TTIImpl->getNumberOfParts(Tp);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..5d58cc62dbde09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+                                            bool IsCallingConv) {
+  if (isTypeLegal(DataType) || IsCallingConv)
+    return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+  return getNumberOfParts(DataType);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..c195c860075eb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
+  InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
   bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                      ArrayRef<unsigned> Indices = {}) const;
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
   return ArgTys;
 }
 
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+                                     unsigned ScalarSize,
+                                     TargetTransformInfo *TTI) {
+  SetVector<Value *> ArgRoots;
+  for (unsigned I = 0; I < ScalarSize; I++) {
+    auto *Scalar = Scalars[I];
+    if (!Scalar)
+      continue;
+    auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+    if (!EE)
+      continue;
+
+    auto *Vec = EE->getOperand(0);
+    if (!Vec->getType()->isVectorTy())
+      continue;
+
+    auto F = EE->getFunction();
+    auto FoundIt = find_if(
+        F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+    if (FoundIt == F->arg_end())
+      continue;
+
+    if (!ArgRoots.contains(Vec))
+      ArgRoots.insert(Vec);
+  }
+
+  if (!ArgRoots.size())
+    return 0;
+
+  unsigned Cost = 0;
+  for (auto ArgOp : ArgRoots) {
+    Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+                .getValue()
+                .value_or(0);
+  }
+  return Cost;
+}
+
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+  InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
-      return 0;
+      return CommonCost;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
-    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
-        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+    return CommonCost +
+           processBuildVector<ShuffleCostEstimator, InstructionCost>(
+               E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
   }
-  InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
     Cost += C;
+
+    // Calculate the cost difference of propagating a vector vs. a series of
+    // scalars across blocks. This may be nonzero for illegal vector types.
+    Instruction *VL0 = TE.getMainOp();
+    bool IsAPhi = VL0 && isa<PHINode>(VL0);
+    bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+    bool LiveThru = false;
+    if (HasNextEntry) {
+      Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+      LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+    }
+    if (IsAPhi || LiveThru) {
+      VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+      Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+      if (ScalarTy && isValidElementType(ScalarTy)) {
+        InstructionCost ScalarDFlow =
+            TTI->getDataFlowCost(ScalarTy,
+                                 /*IsCallingConv*/ false) *
+            TE.getVectorFactor();
+        InstructionCost VectorDFlow =
+            TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+        Cost += (VectorDFlow - ScalarDFlow);
+      }
+    }
+
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                       << shortBundleName(TE.Scalars) << ".\n"
                       << "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
-        !ExtractCostCalculated.insert(EU.Scalar).second)
+        !ExtractCostCalculated.insert(EU.Scalar).second) {
       continue;
+    }
 
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     if (EphValues.count(EU.User))
       continue;
 
+    // Account for any additional costs required by the calling convention for
+    // the type.
+    if (isa_and_nonnull<ReturnInst>(EU.User)) {
+      Cost +=
+          TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+      continue;
+    }
+
     // No extract cost for vector "scalar"
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
 #endif
-
   return Cost;
 }
 


