[llvm] [VPlan] Implement VPReductionRecipe::computeCost(). NFC (PR #107790)

Elvis Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 3 18:05:21 PDT 2024


https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/107790

>From 4aba2d3cff75caade80e01b4635844ac149335a2 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Wed, 4 Sep 2024 20:52:14 -0700
Subject: [PATCH 1/4] [VPlan] Implement VPReductionRecipe::computeCost(). NFC

Implementation of `computeCost()` function for `VPReductionRecipe`.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  4 ++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 24 +++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c4567362eaffc7..56a2d074991684 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2393,6 +2393,10 @@ class VPReductionRecipe : public VPSingleDefRecipe {
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of VPReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 75908638532950..8137e4f4579fc8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2022,6 +2022,30 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
   State.set(this, NewRed, /*IsScalar*/ true);
 }
 
+InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
+                                               VPCostContext &Ctx) const {
+  RecurKind RdxKind = RdxDesc.getRecurrenceKind();
+  Type *ElementTy = RdxDesc.getRecurrenceType();
+  auto *VectorTy = dyn_cast<VectorType>(ToVectorTy(ElementTy, VF));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  unsigned Opcode = RdxDesc.getOpcode();
+
+  if (VectorTy == nullptr)
+    return InstructionCost::getInvalid();
+
+  // Cost = Reduction cost + BinOp cost
+  InstructionCost Cost =
+      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
+  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
+    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
+    return Cost + Ctx.TTI.getMinMaxReductionCost(
+                      Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  }
+
+  return Cost + Ctx.TTI.getArithmeticReductionCost(
+                    Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {

>From f2c2b8ff1a89885f48788a98679a660825fcbf4e Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Tue, 24 Sep 2024 23:31:24 -0700
Subject: [PATCH 2/4] Address comments.

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8137e4f4579fc8..01e4a4d9494abd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2026,13 +2026,10 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
   Type *ElementTy = RdxDesc.getRecurrenceType();
-  auto *VectorTy = dyn_cast<VectorType>(ToVectorTy(ElementTy, VF));
+  auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF));
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   unsigned Opcode = RdxDesc.getOpcode();
 
-  if (VectorTy == nullptr)
-    return InstructionCost::getInvalid();
-
   // Cost = Reduction cost + BinOp cost
   InstructionCost Cost =
       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);

>From 6a388173e8ae1480b08b5c8fa58ef4fa6597fa2d Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Thu, 3 Oct 2024 00:56:56 -0700
Subject: [PATCH 3/4] Recognize the reduction patterns from D93476 in
 computeCost().

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  15 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 151 +++++++++++++++++-
 2 files changed, 159 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 792e0e17dd8719..39d95856c226ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7220,6 +7220,9 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                                  ChainOps.end());
     // Also include the operands of instructions in the chain, as the cost-model
     // may mark extends as free.
+    // We only handle the reduction cost in the VPlan-based cost model
+    // currently.
+    // TODO: Handle this calculation in VPWidenRecipe and VPWidenCastRecipe.
     for (auto *ChainOp : ChainOps) {
       for (Value *Op : ChainOp->operands()) {
         if (auto *I = dyn_cast<Instruction>(Op))
@@ -7227,6 +7230,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
       }
     }
 
+    // Since we implemented the reduction cost for the VPReductionRecipe,
+    // remove the instruction here to prevent VPReductionRecipe::computeCost
+    // from being skipped.
+    // TODO: Remove the following checks when we can fully support reduction
+    // pattern cost in the VPlan-based cost model.
+    for (auto *I : ChainOpsAndOperands) {
+      if (I->getOpcode() == RdxDesc.getOpcode()) {
+        ChainOpsAndOperands.remove(I);
+        break;
+      }
+    }
+
     // Pre-compute the cost for I, if it has a reduction pattern cost.
     for (Instruction *I : ChainOpsAndOperands) {
       auto ReductionCost = CM.getReductionPatternCost(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 01e4a4d9494abd..1edf524c7f35ed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2022,6 +2022,11 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
   State.set(this, NewRed, /*IsScalar*/ true);
 }
 
+static bool isZExtOrSExt(Instruction::CastOps CastOpcode) {
+  return CastOpcode == Instruction::CastOps::ZExt ||
+         CastOpcode == Instruction::CastOps::SExt;
+}
+
 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
@@ -2030,17 +2035,149 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   unsigned Opcode = RdxDesc.getOpcode();
 
-  // Cost = Reduction cost + BinOp cost
-  InstructionCost Cost =
-      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
+  InstructionCost BaseCost;
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-    return Cost + Ctx.TTI.getMinMaxReductionCost(
-                      Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+    BaseCost = Ctx.TTI.getMinMaxReductionCost(
+        Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  } else {
+    BaseCost = Ctx.TTI.getArithmeticReductionCost(
+        Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  }
+
+  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
+  // normal fmul instruction to the cost of the fadd reduction.
+  if (RdxKind == RecurKind::FMulAdd)
+    BaseCost +=
+        Ctx.TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
+
+  // If we're using ordered reductions then we can just return the base cost
+  // here, since getArithmeticReductionCost calculates the full ordered
+  // reduction cost when FP reassociation is not allowed.
+  if (IsOrdered && Opcode == Instruction::FAdd)
+    return BaseCost;
+
+  // Special case for arm from D93476
+  // The reduction instruction can be substituted in following condition.
+  //
+  //       %sa = sext <16 x i8> A to <16 x i32>
+  //       %sb = sext <16 x i8> B to <16 x i32>
+  //       %m = mul <16 x i32> %sa, %sb
+  //       %r = vecreduce.add(%m)
+  //       ->
+  //       R = VMLADAV A, B
+  //
+  // There are other instructions for performing add reductions of
+  // v4i32/v8i16/v16i8 into i32 (VADDV), for doing the same with v4i32->i64
+  // (VADDLV) and for performing a v4i32/v8i16 MLA into an i64 (VMLALDAV).
+  //
+  // We are looking for a pattern of, and finding the minimal acceptable cost:
+  //       reduce.add(ext(mul(ext(A), ext(B)))) or
+  //       reduce(ext(A)) or
+  //       reduce.add(mul(ext(A), ext(B))) or
+  //       reduce.add(mul(A, B)) or
+  //       reduce(A).
+
+  // Try to match reduce(ext(...))
+  auto *Ext = dyn_cast<VPWidenCastRecipe>(getVecOp());
+  if (Ext && isZExtOrSExt(Ext->getOpcode())) {
+    bool isUnsigned = Ext->getOpcode() == Instruction::CastOps::ZExt;
+
+    // Try to match reduce.add(ext(mul(...)))
+    auto *ExtTy = cast<VectorType>(
+        ToVectorTy(Ext->getOperand(0)->getUnderlyingValue()->getType(), VF));
+    auto *Mul = dyn_cast_if_present<VPWidenRecipe>(
+        Ext->getOperand(0)->getDefiningRecipe());
+    if (Mul && Mul->getOpcode() == Instruction::Mul &&
+        Opcode == Instruction::Add) {
+      auto *MulTy = cast<VectorType>(
+          ToVectorTy(Mul->getUnderlyingValue()->getType(), VF));
+      auto *InnerExt0 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(0));
+      auto *InnerExt1 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(1));
+
+      // Match reduce.add(ext(mul(ext(A), ext(B))))
+      if (InnerExt0 && isZExtOrSExt(InnerExt0->getOpcode()) && InnerExt1 &&
+          isZExtOrSExt(InnerExt1->getOpcode()) &&
+          InnerExt0->getOpcode() == InnerExt1->getOpcode()) {
+        Type *InnerExt0Ty =
+            InnerExt0->getOperand(0)->getUnderlyingValue()->getType();
+        Type *InnerExt1Ty =
+            InnerExt1->getOperand(0)->getUnderlyingValue()->getType();
+        // Get the largest type.
+        auto *MaxExtVecTy = cast<VectorType>(
+            ToVectorTy(InnerExt0Ty->getIntegerBitWidth() >
+                               InnerExt1Ty->getIntegerBitWidth()
+                           ? InnerExt0Ty
+                           : InnerExt1Ty,
+                       VF));
+        InstructionCost RedCost = Ctx.TTI.getMulAccReductionCost(
+            isUnsigned, ElementTy, MaxExtVecTy, CostKind);
+        InstructionCost InnerExtCost =
+            Ctx.TTI.getCastInstrCost(InnerExt0->getOpcode(), MulTy, MaxExtVecTy,
+                                     TTI::CastContextHint::None, CostKind);
+        InstructionCost MulCost =
+            Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, MulTy, CostKind);
+        InstructionCost ExtCost =
+            Ctx.TTI.getCastInstrCost(Ext->getOpcode(), VectorTy, ExtTy,
+                                     TTI::CastContextHint::None, CostKind);
+        if (RedCost.isValid() &&
+            RedCost < InnerExtCost * 2 + MulCost + ExtCost + BaseCost)
+          return RedCost;
+      }
+    }
+
+    // Match reduce(ext(A))
+    InstructionCost RedCost =
+        Ctx.TTI.getExtendedReductionCost(Opcode, isUnsigned, ElementTy, ExtTy,
+                                         RdxDesc.getFastMathFlags(), CostKind);
+    InstructionCost ExtCost =
+        Ctx.TTI.getCastInstrCost(Ext->getOpcode(), VectorTy, ExtTy,
+                                 TTI::CastContextHint::None, CostKind);
+    if (RedCost.isValid() && RedCost < ExtCost + BaseCost)
+      return RedCost;
+  }
+
+  // Try to match reduce.add(mul(...))
+  auto *Mul =
+      dyn_cast_if_present<VPWidenRecipe>(getVecOp()->getDefiningRecipe());
+  if (Mul && Mul->getOpcode() == Instruction::Mul &&
+      Opcode == Instruction::Add) {
+    // Match reduce.add(mul(ext(A), ext(B)))
+    auto *InnerExt0 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(0));
+    auto *InnerExt1 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(1));
+    auto *MulTy =
+        cast<VectorType>(ToVectorTy(Mul->getUnderlyingValue()->getType(), VF));
+    InstructionCost MulCost =
+        Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, MulTy, CostKind);
+    if (InnerExt0 && isZExtOrSExt(InnerExt0->getOpcode()) && InnerExt1 &&
+        InnerExt0->getOpcode() == InnerExt1->getOpcode()) {
+      Type *InnerExt0Ty =
+          InnerExt0->getOperand(0)->getUnderlyingValue()->getType();
+      Type *InnerExt1Ty =
+          InnerExt1->getOperand(0)->getUnderlyingValue()->getType();
+      auto *MaxInnerExtVecTy = cast<VectorType>(ToVectorTy(
+          InnerExt0Ty->getIntegerBitWidth() > InnerExt1Ty->getIntegerBitWidth()
+              ? InnerExt0Ty
+              : InnerExt1Ty,
+          VF));
+      bool isUnsigned = InnerExt0->getOpcode() == Instruction::CastOps::ZExt;
+      InstructionCost RedCost = Ctx.TTI.getMulAccReductionCost(
+          isUnsigned, ElementTy, MaxInnerExtVecTy, CostKind);
+      InstructionCost InnerExtCost = Ctx.TTI.getCastInstrCost(
+          InnerExt0->getOpcode(), MulTy, MaxInnerExtVecTy,
+          TTI::CastContextHint::None, CostKind);
+      if (RedCost.isValid() && RedCost < BaseCost + MulCost + 2 * InnerExtCost)
+        return RedCost;
+    }
+    // Match reduce.add(mul)
+    InstructionCost RedCost =
+        Ctx.TTI.getMulAccReductionCost(true, ElementTy, VectorTy, CostKind);
+    if (RedCost.isValid() && RedCost < BaseCost + MulCost)
+      return RedCost;
   }
 
-  return Cost + Ctx.TTI.getArithmeticReductionCost(
-                    Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  // Normal cost = Reduction cost + BinOp cost
+  return BaseCost + Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

>From faa86e500720fa73e3fa41800a6a042313069b33 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Thu, 3 Oct 2024 17:49:09 -0700
Subject: [PATCH 4/4] Address comments and use inferScalarType

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1edf524c7f35ed..d07826be40d45e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2030,11 +2030,19 @@ static bool isZExtOrSExt(Instruction::CastOps CastOpcode) {
 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
-  Type *ElementTy = RdxDesc.getRecurrenceType();
+  Type *ElementTy = Ctx.Types.inferScalarType(this->getVPSingleValue());
+  assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID());
+
   auto *VectorTy = cast<VectorType>(ToVectorTy(ElementTy, VF));
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   unsigned Opcode = RdxDesc.getOpcode();
 
+  // TODO: Remove the assertion when we support any-of reduction in the
+  // VPlan-based cost model.
+  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(
+             RdxDesc.getRecurrenceKind()) &&
+         "VPlan-base cost model not support any-of reduction.");
+
   InstructionCost BaseCost;
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
@@ -2085,13 +2093,13 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
 
     // Try to match reduce.add(ext(mul(...)))
     auto *ExtTy = cast<VectorType>(
-        ToVectorTy(Ext->getOperand(0)->getUnderlyingValue()->getType(), VF));
+        ToVectorTy(Ctx.Types.inferScalarType(Ext->getOperand(0)), VF));
     auto *Mul = dyn_cast_if_present<VPWidenRecipe>(
         Ext->getOperand(0)->getDefiningRecipe());
     if (Mul && Mul->getOpcode() == Instruction::Mul &&
         Opcode == Instruction::Add) {
       auto *MulTy = cast<VectorType>(
-          ToVectorTy(Mul->getUnderlyingValue()->getType(), VF));
+          ToVectorTy(Ctx.Types.inferScalarType(Mul->getVPSingleValue()), VF));
       auto *InnerExt0 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(0));
       auto *InnerExt1 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(1));
 
@@ -2099,10 +2107,8 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
       if (InnerExt0 && isZExtOrSExt(InnerExt0->getOpcode()) && InnerExt1 &&
           isZExtOrSExt(InnerExt1->getOpcode()) &&
           InnerExt0->getOpcode() == InnerExt1->getOpcode()) {
-        Type *InnerExt0Ty =
-            InnerExt0->getOperand(0)->getUnderlyingValue()->getType();
-        Type *InnerExt1Ty =
-            InnerExt1->getOperand(0)->getUnderlyingValue()->getType();
+        Type *InnerExt0Ty = Ctx.Types.inferScalarType(InnerExt0->getOperand(0));
+        Type *InnerExt1Ty = Ctx.Types.inferScalarType(InnerExt1->getOperand(0));
         // Get the largest type.
         auto *MaxExtVecTy = cast<VectorType>(
             ToVectorTy(InnerExt0Ty->getIntegerBitWidth() >
@@ -2145,16 +2151,14 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
     // Match reduce.add(mul(ext(A), ext(B)))
     auto *InnerExt0 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(0));
     auto *InnerExt1 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(1));
-    auto *MulTy =
-        cast<VectorType>(ToVectorTy(Mul->getUnderlyingValue()->getType(), VF));
+    auto *MulTy = cast<VectorType>(
+        ToVectorTy(Ctx.Types.inferScalarType(Mul->getVPSingleValue()), VF));
     InstructionCost MulCost =
         Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, MulTy, CostKind);
     if (InnerExt0 && isZExtOrSExt(InnerExt0->getOpcode()) && InnerExt1 &&
         InnerExt0->getOpcode() == InnerExt1->getOpcode()) {
-      Type *InnerExt0Ty =
-          InnerExt0->getOperand(0)->getUnderlyingValue()->getType();
-      Type *InnerExt1Ty =
-          InnerExt1->getOperand(0)->getUnderlyingValue()->getType();
+      Type *InnerExt0Ty = Ctx.Types.inferScalarType(InnerExt0->getOperand(0));
+      Type *InnerExt1Ty = Ctx.Types.inferScalarType(InnerExt1->getOperand(0));
       auto *MaxInnerExtVecTy = cast<VectorType>(ToVectorTy(
           InnerExt0Ty->getIntegerBitWidth() > InnerExt1Ty->getIntegerBitWidth()
               ? InnerExt0Ty



More information about the llvm-commits mailing list