[llvm] b28f407 - [SLP] Improve reduction cost model for scalars.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 12 11:35:15 PDT 2023


Author: Alexey Bataev
Date: 2023-04-12T11:32:51-07:00
New Revision: b28f407df9d0f63ca85cfaeddf9effc024c071d8

URL: https://github.com/llvm/llvm-project/commit/b28f407df9d0f63ca85cfaeddf9effc024c071d8
DIFF: https://github.com/llvm/llvm-project/commit/b28f407df9d0f63ca85cfaeddf9effc024c071d8.diff

LOG: [SLP] Improve reduction cost model for scalars.

Instead of using an abstract cost for the scalar reduction ops, try to use the
cost of the actual reduction operation instructions, where possible. Also,
remove the estimation of the vectorized GEP pointers for reduced loads,
since that is already handled in the tree.

Differential Revision: https://reviews.llvm.org/D148036
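
For context on the patch below: the key change is the new EvaluateScalarCost
lambda in getReductionCost(), which prices each reduced scalar by the cost of
its actual reduction-op users, falling back to the abstract per-kind cost when
the value is reused outside the reduction chain. The following is a minimal
standalone sketch of that decision logic only; the types and helpers here
(Value, CostOfUser, GenericCost) are hypothetical stand-ins, not the LLVM APIs
used in the patch.

// Standalone sketch of the per-scalar reduction cost logic (hypothetical types).
#include <cstddef>
#include <functional>
#include <optional>
#include <vector>

struct Value {
  std::vector<const Value *> Users;  // reduction ops that consume this value
  bool hasNUsesOrMore(unsigned N) const { return Users.size() >= N; }
};

// Cost of one concrete reduction-op user, or nullopt if that user does not
// look like a plain reduction op (mirrors hasRequiredNumberOfUses).
using UserCostFn = std::function<std::optional<int>(const Value *)>;

int evaluateScalarCost(const std::vector<const Value *> &ReducedVals,
                       bool IsCmpSelMinMax, const UserCostFn &CostOfUser,
                       const std::function<int()> &GenericCost) {
  int Cost = 0;
  // The scalar cost is accumulated for N-1 elements only: the last reduced
  // value does not feed another scalar reduction op.
  std::size_t Cnt = ReducedVals.size();
  for (const Value *RdxVal : ReducedVals) {
    if (Cnt == 1)
      break;
    --Cnt;
    // Too many users: the value is reused outside the reduction chain, so
    // fall back to the abstract per-kind cost.
    if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
      Cost += GenericCost();
      continue;
    }
    // Otherwise, sum the costs of the actual reduction-op users; bail out to
    // the generic cost if any user is not a recognizable reduction op.
    int ScalarCost = 0;
    bool Valid = true;
    for (const Value *U : RdxVal->Users) {
      if (std::optional<int> C = CostOfUser(U)) {
        ScalarCost += *C;
      } else {
        Valid = false;
        break;
      }
    }
    Cost += Valid ? ScalarCost : GenericCost();
  }
  return Cost;
}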

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2a20b332ea2e1..3faa5f4b8be2f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1148,18 +1148,6 @@ class BoUpSLP {
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Checks if the very first tree node is going to be vectorized.
-  bool isVectorizedFirstNode() const {
-    return !VectorizableTree.empty() &&
-           VectorizableTree.front()->State == TreeEntry::Vectorize;
-  }
-
-  /// Returns the main instruction for the very first node.
-  Instruction *getFirstNodeMainOp() const {
-    assert(!VectorizableTree.empty() && "No tree to get the first node from");
-    return VectorizableTree.front()->getMainOp();
-  }
-
   /// Returns whether the root node has in-tree uses.
   bool doesRootHaveInTreeUses() const {
     return !VectorizableTree.empty() &&
@@ -13340,22 +13328,7 @@ class HorizontalReduction {
         // Estimate cost.
         InstructionCost TreeCost = V.getTreeCost(VL);
         InstructionCost ReductionCost =
-            getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
-        if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
-          Instruction *MainOp = V.getFirstNodeMainOp();
-          for (Value *V : VL) {
-            auto *VI = dyn_cast<LoadInst>(V);
-            // Add the costs of scalar GEP pointers, to be removed from the
-            // code.
-            if (!VI || VI == MainOp)
-              continue;
-            auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-            if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-              continue;
-            TreeCost -= TTI->getArithmeticInstrCost(
-                Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
-          }
-        }
+            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
         InstructionCost Cost = TreeCost + ReductionCost;
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
         if (!Cost.isValid())
@@ -13591,7 +13564,8 @@ class HorizontalReduction {
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
-                                   unsigned ReduxWidth, FastMathFlags FMF) {
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
@@ -13600,6 +13574,35 @@ class HorizontalReduction {
     // If all of the reduced values are constant, the vector cost is 0, since
     // the reduction value can be calculated at the compile time.
     bool AllConsts = allConstant(ReducedVals);
+    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+      InstructionCost Cost = 0;
+      // Scalar cost is repeated for N-1 elements.
+      int Cnt = ReducedVals.size();
+      for (Value *RdxVal : ReducedVals) {
+        if (Cnt == 1)
+          break;
+        --Cnt;
+        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+          Cost += GenCostFn();
+          continue;
+        }
+        InstructionCost ScalarCost = 0;
+        for (User *U : RdxVal->users()) {
+          auto *RdxOp = cast<Instruction>(U);
+          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+            continue;
+          }
+          ScalarCost = InstructionCost::getInvalid();
+          break;
+        }
+        if (ScalarCost.isValid())
+          Cost += ScalarCost;
+        else
+          Cost += GenCostFn();
+      }
+      return Cost;
+    };
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -13612,7 +13615,9 @@ class HorizontalReduction {
       if (!AllConsts)
         VectorCost =
             TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
-      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      });
       break;
     }
     case RecurKind::FMax:
@@ -13626,10 +13631,12 @@ class HorizontalReduction {
                                         /*IsUnsigned=*/false, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     case RecurKind::SMax:
@@ -13646,18 +13653,18 @@ class HorizontalReduction {
                                                  IsUnsigned, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     default:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
 
-    // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 9c456df25f225..e72b6cc1a2402 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -1113,41 +1113,18 @@ define i16 @smin_intrinsic_rdx_v8i16(ptr %p0) {
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
-; SSE2-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; SSE2-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; SSE2-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; SSE2-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; SSE2-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; SSE2-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; SSE2-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; SSE2-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; SSE2-NEXT:    ret i64 [[M]]
-;
-; SSE4-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; SSE4-NEXT:    ret i64 [[TMP2]]
-;
-; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; AVX-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; AVX-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; AVX-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; AVX-NEXT:    ret i64 [[M]]
-;
-; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; AVX2-NEXT:    ret i64 [[TMP2]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
+; DEFAULT-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
+; DEFAULT-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
+; DEFAULT-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
+; DEFAULT-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
+; DEFAULT-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
+; DEFAULT-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
+; DEFAULT-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT:    ret i64 [[M]]
 ;
 ; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
 ; THRESH-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
@@ -1252,5 +1229,3 @@ define void @PR49730() {
   %t14 = call i32 @llvm.umin.i32(i32 %t13, i32 93)
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE: {{.*}}

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
index a838621a71d4b..15538789e46e2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
@@ -22,10 +22,25 @@ define i32 @smax_v2i32(i32) {
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; SSE2-LABEL: @smax_v4i32(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE2-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE2-NEXT:    ret i32 [[TMP8]]
+;
+; SSE4-LABEL: @smax_v4i32(
+; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; SSE4-NEXT:    ret i32 [[TMP3]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
@@ -100,8 +115,3 @@ define i32 @smax_v16i32(i32) {
   %32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
   ret i32 %32
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
-; SSE2: {{.*}}
-; SSE4: {{.*}}


        

