[llvm] 88b9e46 - [SLP] Steer for the best chance in tryToVectorize() when rooting with binary ops.

Mon Apr 25 12:25:57 PDT 2022

Author: Valery N Dmitriev
Date: 2022-04-25T12:25:33-07:00
New Revision: 88b9e46fb54c5321742d16016be5b2920facc018

URL: https://github.com/llvm/llvm-project/commit/88b9e46fb54c5321742d16016be5b2920facc018
DIFF: https://github.com/llvm/llvm-project/commit/88b9e46fb54c5321742d16016be5b2920facc018.diff

LOG: [SLP] Steer for the best chance in tryToVectorize() when rooting with binary ops.

tryToVectorize() method implements one of searching paths for vectorizable tree roots in SLP vectorizer,
specifically for binary and comparison operations. Order of making probes for various scalar pairs
was defined by its implementation: the instruction operands, then climb over one operand if
the instruction is its sole user and then perform same actions for another operand if previous
attempts failed. Problem with this approach is that among these options we can have more than a
single vectorizable tree candidate and it is not necessarily the one that encountered first.
Trying to build vectorizable tree for each possible combination for just evaluation is expensive.
But we already have lookahead heuristics mechanism which we use for finding best pick among
operands of commutative instructions. It calculates cumulative score for candidates in two
consecutive lanes. This patch introduces use of the heuristics for choosing the best pair among
several combinations. We only try one that looks as most promising for vectorization.
Additional benefit is that we reduce total number of vectorization trees built for probes
because we skip those looking non-profitable early.

Reviewed By: Alexey Bataev (ABataev), Vasileios Porpodas (vporpo)
Differential Revision: https://reviews.llvm.org/D124309

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ce088827b5eda..74eb3f29a8843 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -164,6 +164,15 @@ static cl::opt<int> LookAheadMaxDepth(
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));
 
+// The maximum depth that the look-ahead score heuristic will explore
+// when it probing among candidates for vectorization tree roots.
+// The higher this value, the higher the compilation time overhead but unlike
+// similar limit for operands ordering this is less frequently used, hence
+// impact of higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxDepth(
+    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
+    cl::desc("The maximum look-ahead depth for searching best rooting option"));
+
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
@@ -2000,6 +2009,29 @@ class BoUpSLP {
 #endif
   };
 
+  /// Evaluate each pair in \p Candidates and return index into \p Candidates
+  /// for a pair which have highest score deemed to have best chance to form
+  /// root of profitable tree to vectorize. Return None if no candidate scored
+  /// above the LookAheadHeuristics::ScoreFail.
+  Optional<int>
+  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
+    LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
+                                  RootLookAheadMaxDepth);
+    int BestScore = LookAheadHeuristics::ScoreFail;
+    Optional<int> Index = None;
+    for (int I : seq<int>(0, Candidates.size())) {
+      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
+                                               Candidates[I].second,
+                                               /*U1=*/nullptr, /*U2=*/nullptr,
+                                               /*Level=*/1, None);
+      if (Score > BestScore) {
+        BestScore = Score;
+        Index = I;
+      }
+    }
+    return Index;
+  }
+
   /// Checks if the instruction is marked for deletion.
   bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
 
@@ -9150,7 +9182,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   if (!I)
     return false;
 
-  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+  if ((!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) ||
+      isa<VectorType>(I->getType()))
     return false;
 
   Value *P = I->getParent();
@@ -9161,32 +9194,40 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
     return false;
 
-  // Try to vectorize V.
-  if (tryToVectorizePair(Op0, Op1, R))
-    return true;
+  // First collect all possible candidates
+  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+  Candidates.emplace_back(Op0, Op1);
 
   auto *A = dyn_cast<BinaryOperator>(Op0);
   auto *B = dyn_cast<BinaryOperator>(Op1);
   // Try to skip B.
-  if (B && B->hasOneUse()) {
+  if (A && B && B->hasOneUse()) {
     auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
     auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
-      return true;
-    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
-      return true;
+    if (B0 && B0->getParent() == P)
+      Candidates.emplace_back(A, B0);
+    if (B1 && B1->getParent() == P)
+      Candidates.emplace_back(A, B1);
   }
-
   // Try to skip A.
-  if (A && A->hasOneUse()) {
+  if (B && A && A->hasOneUse()) {
     auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
     auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
-      return true;
-    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
-      return true;
+    if (A0 && A0->getParent() == P)
+      Candidates.emplace_back(A0, B);
+    if (A1 && A1->getParent() == P)
+      Candidates.emplace_back(A1, B);
   }
-  return false;
+
+  if (Candidates.size() == 1)
+    return tryToVectorizePair(Op0, Op1, R);
+
+  // We have multiple options. Try to pick the single best.
+  Optional<int> BestCandidate = R.findBestRootPair(Candidates);
+  if (!BestCandidate)
+    return false;
+  return tryToVectorizePair(Candidates[*BestCandidate].first,
+                            Candidates[*BestCandidate].second, R);
 }
 
 namespace {

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
index cfd2463f9d857..8c9823a3923c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S -pass-remarks-missed=slp-vectorizer 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 ; This test check that slp vectorizer is not trying to vectorize instructions already vectorized.
-; CHECK: remark: <unknown>:0:0: Cannot SLP vectorize list: type <16 x i8> is unsupported by vectorizer
 
 define void @vector() {
+; CHECK-LABEL: @vector(
+; CHECK-NEXT:    [[LOAD0:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 1)
+; CHECK-NEXT:    [[LOAD1:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 2)
+; CHECK-NEXT:    [[ADD:%.*]] = add <16 x i8> [[LOAD1]], [[LOAD0]]
+; CHECK-NEXT:    tail call void @vector.store(<16 x i8> [[ADD]], <16 x i8>* undef, i32 1)
+; CHECK-NEXT:    ret void
+;
   %load0 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 1)
   %load1 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 2)
   %add = add <16 x i8> %load1, %load0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index b97d6018c3ddb..ef45de71314b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -157,8 +157,6 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 73c4187efa8f0..e42bc3beca3da 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -192,8 +192,6 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
index 1874651446e48..9956ff5d7f02a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -17,27 +17,18 @@ define double @root_selection(double %a, double %b, double %c, double %d) local_
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], <double 7.000000e+00, double 5.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[I11:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT:    [[I09:%.*]] = fmul fast double [[TMP4]], undef
+; CHECK-NEXT:    [[I10:%.*]] = fsub fast double undef, [[I09]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], <double 3.000000e+00, double undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fsub fast <2 x double> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I11]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <2 x double> [[TMP12]], <double 4.000000e+00, double 1.100000e+01>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = fdiv fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <2 x double> [[TMP15]], <double 1.400000e+00, double 1.200000e+01>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT:    [[TMP20:%.*]] = fdiv fast <2 x double> [[TMP18]], <double undef, double 1.400000e+00>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP20]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x double> [[TMP21]], i32 1
-; CHECK-NEXT:    [[I16:%.*]] = fadd fast double [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[I10]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], <double undef, double 1.100000e+01>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], <double 4.000000e+00, double 1.200000e+01>
+; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], <double 1.400000e+00, double 1.400000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
+; CHECK-NEXT:    [[I07:%.*]] = fadd fast double undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1
+; CHECK-NEXT:    [[I16:%.*]] = fadd fast double [[I07]], [[TMP12]]
 ; CHECK-NEXT:    [[I17:%.*]] = fadd fast double [[I16]], [[C:%.*]]
 ; CHECK-NEXT:    [[I18:%.*]] = fadd fast double [[I17]], [[D:%.*]]
 ; CHECK-NEXT:    ret double [[I18]]