[llvm] 2829376 - [LV] Use VScaleForTuning to fine-tune the cost per lane.

Mon Nov 8 09:00:52 PST 2021

Author: Sander de Smalen
Date: 2021-11-08T16:59:46Z
New Revision: 2829376bb267f3364c1225ffaac8b1b8b5688ed1

URL: https://github.com/llvm/llvm-project/commit/2829376bb267f3364c1225ffaac8b1b8b5688ed1
DIFF: https://github.com/llvm/llvm-project/commit/2829376bb267f3364c1225ffaac8b1b8b5688ed1.diff

LOG: [LV] Use VScaleForTuning to fine-tune the cost per lane.

When targeting a specific CPU with scalable vectorization, the knowledge
of that particular CPU's vscale value can be used to tune the cost-model
and make the cost per lane less pessimistic.

If the target implements 'TTI.getVScaleForTuning()', the cost-per-lane
is calculated as:

  Cost / (VScaleForTuning * VF.KnownMinLanes)

Otherwise, it assumes a value of 1 meaning that the behavior
is unchanged and calculated as:

  Cost / VF.KnownMinLanes

Reviewed By: kmclaughlin, david-arm

Differential Revision: https://reviews.llvm.org/D113209

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 04399336f2cb..e3cf87612e9c 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -916,6 +916,9 @@ class TargetTransformInfo {
   ///  architectural maximum vector length, and None otherwise.
   Optional<unsigned> getMaxVScale() const;
 
+  /// \return the value of vscale to tune the cost model for.
+  Optional<unsigned> getVScaleForTuning() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1590,6 +1593,7 @@ class TargetTransformInfo::Concept {
   virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() const = 0;
   virtual Optional<unsigned> getMaxVScale() const = 0;
+  virtual Optional<unsigned> getVScaleForTuning() const = 0;
   virtual bool shouldMaximizeVectorBandwidth() const = 0;
   virtual ElementCount getMinimumVF(unsigned ElemWidth,
                                     bool IsScalable) const = 0;
@@ -2060,6 +2064,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   Optional<unsigned> getMaxVScale() const override {
     return Impl.getMaxVScale();
   }
+  Optional<unsigned> getVScaleForTuning() const override {
+    return Impl.getVScaleForTuning();
+  }
   bool shouldMaximizeVectorBandwidth() const override {
     return Impl.shouldMaximizeVectorBandwidth();
   }

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c0efa0337916..6f02b88e17db 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -399,6 +399,7 @@ class TargetTransformInfoImplBase {
   unsigned getMinVectorRegisterBitWidth() const { return 128; }
 
   Optional<unsigned> getMaxVScale() const { return None; }
+  Optional<unsigned> getVScaleForTuning() const { return None; }
 
   bool shouldMaximizeVectorBandwidth() const { return false; }
 

diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f297a367d366..8f43caf1eb28 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -665,6 +665,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
 
   Optional<unsigned> getMaxVScale() const { return None; }
+  Optional<unsigned> getVScaleForTuning() const { return None; }
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 41fe2f641301..3200c72546b7 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -604,6 +604,10 @@ Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
   return TTIImpl->getMaxVScale();
 }
 
+Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
+  return TTIImpl->getVScaleForTuning();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
   return TTIImpl->shouldMaximizeVectorBandwidth();
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index f7de7c0ee176..c4e20bb12f8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -125,6 +125,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  Optional<unsigned> getVScaleForTuning() const {
+    return ST->getVScaleForTuning();
+  }
 
   /// Try to return an estimate cost factor that can be used as a multiplier
   /// when scalarizing an operation for a vector with ElementCount \p VF.

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ecce9e274c6..ec8105aa6122 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6021,19 +6021,27 @@ bool LoopVectorizationCostModel::isMoreProfitable(
     return RTCostA < RTCostB;
   }
 
-  // When set to preferred, for now assume vscale may be larger than 1, so
-  // that scalable vectorization is slightly favorable over fixed-width
-  // vectorization.
+  // Improve estimate for the vector width if it is scalable.
+  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+    if (A.Width.isScalable())
+      EstimatedWidthA *= VScale.getValue();
+    if (B.Width.isScalable())
+      EstimatedWidthB *= VScale.getValue();
+  }
+
+  // When set to preferred, for now assume vscale may be larger than 1 (or the
+  // one being tuned for), so that scalable vectorization is slightly favorable
+  // over fixed-width vectorization.
   if (Hints->isScalableVectorizationPreferred())
     if (A.Width.isScalable() && !B.Width.isScalable())
-      return (CostA * B.Width.getKnownMinValue()) <=
-             (CostB * A.Width.getKnownMinValue());
+      return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
 
   // To avoid the need for FP division:
   //      (CostA / A.Width) < (CostB / B.Width)
   // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * B.Width.getKnownMinValue()) <
-         (CostB * A.Width.getKnownMinValue());
+  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
 }
 
 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6063,11 +6071,22 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
 
     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
     VectorizationFactor Candidate(i, C.first);
-    LLVM_DEBUG(
-        dbgs() << "LV: Vector loop of width " << i << " costs: "
-               << (Candidate.Cost / Candidate.Width.getKnownMinValue())
-               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
-               << ".\n");
+
+#ifndef NDEBUG
+    unsigned AssumedMinimumVscale = 1;
+    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+      AssumedMinimumVscale = VScale.getValue();
+    unsigned Width =
+        Candidate.Width.isScalable()
+            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+            : Candidate.Width.getFixedValue();
+    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+                      << " costs: " << (Candidate.Cost / Width));
+    if (i.isScalable())
+      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+                        << AssumedMinimumVscale << ")");
+    LLVM_DEBUG(dbgs() << ".\n");
+#endif
 
     if (!C.second && !ForceVectorization) {
       LLVM_DEBUG(

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
new file mode 100644
index 000000000000..b70510b84eb7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -0,0 +1,54 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64 -mattr=+sve -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=on \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=preferred \
+; RUN:     -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+
+; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
+; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+
+; VF-4: <4 x i32>
+; VF-VSCALE4: <vscale x 4 x i32>
+define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index 8d53ae5a0b5d..ea9860c73f99 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -10,7 +10,7 @@
 define void @test0(i32* %a, i8* %b, i32* %c) #0 {
 ; CHECK: LV: Checking a loop in "test0"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -43,7 +43,7 @@ exit:
 define void @test1(i32* %a, i8* %b) #0 {
 ; CHECK: LV: Checking a loop in "test1"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -79,7 +79,7 @@ define void @test2(i32* %a, i8* %b) #0 {
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 2
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
 ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
index 246dcd237088..73b9853b7171 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -187,9 +187,9 @@ exit:
 ; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
 ; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a more suitable value.
 ; CHECK-DBG: Found feasible scalable VF = vscale x 2
-; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-DBG: LV: Selecting VF: vscale x 2.
 ; CHECK-LABEL: @test4
-; CHECK: <4 x i32>
+; CHECK: <vscale x 2 x i32>
 define void @test4(i32* %a, i32* %b) #0 {
 entry:
   br label %loop