[llvm] 2829376 - [LV] Use VScaleForTuning to fine-tune the cost per lane.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 8 09:00:52 PST 2021
Author: Sander de Smalen
Date: 2021-11-08T16:59:46Z
New Revision: 2829376bb267f3364c1225ffaac8b1b8b5688ed1
URL: https://github.com/llvm/llvm-project/commit/2829376bb267f3364c1225ffaac8b1b8b5688ed1
DIFF: https://github.com/llvm/llvm-project/commit/2829376bb267f3364c1225ffaac8b1b8b5688ed1.diff
LOG: [LV] Use VScaleForTuning to fine-tune the cost per lane.
When targeting a specific CPU with scalable vectorization, the knowledge
of that particular CPU's vscale value can be used to tune the cost-model
and make the cost per lane less pessimistic.
If the target implements 'TTI.getVScaleForTuning()', the cost-per-lane
is calculated as:
Cost / (VScaleForTuning * VF.KnownMinLanes)
Otherwise, it assumes a vscale value of 1, meaning that the behavior
is unchanged and the cost-per-lane is calculated as:
Cost / VF.KnownMinLanes
Reviewed By: kmclaughlin, david-arm
Differential Revision: https://reviews.llvm.org/D113209
Added:
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 04399336f2cb..e3cf87612e9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -916,6 +916,9 @@ class TargetTransformInfo {
/// architectural maximum vector length, and None otherwise.
Optional<unsigned> getMaxVScale() const;
+ /// \return the value of vscale to tune the cost model for.
+ Optional<unsigned> getVScaleForTuning() const;
+
/// \return True if the vectorization factor should be chosen to
/// make the vector of the smallest element type match the size of a
/// vector register. For wider element types, this could result in
@@ -1590,6 +1593,7 @@ class TargetTransformInfo::Concept {
virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0;
virtual unsigned getMinVectorRegisterBitWidth() const = 0;
virtual Optional<unsigned> getMaxVScale() const = 0;
+ virtual Optional<unsigned> getVScaleForTuning() const = 0;
virtual bool shouldMaximizeVectorBandwidth() const = 0;
virtual ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const = 0;
@@ -2060,6 +2064,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
Optional<unsigned> getMaxVScale() const override {
return Impl.getMaxVScale();
}
+ Optional<unsigned> getVScaleForTuning() const override {
+ return Impl.getVScaleForTuning();
+ }
bool shouldMaximizeVectorBandwidth() const override {
return Impl.shouldMaximizeVectorBandwidth();
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c0efa0337916..6f02b88e17db 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -399,6 +399,7 @@ class TargetTransformInfoImplBase {
unsigned getMinVectorRegisterBitWidth() const { return 128; }
Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
bool shouldMaximizeVectorBandwidth() const { return false; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f297a367d366..8f43caf1eb28 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -665,6 +665,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 41fe2f641301..3200c72546b7 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -604,6 +604,10 @@ Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
return TTIImpl->getMaxVScale();
}
+Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
+ return TTIImpl->getVScaleForTuning();
+}
+
bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
return TTIImpl->shouldMaximizeVectorBandwidth();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index f7de7c0ee176..c4e20bb12f8c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -125,6 +125,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->getMinVectorRegisterBitWidth();
}
+ Optional<unsigned> getVScaleForTuning() const {
+ return ST->getVScaleForTuning();
+ }
/// Try to return an estimate cost factor that can be used as a multiplier
/// when scalarizing an operation for a vector with ElementCount \p VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ecce9e274c6..ec8105aa6122 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6021,19 +6021,27 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return RTCostA < RTCostB;
}
- // When set to preferred, for now assume vscale may be larger than 1, so
- // that scalable vectorization is slightly favorable over fixed-width
- // vectorization.
+ // Improve estimate for the vector width if it is scalable.
+ unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+ unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+ if (A.Width.isScalable())
+ EstimatedWidthA *= VScale.getValue();
+ if (B.Width.isScalable())
+ EstimatedWidthB *= VScale.getValue();
+ }
+
+ // When set to preferred, for now assume vscale may be larger than 1 (or the
+ // one being tuned for), so that scalable vectorization is slightly favorable
+ // over fixed-width vectorization.
if (Hints->isScalableVectorizationPreferred())
if (A.Width.isScalable() && !B.Width.isScalable())
- return (CostA * B.Width.getKnownMinValue()) <=
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
// (CostA / A.Width) < (CostB / B.Width)
// <=> (CostA * B.Width) < (CostB * A.Width)
- return (CostA * B.Width.getKnownMinValue()) <
- (CostB * A.Width.getKnownMinValue());
+ return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6063,11 +6071,22 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
VectorizationCostTy C = expectedCost(i, &InvalidCosts);
VectorizationFactor Candidate(i, C.first);
- LLVM_DEBUG(
- dbgs() << "LV: Vector loop of width " << i << " costs: "
- << (Candidate.Cost / Candidate.Width.getKnownMinValue())
- << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
- << ".\n");
+
+#ifndef NDEBUG
+ unsigned AssumedMinimumVscale = 1;
+ if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+ AssumedMinimumVscale = VScale.getValue();
+ unsigned Width =
+ Candidate.Width.isScalable()
+ ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
+ : Candidate.Width.getFixedValue();
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (Candidate.Cost / Width));
+ if (i.isScalable())
+ LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+ << AssumedMinimumVscale << ")");
+ LLVM_DEBUG(dbgs() << ".\n");
+#endif
if (!C.second && !ForceVectorization) {
LLVM_DEBUG(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
new file mode 100644
index 000000000000..b70510b84eb7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -0,0 +1,54 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64 -mattr=+sve -scalable-vectorization=on \
+; RUN: -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic -scalable-vectorization=on \
+; RUN: -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 -scalable-vectorization=on \
+; RUN: -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=on \
+; RUN: -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-4
+
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 -scalable-vectorization=preferred \
+; RUN: -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+
+; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+
+; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
+; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+
+; VF-4: <4 x i32>
+; VF-VSCALE4: <vscale x 4 x i32>
+define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+ %1 = load i8, i8* %arrayidx2, align 4
+ %zext = zext i8 %1 to i32
+ %add = add nsw i32 %zext, %0
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index 8d53ae5a0b5d..ea9860c73f99 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -10,7 +10,7 @@
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
; CHECK: LV: Checking a loop in "test0"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -43,7 +43,7 @@ exit:
define void @test1(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in "test1"
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
@@ -79,7 +79,7 @@ define void @test2(i32* %a, i8* %b) #0 {
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4
+; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 2
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
index 246dcd237088..73b9853b7171 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -187,9 +187,9 @@ exit:
; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a more suitable value.
; CHECK-DBG: Found feasible scalable VF = vscale x 2
-; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-DBG: LV: Selecting VF: vscale x 2.
; CHECK-LABEL: @test4
-; CHECK: <4 x i32>
+; CHECK: <vscale x 2 x i32>
define void @test4(i32* %a, i32* %b) #0 {
entry:
br label %loop
More information about the llvm-commits
mailing list