[llvm] bb82f74 - Revert "Revert "[AArch64] Set maximum VF with shouldMaximizeVectorBandwidth""
Jingu Kang via llvm-commits
llvm-commits at lists.llvm.org
Mon May 23 08:18:22 PDT 2022
Author: Jingu Kang
Date: 2022-05-23T16:15:45+01:00
New Revision: bb82f746129f7deee5e09bd1577ce55eb88c6f9f
URL: https://github.com/llvm/llvm-project/commit/bb82f746129f7deee5e09bd1577ce55eb88c6f9f
DIFF: https://github.com/llvm/llvm-project/commit/bb82f746129f7deee5e09bd1577ce55eb88c6f9f.diff
LOG: Revert "Revert "[AArch64] Set maximum VF with shouldMaximizeVectorBandwidth""
This reverts commit 42ebfa8269470e6b1fe2de996d3f1db6d142e16a.
The commit from https://reviews.llvm.org/D125918 has fixed the stage 2 build
failure.
Differential Revision: https://reviews.llvm.org/D118979
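
For readers skimming the diff below, here is a minimal standalone sketch (not
the actual LLVM classes) of the interface change: shouldMaximizeVectorBandwidth()
now takes the register kind being vectorized for, and the AArch64 implementation
opts in only for fixed-width (NEON) vectors. The enum values and hook name match
the patch; the surrounding scaffolding is illustrative only.

    #include <cassert>
    #include <cstdio>

    // Mirrors TargetTransformInfo::RegisterKind from the patch.
    enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };

    struct TTIBase {
      // Default behaviour: never widen the VF beyond the widest element type.
      virtual bool shouldMaximizeVectorBandwidth(RegisterKind K) const {
        return false;
      }
      virtual ~TTIBase() = default;
    };

    struct AArch64TTI : TTIBase {
      // As in AArch64TargetTransformInfo.cpp below: maximize bandwidth only
      // when choosing a fixed-width (NEON) VF, not for scalable (SVE) VFs.
      bool shouldMaximizeVectorBandwidth(RegisterKind K) const override {
        assert(K != RGK_Scalar && "expected a vector register kind");
        return K == RGK_FixedWidthVector;
      }
    };

    int main() {
      AArch64TTI TTI;
      std::printf("fixed-width: %d\n",
                  TTI.shouldMaximizeVectorBandwidth(RGK_FixedWidthVector)); // 1
      std::printf("scalable:    %d\n",
                  TTI.shouldMaximizeVectorBandwidth(RGK_ScalableVector));   // 0
      return 0;
    }

The vectorizer passes the register kind it is currently choosing a VF for, so a
target can maximize bandwidth for one kind without affecting the other.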
Added:
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 4704697b40d2e..2a4620e5abcac 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -937,7 +937,8 @@ class TargetTransformInfo {
/// creating vectors that span multiple vector registers.
/// If false, the vectorization factor will be chosen based on the
/// size of the widest element type.
- bool shouldMaximizeVectorBandwidth() const;
+ /// \p K Register Kind for vectorization.
+ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
@@ -1640,7 +1641,8 @@ class TargetTransformInfo::Concept {
virtual unsigned getMinVectorRegisterBitWidth() const = 0;
virtual Optional<unsigned> getMaxVScale() const = 0;
virtual Optional<unsigned> getVScaleForTuning() const = 0;
- virtual bool shouldMaximizeVectorBandwidth() const = 0;
+ virtual bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0;
virtual ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const = 0;
virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -2139,8 +2141,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
Optional<unsigned> getVScaleForTuning() const override {
return Impl.getVScaleForTuning();
}
- bool shouldMaximizeVectorBandwidth() const override {
- return Impl.shouldMaximizeVectorBandwidth();
+ bool shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const override {
+ return Impl.shouldMaximizeVectorBandwidth(K);
}
ElementCount getMinimumVF(unsigned ElemWidth,
bool IsScalable) const override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 383b91c7af13d..044e9bc179a5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -417,7 +417,10 @@ class TargetTransformInfoImplBase {
Optional<unsigned> getMaxVScale() const { return None; }
Optional<unsigned> getVScaleForTuning() const { return None; }
- bool shouldMaximizeVectorBandwidth() const { return false; }
+ bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
+ return false;
+ }
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3e4cacf17a40d..ac5c56d5bd964 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -626,8 +626,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
return TTIImpl->getVScaleForTuning();
}
-bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
- return TTIImpl->shouldMaximizeVectorBandwidth();
+bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const {
+ return TTIImpl->shouldMaximizeVectorBandwidth(K);
}
ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cf61d779d8120..205d59f878d3f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -51,6 +51,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
+ TargetTransformInfo::RegisterKind K) const {
+ assert(K != TargetTransformInfo::RGK_Scalar);
+ return K == TargetTransformInfo::RGK_FixedWidthVector;
+}
+
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 679ee5fecfbfb..a84cc89a6bf42 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->getVScaleForTuning();
}
+ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
+
/// Try to return an estimate cost factor that can be used as a multiplier
/// when scalarizing an operation for a vector with ElementCount \p VF.
/// For scalable vectors this currently takes the most pessimistic view based
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 16245528b5f2e..7bbaf7ae9cb26 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -86,12 +86,11 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
unsigned getMinVectorRegisterBitWidth() const;
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;
- bool shouldMaximizeVectorBandwidth() const {
+ bool
+ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
return true;
}
- bool supportsEfficientVectorElementLoadStore() {
- return false;
- }
+ bool supportsEfficientVectorElementLoadStore() { return false; }
bool hasBranchDivergence() {
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6a4159b06d577..cc9e7425ee55d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5216,9 +5216,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(ClampedConstTripCount);
}
+ TargetTransformInfo::RegisterKind RegKind =
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
- TTI.shouldMaximizeVectorBandwidth())) {
+ TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
ComputeScalableMaxVF);
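
To see why the fixed-width test expectations below change, here is a small
self-contained sketch of the bandwidth-maximized VF computation from the hunk
above. The 128-bit register width and i8 element size are taken from the NEON
tests that follow; powerOf2Floor is reimplemented here only to keep the sketch
standalone (LLVM provides llvm::PowerOf2Floor).

    #include <cstdint>
    #include <cstdio>

    // Round down to a power of two, like llvm::PowerOf2Floor.
    static uint64_t powerOf2Floor(uint64_t X) {
      if (X == 0)
        return 0;
      uint64_t P = 1;
      while (P * 2 <= X)
        P *= 2;
      return P;
    }

    int main() {
      const unsigned WidestRegisterBits = 128; // NEON Q register
      const unsigned SmallestTypeBits = 8;     // i8 elements in the loop
      unsigned MaxVF =
          powerOf2Floor(WidestRegisterBits / SmallestTypeBits);
      // Prints 16: hence <4 x i8> becomes <16 x i8> (and i32 ops become
      // <16 x i32>, spanning multiple registers) in the tests below.
      std::printf("max VF = %u\n", MaxVF);
      return 0;
    }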
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
index 371d209bafffe..a1ca0fea79727 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -4,11 +4,12 @@
; are not profitable.
; Test with a loop that contains memory accesses of i8 and i32 types. The
-; default maximum VF for NEON is 4. And while we don't have an instruction to
-; load 4 x i8, vectorization might still be profitable.
+; maximum VF for NEON is calculated by 128/size of smallest type in loop.
+; And while we don't have an instruction to load 4 x i8, vectorization
+; might still be profitable.
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
;
entry:
br label %loop
@@ -32,7 +33,7 @@ exit:
; Same as test_load_i8_store_i32, but with types flipped for load and store.
define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
;
entry:
br label %loop
@@ -84,7 +85,7 @@ exit:
; vectorization factor.
define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i64>
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index e6e43375204da..28eabe382dfbd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp7 = icmp sgt i32 %len, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
index a95c0aa6f375f..071255c4f4f00 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@ for.body:
; }
;
; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index 27868480c23b5..262236075f7cd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -29,7 +29,7 @@
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
; VF-4: <4 x i32>
-; VF-VSCALE4: <vscale x 4 x i32>
+; VF-VSCALE4: <16 x i32>
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
index 0e96c62e2ad0b..d15199662be08 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -9,9 +9,9 @@
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
; CHECK: LV: Checking a loop in 'test0'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16
entry:
@@ -40,9 +40,9 @@ exit:
define void @test1(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in 'test1'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
@@ -72,9 +72,9 @@ exit:
define void @test2(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in 'test2'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
@@ -104,9 +104,9 @@ exit:
define void @test3(i32* %a, i8* %b) #0 {
; CHECK: LV: Checking a loop in 'test3'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
index 4d0886f4d953d..43ef43c11507d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
@@ -83,11 +83,11 @@ for.end:
define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
; CHECK-LABEL: @uniform_store_i1
; CHECK: vector.body
-; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1
-; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]]
-; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1
+; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]]
+; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0
; CHECK: store i1 %[[EXTRACT1]], i1* %dst
-; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1
+; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1
; CHECK: store i1 %[[EXTRACT2]], i1* %dst
; CHECK-NOT: vscale
entry: