[llvm] [AArch64] Set MaxInterleaving to 4 for Neoverse V2 and V3 (PR #100385)
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 19 09:44:17 PST 2024
https://github.com/sjoerdmeijer updated https://github.com/llvm/llvm-project/pull/100385
>From cad286c78481cb442e29e49dc3143c89adba85a3 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Tue, 19 Nov 2024 09:42:43 -0800
Subject: [PATCH] [AArch64] Set MaxInterleaving to 4 for Neoverse V2 and V3
---
.../llvm/Analysis/TargetTransformInfo.h | 8 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 5 +-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 4 +
.../AArch64/AArch64TargetTransformInfo.cpp | 4 +
.../AArch64/AArch64TargetTransformInfo.h | 2 +
.../Transforms/Vectorize/LoopVectorize.cpp | 7 +-
.../AArch64/interleaving-load-store.ll | 8 ++
.../AArch64/interleaving-reduction.ll | 8 ++
.../AArch64/neoverse-epilogue-vect.ll | 118 ++++++++++++++++++
.../AArch64/sve-epilog-vect-vscale-tune.ll | 7 +-
13 files changed, 176 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ccace59d6d368..e37bce3118bcb2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -630,6 +630,10 @@ class TargetTransformInfo {
AssumptionCache &AC, TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const;
+ // Query the target for the minimum vectorization factor at which epilogue
+ // vectorization should be considered.
+ unsigned getEpilogueVectorizationMinVF() const;
+
/// Query the target whether it would be prefered to create a predicated
/// vector loop, which can avoid the need to emit a scalar epilogue loop.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) = 0;
+ virtual unsigned getEpilogueVectorizationMinVF() = 0;
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
virtual TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
HardwareLoopInfo &HWLoopInfo) override {
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+ unsigned getEpilogueVectorizationMinVF() override {
+ return Impl.getEpilogueVectorizationMinVF();
+ }
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
return Impl.preferPredicateOverEpilogue(TFI);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c3c5629d61c919..72038c090b7922 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ unsigned getEpilogueVectorizationMinVF() const { return 16; }
+
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
TailFoldingStyle
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c2e48284c68acb..3b098c42f2741c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+ unsigned getEpilogueVectorizationMinVF() {
+ return BaseT::getEpilogueVectorizationMinVF();
+ }
+
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
return BaseT::preferPredicateOverEpilogue(TFI);
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bc6a528c9dab3e..174e5e87abe538 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
+unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
+ return TTIImpl->getEpilogueVectorizationMinVF();
+}
+
bool TargetTransformInfo::preferPredicateOverEpilogue(
TailFoldingInfo *TFI) const {
return TTIImpl->preferPredicateOverEpilogue(TFI);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 2a9a7533f86259..e37e2cacc7852e 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case NeoverseV2:
- // Specialize cost for Neoverse-V2.
+ case NeoverseV3:
+ EpilogueVectorizationMinVF = 8;
+ MaxInterleaveFactor = 4;
ScatterOverhead = 13;
LLVM_FALLTHROUGH;
case NeoverseN2:
case NeoverseN3:
- case NeoverseV3:
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
MaxBytesForLoopAlignment = 16;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 91fef0e9a1ae99..d860c29e2291ae 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool ATTRIBUTE = DEFAULT;
#include "AArch64GenSubtargetInfo.inc"
+ unsigned EpilogueVectorizationMinVF = 16;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 2;
uint16_t CacheLineSize = 0;
@@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
hasFuseAdrpAdd() || hasFuseLiterals();
}
+ unsigned getEpilogueVectorizationMinVF() const {
+ return EpilogueVectorizationMinVF;
+ }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const;
unsigned getCacheLineSize() const override { return CacheLineSize; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 84212b03686b19..ec7bb71fd111ff 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
return false;
}
+unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
+ return ST->getEpilogueVectorizationMinVF();
+}
+
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
if (!ST->hasSVE())
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a01d061c4c407c..201bc831b816b3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return ST->useFixedOverScalableIfEqualCost();
}
+ unsigned getEpilogueVectorizationMinVF() const;
+
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
bool supportsScalableVectors() const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9884dcb97ec5c6..6e7ce398ec7d90 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -186,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
"loops."));
static cl::opt<unsigned> EpilogueVectorizationMinVF(
- "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
+ "epilogue-vectorization-minimum-VF", cl::Hidden,
cl::desc("Only loops with vectorization factor equal to or larger than "
"the specified value are considered for epilogue vectorization."));
@@ -4701,8 +4701,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// See related "TODO: extend to support scalable VFs." in
// selectEpilogueVectorizationFactor.
unsigned Multiplier = VF.isFixed() ? IC : 1;
+ unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
+ ? EpilogueVectorizationMinVF
+ : TTI.getEpilogueVectorizationMinVF();
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
- EpilogueVectorizationMinVF;
+ MinVFThreshold;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index 2b881fe19902eb..8320608d67588c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -5,6 +5,8 @@
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
; Tests for selecting interleave counts for loops with loads and stores.
@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2: exit:
; INTERLEAVE-2-NEXT: ret void
;
+; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
+; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index bf64dccdb26676..fc2f8a0dcabf50 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -5,6 +5,8 @@
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
; Tests for selecting the interleave count for loops with reductions.
@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
+; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
+; INTERLEAVE-4-VLA: add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
+;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
new file mode 100644
index 00000000000000..9e42c3c5dcab77
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
@@ -0,0 +1,118 @@
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
+; CHECK-LABEL: @V1(
+; CHECK-NOT: vec.epilog.ph:
+; CHECK-NOT: vec.epilog.vector.body:
+; CHECK-NOT: vec.epilog.middle.block:
+; CHECK-NOT: vec.epilog.scalar.ph:
+;
+entry:
+ %4 = icmp sgt i32 %2, 0
+ br i1 %4, label %5, label %8
+
+5:
+ %6 = zext nneg i32 %2 to i64
+ br label %9
+
+7:
+ br label %8
+
+8:
+ ret i32 42
+
+9:
+ %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+ %11 = getelementptr inbounds double, ptr %0, i64 %10
+ %12 = load double, ptr %11, align 8
+ %13 = getelementptr inbounds double, ptr %1, i64 %10
+ %14 = load double, ptr %13, align 8
+ %15 = fadd fast double %14, %12
+ store double %15, ptr %11, align 8
+ %16 = add nuw nsw i64 %10, 1
+ %17 = icmp eq i64 %16, %6
+ br i1 %17, label %7, label %9
+}
+
+define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
+;
+; CHECK-LABEL: @V2(
+; CHECK: vec.epilog.ph:
+; CHECK: vec.epilog.vector.body:
+; CHECK: vec.epilog.middle.block:
+; CHECK: vec.epilog.scalar.ph:
+;
+entry:
+ %4 = icmp sgt i32 %2, 0
+ br i1 %4, label %5, label %8
+
+5:
+ %6 = zext nneg i32 %2 to i64
+ br label %9
+
+7:
+ br label %8
+
+8:
+ ret i32 42
+
+9:
+ %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+ %11 = getelementptr inbounds double, ptr %0, i64 %10
+ %12 = load double, ptr %11, align 8
+ %13 = getelementptr inbounds double, ptr %1, i64 %10
+ %14 = load double, ptr %13, align 8
+ %15 = fadd fast double %14, %12
+ store double %15, ptr %11, align 8
+ %16 = add nuw nsw i64 %10, 1
+ %17 = icmp eq i64 %16, %6
+ br i1 %17, label %7, label %9
+}
+
+; TODO: The V3 will generate a scalable vector body, so it doesn't need an
+; epilogue loop, but it needs to be checked whether that is really the best
+; thing to do for the V3.
+;
+define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
+;
+; CHECK-LABEL: @V3(
+; CHECK-NOT: vec.epilog.ph:
+; CHECK-NOT: vec.epilog.vector.body:
+; CHECK-NOT: vec.epilog.middle.block:
+; CHECK-NOT: vec.epilog.scalar.ph:
+;
+entry:
+ %4 = icmp sgt i32 %2, 0
+ br i1 %4, label %5, label %8
+
+5:
+ %6 = zext nneg i32 %2 to i64
+ br label %9
+
+7:
+ br label %8
+
+8:
+ ret i32 42
+
+9:
+ %10 = phi i64 [ 0, %5 ], [ %16, %9 ]
+ %11 = getelementptr inbounds double, ptr %0, i64 %10
+ %12 = load double, ptr %11, align 8
+ %13 = getelementptr inbounds double, ptr %1, i64 %10
+ %14 = load double, ptr %13, align 8
+ %15 = fadd fast double %14, %12
+ store double %15, ptr %11, align 8
+ %16 = add nuw nsw i64 %10, 1
+ %17 = icmp eq i64 %16, %6
+ br i1 %17, label %7, label %9
+}
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }
+
+attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }
+
+attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
index 454a9789142f80..52d343e4105c7f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
+; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
@@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6
; CHECK-EPILOG: vec.epilog.vector.body:
; CHECK-EPILOG: load <vscale x 4 x i16>
+; The epilogue loop gets vectorised vscale x 2 x i16 wide.
+; CHECK-EPILOG-V2: vec.epilog.ph:
+; CHECK-EPILOG-V2: vec.epilog.vector.body:
+; CHECK-EPILOG-V2: load <vscale x 2 x i16>
+
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph:
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body:
entry:
More information about the llvm-commits
mailing list