[llvm] [AArch64][TTI] Reduce cost for splatting whole first vector segment (SVE) (PR #145701)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 27 05:26:15 PDT 2025
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/145701
>From 8b55fa1b74d8310723ce0f4a5b3c0206f1bc78a8 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 25 Jun 2025 12:52:52 +0000
Subject: [PATCH 1/3] Test precommit
---
.../segmented-shufflevector-patterns.ll | 48 +++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 790f49f1d3b82..4189b6fe371dd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -49,5 +49,53 @@ define void @dup_within_each_segment_512b() #1 {
ret void
}
+define void @dup_whole_segment_256b() #0 {
+; CHECK-LABEL: 'dup_whole_segment_256b'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+ i32 0, i32 1, i32 2, i32 3>
+ %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1,
+ i32 0, i32 1, i32 0, i32 1>
+ %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3,
+ i32 poison, i32 1, i32 2, i32 3>
+ ret void
+}
+
+define void @dup_whole_segment_512b() #1 {
+; CHECK-LABEL: 'dup_whole_segment_512b'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+ i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+ i32 0, i32 1, i32 2, i32 3>
+ %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1,
+ i32 0, i32 1, i32 0, i32 1>
+ %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3,
+ i32 poison, i32 1, i32 2, i32 3>
+ ret void
+}
+
attributes #0 = { noinline vscale_range(2,2) }
attributes #1 = { noinline vscale_range(4,4) }
>From 39644916f8e6448e47fd5ffe2bc494fa1694573d Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 25 Jun 2025 13:20:11 +0000
Subject: [PATCH 2/3] Match first segment splat
---
.../Target/AArch64/AArch64PerfectShuffle.h | 15 ++++++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 13 ++++++----
.../segmented-shufflevector-patterns.ll | 24 +++++++++----------
3 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index e9bc6d947b0d9..f7beca1b8b77e 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6750,6 +6750,21 @@ inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments,
return std::nullopt;
}
+/// isDUPFirstSegmentMask - matches a splat of the first 128b segment.
+inline bool isDUPFirstSegmentMask(ArrayRef<int> Mask, unsigned Segments,
+ unsigned SegmentSize) {
+ // Make sure there's no size changes.
+ if (SegmentSize * Segments != Mask.size())
+ return false;
+
+ // Check that all lanes refer to the equivalent lane in the first segment.
+ // Undef/poison lanes (<0) are also accepted.
+ return all_of(enumerate(Mask), [&](auto P) {
+ const unsigned IndexWithinSegment = P.index() % SegmentSize;
+ return P.value() < 0 || unsigned(P.value()) == IndexWithinSegment;
+ });
+}
+
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 79b2dc2b3845e..a348a039df73e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,9 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
}
// Segmented shuffle matching.
- if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
- ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc &&
- isa<FixedVectorType>(SrcTy) && !Mask.empty() &&
+ if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
+ !Mask.empty() &&
SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
AArch64::SVEBitsPerBlock)) {
@@ -5612,7 +5611,13 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
unsigned SegmentElts = VTy->getNumElements() / Segments;
// dupq zd.t, zn.t[idx]
- if (isDUPQMask(Mask, Segments, SegmentElts))
+ if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
+ ST->isSVEorStreamingSVEAvailable() &&
+ isDUPQMask(Mask, Segments, SegmentElts))
+ return LT.first;
+
+ // mov zd.q, vn
+ if (isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
return LT.first;
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 4189b6fe371dd..8b94cefbad63b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -51,12 +51,12 @@ define void @dup_within_each_segment_512b() #1 {
define void @dup_whole_segment_256b() #0 {
; CHECK-LABEL: 'dup_whole_segment_256b'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -75,12 +75,12 @@ define void @dup_whole_segment_256b() #0 {
define void @dup_whole_segment_512b() #1 {
; CHECK-LABEL: 'dup_whole_segment_512b'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
>From e3502f27bd2f5a33e9e8a860208b3423d9568f7c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 27 Jun 2025 10:53:24 +0000
Subject: [PATCH 3/3] Only use with SVE, since the tables for NEON work better
in most cases.
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a348a039df73e..7d33845f48100 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5617,7 +5617,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
return LT.first;
// mov zd.q, vn
- if (isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
+ if (ST->isSVEorStreamingSVEAvailable() &&
+ isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
return LT.first;
}
More information about the llvm-commits
mailing list