[llvm] [AArch64][CostModel] Lower cost of dupq (SVE2.1) (PR #144918)

Tue Jun 24 04:20:10 PDT 2025

https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/144918

>From a42ffba3f261fd2e57cf744097875b4b27aa877b Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 19 Jun 2025 15:29:48 +0000
Subject: [PATCH 01/11] Test precommit

---
 .../segmented-shufflevector-patterns.ll       | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
new file mode 100644
index 0000000000000..466b2990a548b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
+define void @dup_within_each_segment() #0 {
+; CHECK-LABEL: 'dup_within_each_segment'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
+                                                                            i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  %dupq_h2  = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  %dupq_s3  = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                           i32 7, i32 7, i32 7, i32 7>
+  %dupq_d0  = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" }

>From e160c70b4504122e47581b6834f530bfbe599e45 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 19 Jun 2025 15:31:23 +0000
Subject: [PATCH 02/11] Return lower cost for dupq

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 20 +++++++++++++++++++
 .../segmented-shufflevector-patterns.ll       | 10 +++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 15e38e6cb2408..7522f5773fc58 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5599,6 +5599,26 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
     SrcTy = DstTy;
   }
 
+  // Segmented shuffle matching.
+  if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
+      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
+      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
+
+    FixedVectorType *VTy = cast<FixedVectorType>(Tp);
+    unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
+    unsigned SegmentElts = VTy->getNumElements() / Segments;
+
+    // dupq zd.t, zn.t[idx]
+    unsigned Lane = (unsigned)Mask[0];
+    if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
+      bool IsDupQ = true;
+      for (unsigned I = 1; I < Mask.size(); ++I)
+        IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
+      if (IsDupQ)
+        return LT.first;
+    }
+  }
+
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
   // folded into a LD1R. That's what we check and return here. For performance
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 466b2990a548b..e6a57d1687254 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -4,11 +4,11 @@
 ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
 define void @dup_within_each_segment() #0 {
 ; CHECK-LABEL: 'dup_within_each_segment'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,

>From 542faf0d401bb4423ee523b7b00a64f88db140ce Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 20 Jun 2025 15:27:46 +0000
Subject: [PATCH 03/11] Refactor to share isDUPQMask; support SME2p1;
 remove hardcoded magic number; return the same result for other cost kinds

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 +++++--------------
 .../Target/AArch64/AArch64PerfectShuffle.h    | 24 +++++++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 18 ++++------
 3 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1f98d69edb473..0387721087ce3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13402,30 +13402,6 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   return true;
 }
 
-/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
-/// the first vector operand.
-static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
-  assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
-  unsigned Lane = (unsigned)M[0];
-  unsigned Segments = VT.getFixedSizeInBits() / 128;
-  unsigned SegmentElts = VT.getVectorNumElements() / Segments;
-
-  // Make sure there's no size changes.
-  if (SegmentElts * Segments != M.size())
-    return std::nullopt;
-
-  // Check the first index corresponds to one of the lanes in the first segment.
-  if (Lane >= SegmentElts)
-    return std::nullopt;
-
-  // Check that all lanes match the first, adjusted for segment.
-  for (unsigned I = 0; I < M.size(); ++I)
-    if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts)))
-      return std::nullopt;
-
-  return Lane;
-}
-
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -30026,8 +30002,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
 
-    if (Subtarget->hasSVE2p1()) {
-      if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
+    if (Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) {
+      assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
+             "Unsupported SVE vector size");
+
+      unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
+      unsigned SegmentElts = VT.getVectorNumElements() / Segments;
+      if (std::optional<unsigned> Lane =
+              isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
         SDValue IID =
             DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
         return convertFromScalableVector(
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 7b044cf7c238f..01d8fbc705a5f 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 
 namespace llvm {
 
@@ -6723,6 +6724,29 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
   return true;
 }
 
+/// isDUPQMask - matches a splat of equivalent lanes within segments of a given
+///              number of elements.
+inline std::optional<unsigned> isDUPQMask(ArrayRef<int> M, unsigned Segments,
+                                          unsigned NumElts) {
+  unsigned Lane = (unsigned)M[0];
+
+  // Make sure there's no size changes.
+  if (NumElts * Segments != M.size())
+    return std::nullopt;
+
+  // Check the first index corresponds to one of the lanes in the first segment.
+  if (Lane >= NumElts)
+    return std::nullopt;
+
+  // Check that all lanes match the first, adjusted for segment.
+  if (all_of(enumerate(M), [&](auto P) {
+        return (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts;
+      }))
+    return Lane;
+
+  return std::nullopt;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7522f5773fc58..cf37069337723 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,23 +5600,19 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
-  if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
+  if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
-      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
+      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(
+          AArch64::SVEBitsPerBlock)) {
 
     FixedVectorType *VTy = cast<FixedVectorType>(Tp);
-    unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
+    unsigned Segments =
+        VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
     unsigned SegmentElts = VTy->getNumElements() / Segments;
 
     // dupq zd.t, zn.t[idx]
-    unsigned Lane = (unsigned)Mask[0];
-    if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
-      bool IsDupQ = true;
-      for (unsigned I = 1; I < Mask.size(); ++I)
-        IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
-      if (IsDupQ)
-        return LT.first;
-    }
+    if (isDUPQMask(Mask, Segments, SegmentElts))
+      return LT.first;
   }
 
   // Check for broadcast loads, which are supported by the LD1R instruction.

>From e72d339cbbde782aed30cb02190f1a84642d0f1c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 23 Jun 2025 09:49:39 +0000
Subject: [PATCH 04/11] Improve SME check, add runline to test for it

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp       | 3 ++-
 .../CostModel/AArch64/segmented-shufflevector-patterns.ll    | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cf37069337723..110f3cdbe01b0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,7 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
-  if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
+  if ((ST->hasSVE2p1() ||
+       (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
       Tp->getPrimitiveSizeInBits().isKnownMultipleOf(
           AArch64::SVEBitsPerBlock)) {
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index e6a57d1687254..6fb11e1bdac17 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s
 
 ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
 define void @dup_within_each_segment() #0 {
@@ -22,4 +23,4 @@ define void @dup_within_each_segment() #0 {
   ret void
 }
 
-attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" }
+attributes #0 = { noinline vscale_range(2,2) }

>From a03c040d70bf0480d960c8cfe4f9d80589544e3e Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 23 Jun 2025 10:15:12 +0000
Subject: [PATCH 05/11] Update ISel and codegen test too

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +-
 .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 49 +++++++++++++++----
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0387721087ce3..3f39e982c4c16 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30002,7 +30002,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
 
-    if (Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) {
+    if (Subtarget->hasSVE2p1() ||
+        (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) {
       assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
              "Unsupported SVE vector size");
 
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
index 40d4d0ff60148..3fe087044332e 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
 
 define void @dupq_i8_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i8_256b:
@@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 {
 }
 
 define void @dupq_bf16_256b(ptr %addr) #0 {
-; CHECK-LABEL: dupq_bf16_256b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    dup v0.8h, v0.h[2]
-; CHECK-NEXT:    dup v1.8h, v1.h[2]
-; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    ret
+; SVE-LABEL: dupq_bf16_256b:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ldp q0, q1, [x0]
+; SVE-NEXT:    dup v0.8h, v0.h[2]
+; SVE-NEXT:    dup v1.8h, v1.h[2]
+; SVE-NEXT:    stp q0, q1, [x0]
+; SVE-NEXT:    ret
+;
+; SME-LABEL: dupq_bf16_256b:
+; SME:       // %bb.0:
+; SME-NEXT:    ldp q1, q0, [x0]
+; SME-NEXT:    str q0, [sp, #-64]!
+; SME-NEXT:    .cfi_def_cfa_offset 64
+; SME-NEXT:    ldr h0, [sp, #4]
+; SME-NEXT:    str q1, [sp, #32]
+; SME-NEXT:    str h0, [sp, #30]
+; SME-NEXT:    str h0, [sp, #28]
+; SME-NEXT:    str h0, [sp, #26]
+; SME-NEXT:    str h0, [sp, #24]
+; SME-NEXT:    str h0, [sp, #22]
+; SME-NEXT:    str h0, [sp, #20]
+; SME-NEXT:    str h0, [sp, #18]
+; SME-NEXT:    str h0, [sp, #16]
+; SME-NEXT:    ldr h0, [sp, #36]
+; SME-NEXT:    ldr q1, [sp, #16]
+; SME-NEXT:    str h0, [sp, #62]
+; SME-NEXT:    str h0, [sp, #60]
+; SME-NEXT:    str h0, [sp, #58]
+; SME-NEXT:    str h0, [sp, #56]
+; SME-NEXT:    str h0, [sp, #54]
+; SME-NEXT:    str h0, [sp, #52]
+; SME-NEXT:    str h0, [sp, #50]
+; SME-NEXT:    str h0, [sp, #48]
+; SME-NEXT:    ldr q0, [sp, #48]
+; SME-NEXT:    stp q0, q1, [x0]
+; SME-NEXT:    add sp, sp, #64
+; SME-NEXT:    ret
   %load = load <16 x bfloat>, ptr %addr
   %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
                                                                                       i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
@@ -112,4 +143,4 @@ define void @dupq_f64_256b(ptr %addr) #0 {
   ret void
 }
 
-attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }
+attributes #0 = { noinline vscale_range(2,2) }

>From 6899d1cc7e3a204b79e7371d3928a3eb0e1b40e5 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 23 Jun 2025 12:31:50 +0000
Subject: [PATCH 06/11] Rebase, getShuffleCost params changed

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 110f3cdbe01b0..b0b76faa803ae 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5602,11 +5602,11 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   // Segmented shuffle matching.
   if ((ST->hasSVE2p1() ||
        (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
-      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
-      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(
+      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
+      SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
           AArch64::SVEBitsPerBlock)) {
 
-    FixedVectorType *VTy = cast<FixedVectorType>(Tp);
+    FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
     unsigned Segments =
         VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
     unsigned SegmentElts = VTy->getNumElements() / Segments;

>From 57aaebe4d673e134156f9940c066c88968366fbc Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 23 Jun 2025 12:50:04 +0000
Subject: [PATCH 07/11] Only check for isStreaming()

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp        | 2 +-
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3f39e982c4c16..e1c2e1144b51c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30003,7 +30003,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
 
     if (Subtarget->hasSVE2p1() ||
-        (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) {
+        (Subtarget->hasSME2p1() && Subtarget->isStreaming())) {
       assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
              "Unsupported SVE vector size");
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b0b76faa803ae..a010c8c378a11 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,8 +5600,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
-  if ((ST->hasSVE2p1() ||
-       (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
+  if ((ST->hasSVE2p1() || (ST->hasSME2p1() && ST->isStreaming())) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
           AArch64::SVEBitsPerBlock)) {

>From 3a7c14ca2331d4b0eb64ab1ba682e65c0d9cb219 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 23 Jun 2025 13:20:00 +0000
Subject: [PATCH 08/11] Revert "Only check for isStreaming()"

This reverts commit 57aaebe4d673e134156f9940c066c88968366fbc.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp        | 2 +-
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e1c2e1144b51c..3f39e982c4c16 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30003,7 +30003,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
 
     if (Subtarget->hasSVE2p1() ||
-        (Subtarget->hasSME2p1() && Subtarget->isStreaming())) {
+        (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) {
       assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
              "Unsupported SVE vector size");
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a010c8c378a11..b0b76faa803ae 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,7 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
-  if ((ST->hasSVE2p1() || (ST->hasSME2p1() && ST->isStreaming())) &&
+  if ((ST->hasSVE2p1() ||
+       (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
           AArch64::SVEBitsPerBlock)) {

>From f45e0b21f2588b134e4d2bd3bf2a6333b3b2d456 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 24 Jun 2025 09:20:14 +0000
Subject: [PATCH 09/11] Check for mask being empty; handle poison lanes

---
 llvm/lib/Target/AArch64/AArch64PerfectShuffle.h    |  4 +++-
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  |  2 +-
 .../AArch64/segmented-shufflevector-patterns.ll    |  3 +++
 .../test/CodeGen/AArch64/sve2p1-vector-shuffles.ll | 14 ++++++++++++++
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 01d8fbc705a5f..cd79251610fe1 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6739,8 +6739,10 @@ inline std::optional<unsigned> isDUPQMask(ArrayRef<int> M, unsigned Segments,
     return std::nullopt;
 
   // Check that all lanes match the first, adjusted for segment.
+  // Undef/poison lanes (<0) are also accepted.
   if (all_of(enumerate(M), [&](auto P) {
-        return (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts;
+        return P.value() < 0 ||
+               (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts;
       }))
     return Lane;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b0b76faa803ae..e00519f0faf5c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5604,7 +5604,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
        (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
-          AArch64::SVEBitsPerBlock)) {
+          AArch64::SVEBitsPerBlock) && !Mask.empty()) {
 
     FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
     unsigned Segments =
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 6fb11e1bdac17..72dca2d9ab1da 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -10,6 +10,7 @@ define void @dup_within_each_segment() #0 {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3, i32 7, i32 poison, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
@@ -20,6 +21,8 @@ define void @dup_within_each_segment() #0 {
                                                                            i32 7, i32 7, i32 7, i32 7>
   %dupq_d0  = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3,
+                                                                                      i32 7, i32 poison, i32 7, i32 7>
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
index 3fe087044332e..da83b27ce4d55 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -143,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 {
   ret void
 }
 
+define void @dupq_f32_256b_with_poison(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f32_256b_with_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.s, z0.s[3]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <8 x float>, ptr %addr
+  %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 3, i32 3,
+                                                                                 i32 7, i32 7, i32 7, i32 poison>
+  store <8 x float> %splat.lanes, ptr %addr
+  ret void
+}
+
 attributes #0 = { noinline vscale_range(2,2) }

>From 0c1cdff8a7813d74df3b32e65c7650486acb81fa Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 24 Jun 2025 10:09:44 +0000
Subject: [PATCH 10/11] Make isDUPQMask clearer, add 512b function to cost test

---
 .../Target/AArch64/AArch64PerfectShuffle.h    | 15 +++++-----
 .../AArch64/AArch64TargetTransformInfo.cpp    |  3 +-
 .../segmented-shufflevector-patterns.ll       | 28 +++++++++++++++++--
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index cd79251610fe1..e9bc6d947b0d9 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6726,23 +6726,24 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
 
 /// isDUPQMask - matches a splat of equivalent lanes within segments of a given
 ///              number of elements.
-inline std::optional<unsigned> isDUPQMask(ArrayRef<int> M, unsigned Segments,
-                                          unsigned NumElts) {
-  unsigned Lane = (unsigned)M[0];
+inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments,
+                                          unsigned SegmentSize) {
+  unsigned Lane = unsigned(Mask[0]);
 
   // Make sure there's no size changes.
-  if (NumElts * Segments != M.size())
+  if (SegmentSize * Segments != Mask.size())
     return std::nullopt;
 
   // Check the first index corresponds to one of the lanes in the first segment.
-  if (Lane >= NumElts)
+  if (Lane >= SegmentSize)
     return std::nullopt;
 
   // Check that all lanes match the first, adjusted for segment.
   // Undef/poison lanes (<0) are also accepted.
-  if (all_of(enumerate(M), [&](auto P) {
+  if (all_of(enumerate(Mask), [&](auto P) {
+        const unsigned SegmentIndex = P.index() / SegmentSize;
         return P.value() < 0 ||
-               (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts;
+               unsigned(P.value()) == Lane + SegmentIndex * SegmentSize;
       }))
     return Lane;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e00519f0faf5c..1ec223068722c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5604,7 +5604,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
        (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
-          AArch64::SVEBitsPerBlock) && !Mask.empty()) {
+          AArch64::SVEBitsPerBlock) &&
+      !Mask.empty()) {
 
     FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
     unsigned Segments =
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 72dca2d9ab1da..790f49f1d3b82 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -3,8 +3,8 @@
 ; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s
 
 ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
-define void @dup_within_each_segment() #0 {
-; CHECK-LABEL: 'dup_within_each_segment'
+define void @dup_within_each_segment_256b() #0 {
+; CHECK-LABEL: 'dup_within_each_segment_256b'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
@@ -26,4 +26,28 @@ define void @dup_within_each_segment() #0 {
   ret void
 }
 
+define void @dup_within_each_segment_512b() #1 {
+; CHECK-LABEL: 'dup_within_each_segment_512b'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3, i32 7, i32 poison, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
+                                                                            i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  %dupq_h2  = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  %dupq_s3  = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                           i32 7, i32 7, i32 7, i32 7>
+  %dupq_d0  = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3,
+                                                                                      i32 7, i32 poison, i32 7, i32 7>
+  ret void
+}
+
 attributes #0 = { noinline vscale_range(2,2) }
+attributes #1 = { noinline vscale_range(4,4) }

>From 80b073c7992d23d135c9ab733f67062c48b75505 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 24 Jun 2025 11:14:47 +0000
Subject: [PATCH 11/11] Correction to feature checking

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp        | 4 ++--
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3f39e982c4c16..bfbcc14baf18c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30002,8 +30002,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
 
-    if (Subtarget->hasSVE2p1() ||
-        (Subtarget->hasSME2p1() && Subtarget->isSVEorStreamingSVEAvailable())) {
+    if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
+        Subtarget->isSVEorStreamingSVEAvailable()) {
       assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
              "Unsupported SVE vector size");
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1ec223068722c..3387dee8aa4c8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,12 +5600,11 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
-  if ((ST->hasSVE2p1() ||
-       (ST->hasSME2p1() && ST->isSVEorStreamingSVEAvailable())) &&
-      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
+  if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
+      ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc &&
+      isa<FixedVectorType>(SrcTy) && !Mask.empty() &&
       SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
-          AArch64::SVEBitsPerBlock) &&
-      !Mask.empty()) {
+          AArch64::SVEBitsPerBlock)) {
 
     FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
     unsigned Segments =