[llvm] [AArch64][CostModel] Lower cost of dupq (SVE2.1) (PR #144918)

Fri Jun 20 08:37:47 PDT 2025

https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/144918

>From 1b8c7cb85ef26abf1567a9f9d34d1eccca568c4d Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 19 Jun 2025 15:29:48 +0000
Subject: [PATCH 1/3] Test precommit

---
 .../segmented-shufflevector-patterns.ll       | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
new file mode 100644
index 0000000000000..466b2990a548b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
+define void @dup_within_each_segment() #0 {
+; CHECK-LABEL: 'dup_within_each_segment'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
+                                                                            i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  %dupq_h2  = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  %dupq_s3  = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                           i32 7, i32 7, i32 7, i32 7>
+  %dupq_d0  = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" }

>From bc8af08dadb22e665d596ac63604b128eb7944ec Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Thu, 19 Jun 2025 15:31:23 +0000
Subject: [PATCH 2/3] Return lower cost for dupq

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 20 +++++++++++++++++++
 .../segmented-shufflevector-patterns.ll       | 10 +++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ed051f295752e..6ff0efd117dbd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5583,6 +5583,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     Kind = TTI::SK_PermuteSingleSrc;
   }
 
+  // Segmented shuffle matching.
+  if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
+      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
+      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
+
+    FixedVectorType *VTy = cast<FixedVectorType>(Tp);
+    unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
+    unsigned SegmentElts = VTy->getNumElements() / Segments;
+
+    // dupq zd.t, zn.t[idx]
+    unsigned Lane = (unsigned)Mask[0];
+    if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
+      bool IsDupQ = true;
+      for (unsigned I = 1; I < Mask.size(); ++I)
+        IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
+      if (IsDupQ)
+        return LT.first;
+    }
+  }
+
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
   // folded into a LD1R. That's what we check and return here. For performance
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
index 466b2990a548b..e6a57d1687254 100644
--- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -4,11 +4,11 @@
 ;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
 define void @dup_within_each_segment() #0 {
 ; CHECK-LABEL: 'dup_within_each_segment'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,

>From 5ca3a48b14654ccd4d4d7393cdb777d99179b59f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 20 Jun 2025 15:27:46 +0000
Subject: [PATCH 3/3] * Refactor to share isDUPQMask * Support SME2p1 * Remove
 hardcoded magic number * Return the same result for other cost kinds

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 +++++--------------
 .../Target/AArch64/AArch64PerfectShuffle.h    | 24 +++++++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 18 ++++------
 3 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 581f152776026..3fbbc543fc57f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13391,30 +13391,6 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   return true;
 }
 
-/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
-/// the first vector operand.
-static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
-  assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
-  unsigned Lane = (unsigned)M[0];
-  unsigned Segments = VT.getFixedSizeInBits() / 128;
-  unsigned SegmentElts = VT.getVectorNumElements() / Segments;
-
-  // Make sure there's no size changes.
-  if (SegmentElts * Segments != M.size())
-    return std::nullopt;
-
-  // Check the first index corresponds to one of the lanes in the first segment.
-  if (Lane >= SegmentElts)
-    return std::nullopt;
-
-  // Check that all lanes match the first, adjusted for segment.
-  for (unsigned I = 0; I < M.size(); ++I)
-    if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts)))
-      return std::nullopt;
-
-  return Lane;
-}
-
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -30005,8 +29981,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
 
-    if (Subtarget->hasSVE2p1()) {
-      if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
+    if (Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) {
+      assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
+             "Unsupported SVE vector size");
+
+      unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
+      unsigned SegmentElts = VT.getVectorNumElements() / Segments;
+      if (std::optional<unsigned> Lane =
+              isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
         SDValue IID =
             DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
         return convertFromScalableVector(
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 7b044cf7c238f..01d8fbc705a5f 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 
 namespace llvm {
 
@@ -6723,6 +6724,29 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
   return true;
 }
 
+/// isDUPQMask - matches a splat of equivalent lanes within each of \p Segments
+/// segments of \p NumElts elements. Returns the splatted lane (relative to the
+/// first segment) on a match, or std::nullopt otherwise.
+inline std::optional<unsigned> isDUPQMask(ArrayRef<int> M, unsigned Segments,
+                                          unsigned NumElts) {
+  // Reject empty masks and size changes before M[0] is read below.
+  if (M.empty() || NumElts * Segments != M.size())
+    return std::nullopt;
+
+  // Check the first index corresponds to one of the lanes in the first segment
+  // (a negative index wraps to a large unsigned value and is rejected too).
+  unsigned Lane = (unsigned)M[0];
+  if (Lane >= NumElts)
+    return std::nullopt;
+
+  // Check that all lanes match the first, adjusted for segment.
+  if (all_of(enumerate(M), [&](auto P) {
+        return (unsigned)P.value() == Lane + (P.index() / NumElts) * NumElts;
+      }))
+    return Lane;
+  return std::nullopt;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6ff0efd117dbd..6cbc6faae6949 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5584,23 +5584,19 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   }
 
   // Segmented shuffle matching.
-  if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
+  if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
       Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
-      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
+      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(
+          AArch64::SVEBitsPerBlock)) {
 
     FixedVectorType *VTy = cast<FixedVectorType>(Tp);
-    unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
+    unsigned Segments =
+        VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
     unsigned SegmentElts = VTy->getNumElements() / Segments;
 
     // dupq zd.t, zn.t[idx]
-    unsigned Lane = (unsigned)Mask[0];
-    if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
-      bool IsDupQ = true;
-      for (unsigned I = 1; I < Mask.size(); ++I)
-        IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
-      if (IsDupQ)
-        return LT.first;
-    }
+    if (isDUPQMask(Mask, Segments, SegmentElts))
+      return LT.first;
   }
 
   // Check for broadcast loads, which are supported by the LD1R instruction.