[llvm] [AArch64][CostModel] Lower cost of dupq (SVE2.1) (PR #144918)

Thu Jun 19 08:49:00 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>

With codegen in place to match shuffles to dupq, we can now lower the cost to something reasonable.

---
Full diff: https://github.com/llvm/llvm-project/pull/144918.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+20) 
- (added) llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll (+25) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ed051f295752e..6ff0efd117dbd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5583,6 +5583,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
     Kind = TTI::SK_PermuteSingleSrc;
   }
 
+  // Segmented shuffle matching.
+  if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
+      Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
+      Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
+
+    FixedVectorType *VTy = cast<FixedVectorType>(Tp);
+    unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
+    unsigned SegmentElts = VTy->getNumElements() / Segments;
+
+    // dupq zd.t, zn.t[idx]
+    unsigned Lane = (unsigned)Mask[0];
+    if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
+      bool IsDupQ = true;
+      for (unsigned I = 1; I < Mask.size(); ++I)
+        IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
+      if (IsDupQ)
+        return LT.first;
+    }
+  }
+
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
   // folded into a LD1R. That's what we check and return here. For performance
diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
new file mode 100644
index 0000000000000..e6a57d1687254
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
+define void @dup_within_each_segment() #0 {
+; CHECK-LABEL: 'dup_within_each_segment'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
+                                                                            i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  %dupq_h2  = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                              i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  %dupq_s3  = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                           i32 7, i32 7, i32 7, i32 7>
+  %dupq_d0  = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/144918