[llvm] [AArch64] Use dupq (SVE2.1) for segmented lane splats (PR #144482)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 17 09:26:36 PDT 2025


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/144482

From e84850652710cfbf97ef32bed42fed19bdfc42ef Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 13 Jun 2025 15:45:27 +0000
Subject: [PATCH 1/8] [AArch64] Use dupq for segmented lane splats (SVE2.1)

---
 .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
new file mode 100644
index 0000000000000..703d2af0e0534
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define void @dupq_i8_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i8_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI0_0
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <32 x i8>, ptr %addr
+  %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
+                                                                              i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  store <32 x i8> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI1_0
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x i16>, ptr %addr
+  %splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x i16> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i32_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i32_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI2_0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.s, { z0.s }, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <8 x i32>, ptr %addr
+  %splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                             i32 7, i32 7, i32 7, i32 7>
+  store <8 x i32> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_i64_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_i64_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <4 x i64>, ptr %addr
+  %splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  store <4 x i64> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI4_0
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x half>, ptr %addr
+  %splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                  i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x half> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_bf16_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_bf16_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    dup v0.8h, v0.h[2]
+; CHECK-NEXT:    dup v1.8h, v1.h[2]
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %load = load <16 x bfloat>, ptr %addr
+  %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
+                                                                                      i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  store <16 x bfloat> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f32_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f32_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI6_0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT:    tbl z0.s, { z0.s }, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <8 x float>, ptr %addr
+  %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
+                                                                                 i32 7, i32 7, i32 7, i32 7>
+  store <8 x float> %splat.lanes, ptr %addr
+  ret void
+}
+
+define void @dupq_f64_256b(ptr %addr) #0 {
+; CHECK-LABEL: dupq_f64_256b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  %load = load <4 x double>, ptr %addr
+  %splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  store <4 x double> %splat.lanes, ptr %addr
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }
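
Aside, for readers skimming the tests: a "segmented lane splat" repeats one lane
within every 128-bit segment of the vector, so with vscale_range(2,2) each
256-bit vector has two segments. A minimal standalone sketch of the mask shape
these shuffles use (the function and parameter names below are illustrative,
not part of the patch):

  #include <vector>

  // Build the mask that splats lane `Lane` of each 128-bit segment.
  // For the <16 x i16> test above: NumElts = 16, SegmentElts = 8, Lane = 2
  // yields <2,...,2, 10,...,10>.
  std::vector<int> makeSegmentedSplatMask(unsigned NumElts,
                                          unsigned SegmentElts, unsigned Lane) {
    std::vector<int> Mask(NumElts);
    for (unsigned I = 0; I != NumElts; ++I)
      Mask[I] = Lane + (I / SegmentElts) * SegmentElts; // same lane per segment
    return Mask;
  }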

From 5dd61ba34da529dcf28beba3d4ab20f933d10d56 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 13 Jun 2025 15:58:02 +0000
Subject: [PATCH 2/8] Add dupq sdnodes and patterns to match

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td |  6 ++++++
 llvm/lib/Target/AArch64/SVEInstrFormats.td  | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f90f12b5ac3c7..b99e9c2cfefa0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -855,6 +855,12 @@ def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
 def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
 def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>;
 
+// Duplication of an indexed lane within each 128-bit segment
+def AArch64duplaneq8  : SDNode<"AArch64ISD::DUPLANEQ8", SDT_AArch64DupLane>;
+def AArch64duplaneq16 : SDNode<"AArch64ISD::DUPLANEQ16", SDT_AArch64DupLane>;
+def AArch64duplaneq32 : SDNode<"AArch64ISD::DUPLANEQ32", SDT_AArch64DupLane>;
+def AArch64duplaneq64 : SDNode<"AArch64ISD::DUPLANEQ64", SDT_AArch64DupLane>;
+
 def AArch64insr      : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>;
 
 // Vector shuffles
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 848b1c30bbeb5..7108243305dee 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10613,6 +10613,23 @@ multiclass sve2p1_dupq<string mnemonic, SDPatternOperator Op> {
   def : SVE_2_Op_Imm_Pat<nxv4f32, Op, nxv4f32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
   def : SVE_2_Op_Imm_Pat<nxv2f64, Op, nxv2f64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
   def : SVE_2_Op_Imm_Pat<nxv8bf16, Op, nxv8bf16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
+
+  def : Pat<(nxv16i8 (AArch64duplaneq8 nxv16i8:$Op1, VectorIndexB32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _B) $Op1, $imm)>;
+  def : Pat<(nxv8i16 (AArch64duplaneq16 nxv8i16:$Op1, VectorIndexH32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
+  def : Pat<(nxv4i32 (AArch64duplaneq32 nxv4i32:$Op1, VectorIndexS32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
+  def : Pat<(nxv2i64 (AArch64duplaneq64 nxv2i64:$Op1, VectorIndexD32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
+  def : Pat<(nxv8f16 (AArch64duplaneq16 nxv8f16:$Op1, VectorIndexH32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
+  def : Pat<(nxv4f32 (AArch64duplaneq32 nxv4f32:$Op1, VectorIndexS32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
+  def : Pat<(nxv2f64 (AArch64duplaneq64 nxv2f64:$Op1, VectorIndexD32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
+  def : Pat<(nxv8bf16 (AArch64duplaneq16 nxv8bf16:$Op1, VectorIndexH32b_timm:$imm)),
+            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
 }
 
 

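For orientation, the mapping these patterns establish, restated as comments
(the instruction names are the sve2p1_dupq multiclass instances from the diff):

  // AArch64duplaneq8  : nxv16i8                    -> NAME # _B
  // AArch64duplaneq16 : nxv8i16, nxv8f16, nxv8bf16 -> NAME # _H
  // AArch64duplaneq32 : nxv4i32, nxv4f32           -> NAME # _S
  // AArch64duplaneq64 : nxv2i64, nxv2f64           -> NAME # _D
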
From 5ffdce6946136b5a2aec0dcf6d33e4afd9ca201d Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 13 Jun 2025 16:33:40 +0000
Subject: [PATCH 3/8] Lower to new sdnodes

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 48 +++++++++++++++
 .../CodeGen/AArch64/sve2p1-vector-shuffles.ll | 60 ++++++-------------
 2 files changed, 67 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7519ac5260a64..80d036f3e7bf9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13430,6 +13430,28 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   return true;
 }
 
+/// isDUPQMask - matches a splat of equivalent lanes within 128b segments
+static bool isDUPQMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  WhichResult = (unsigned)M[0];
+  unsigned Segments = VT.getFixedSizeInBits() / 128;
+  unsigned SegmentElts = VT.getVectorNumElements() / Segments;
+  if (SegmentElts * Segments != M.size())
+    return false;
+
+  for (unsigned I = 0; I < Segments; ++I) {
+    unsigned Broadcast = (unsigned)M[I * SegmentElts];
+    if (Broadcast - (I * SegmentElts) > SegmentElts)
+      return false;
+    for (unsigned J = 0; J < SegmentElts; ++J) {
+      int Idx = M[(I * SegmentElts) + J];
+      if ((unsigned)Idx != Broadcast)
+        return false;
+    }
+  }
+
+  return true;
+}
+
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -30013,6 +30035,32 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
       return convertFromScalableVector(
           DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
     }
+
+    if (Subtarget->hasSVE2p1()) {
+      if (isDUPQMask(ShuffleMask, VT, WhichResult)) {
+        unsigned DupOp;
+        switch (VT.getScalarSizeInBits()) {
+        default:
+          llvm_unreachable("Unsupported scalar size");
+        case 8:
+          DupOp = AArch64ISD::DUPLANEQ8;
+          break;
+        case 16:
+          DupOp = AArch64ISD::DUPLANEQ16;
+          break;
+        case 32:
+          DupOp = AArch64ISD::DUPLANEQ32;
+          break;
+        case 64:
+          DupOp = AArch64ISD::DUPLANEQ64;
+          break;
+        }
+        return convertFromScalableVector(
+          DAG, VT, DAG.getNode(DupOp, DL, ContainerVT, Op1,
+                               DAG.getConstant(WhichResult, DL, MVT::i32,
+                                               /*isTarget=*/true)));
+      }
+    }
   }
 
   // Try to widen the shuffle before generating a possibly expensive SVE TBL.
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
index 703d2af0e0534..61c3b76bd2466 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -4,13 +4,9 @@
 define void @dupq_i8_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i8_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.b, vl32
-; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI0_0
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x8]
-; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.b, z0.b[11]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <32 x i8>, ptr %addr
   %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
@@ -22,13 +18,9 @@ define void @dupq_i8_256b(ptr %addr) #0 {
 define void @dupq_i16_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i16_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    adrp x8, .LCPI1_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI1_0
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
-; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.h, z0.h[2]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <16 x i16>, ptr %addr
   %splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
@@ -40,13 +32,9 @@ define void @dupq_i16_256b(ptr %addr) #0 {
 define void @dupq_i32_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i32_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    adrp x8, .LCPI2_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI2_0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
-; CHECK-NEXT:    tbl z0.s, { z0.s }, z1.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.s, z0.s[3]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <8 x i32>, ptr %addr
   %splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
@@ -58,10 +46,9 @@ define void @dupq_i32_256b(ptr %addr) #0 {
 define void @dupq_i64_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i64_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <4 x i64>, ptr %addr
   %splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -72,13 +59,9 @@ define void @dupq_i64_256b(ptr %addr) #0 {
 define void @dupq_f16_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_f16_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI4_0
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
-; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.h, z0.h[2]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <16 x half>, ptr %addr
   %splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2, i32  2,
@@ -105,13 +88,9 @@ define void @dupq_bf16_256b(ptr %addr) #0 {
 define void @dupq_f32_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_f32_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    adrp x8, .LCPI6_0
-; CHECK-NEXT:    add x8, x8, :lo12:.LCPI6_0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
-; CHECK-NEXT:    tbl z0.s, { z0.s }, z1.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    dupq z0.s, z0.s[3]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <8 x float>, ptr %addr
   %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
@@ -123,10 +102,9 @@ define void @dupq_f32_256b(ptr %addr) #0 {
 define void @dupq_f64_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_f64_256b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    trn1 z0.d, z0.d, z0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <4 x double>, ptr %addr
   %splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
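
As a worked trace of the new lowering, restating the dupq_i32_256b case above
(just the arithmetic spelled out, nothing new):

  // VT = v8i32 (256 bits), mask = <3,3,3,3, 7,7,7,7>
  //   Segments    = 256 / 128 = 2
  //   SegmentElts = 8 / 2     = 4
  //   segment 0 broadcasts lane 3; segment 1 broadcasts lane 7 == 3 + 4
  //   WhichResult = 3, scalar size 32 -> AArch64ISD::DUPLANEQ32
  // which selects to "dupq z0.s, z0.s[3]", as the updated CHECK lines show.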

From 1f055884f2ba50e9da21a960a28bd6735e410e66 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 17 Jun 2025 10:44:45 +0000
Subject: [PATCH 4/8] Use optional result instead of reference argument; remove sdnode and use intrinsic

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 44 +++++++------------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  6 ---
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 17 -------
 3 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80d036f3e7bf9..125852337c19a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13430,26 +13430,27 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   return true;
 }
 
-/// isDUPQMask - matches a splat of equivalent lanes within 128b segments
-static bool isDUPQMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  WhichResult = (unsigned)M[0];
+/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
+/// the first vector operand.
+static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
+  unsigned Lane = (unsigned)M[0];
   unsigned Segments = VT.getFixedSizeInBits() / 128;
   unsigned SegmentElts = VT.getVectorNumElements() / Segments;
   if (SegmentElts * Segments != M.size())
-    return false;
+    return std::nullopt;
 
   for (unsigned I = 0; I < Segments; ++I) {
     unsigned Broadcast = (unsigned)M[I * SegmentElts];
     if (Broadcast - (I * SegmentElts) > SegmentElts)
-      return false;
+      return std::nullopt;
     for (unsigned J = 0; J < SegmentElts; ++J) {
       int Idx = M[(I * SegmentElts) + J];
       if ((unsigned)Idx != Broadcast)
-        return false;
+        return std::nullopt;
     }
   }
 
-  return true;
+  return Lane;
 }
 
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
@@ -30037,28 +30038,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
 
     if (Subtarget->hasSVE2p1()) {
-      if (isDUPQMask(ShuffleMask, VT, WhichResult)) {
-        unsigned DupOp;
-        switch (VT.getScalarSizeInBits()) {
-        default:
-          llvm_unreachable("Unsupported scalar size");
-        case 8:
-          DupOp = AArch64ISD::DUPLANEQ8;
-          break;
-        case 16:
-          DupOp = AArch64ISD::DUPLANEQ16;
-          break;
-        case 32:
-          DupOp = AArch64ISD::DUPLANEQ32;
-          break;
-        case 64:
-          DupOp = AArch64ISD::DUPLANEQ64;
-          break;
-        }
+      if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
+        SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL,
+                                      MVT::i64);
         return convertFromScalableVector(
-          DAG, VT, DAG.getNode(DupOp, DL, ContainerVT, Op1,
-                               DAG.getConstant(WhichResult, DL, MVT::i32,
-                                               /*isTarget=*/true)));
+          DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+                               {ContainerVT, MVT::i64},
+                               {IID, Op1,
+                               DAG.getConstant(*Lane, DL, MVT::i64,
+                                               /*isTarget=*/true)}));
       }
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b99e9c2cfefa0..f90f12b5ac3c7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -855,12 +855,6 @@ def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
 def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
 def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>;
 
-// Duplication of an indexed lane within each 128-bit segment
-def AArch64duplaneq8  : SDNode<"AArch64ISD::DUPLANEQ8", SDT_AArch64DupLane>;
-def AArch64duplaneq16 : SDNode<"AArch64ISD::DUPLANEQ16", SDT_AArch64DupLane>;
-def AArch64duplaneq32 : SDNode<"AArch64ISD::DUPLANEQ32", SDT_AArch64DupLane>;
-def AArch64duplaneq64 : SDNode<"AArch64ISD::DUPLANEQ64", SDT_AArch64DupLane>;
-
 def AArch64insr      : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>;
 
 // Vector shuffles
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 7108243305dee..848b1c30bbeb5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10613,23 +10613,6 @@ multiclass sve2p1_dupq<string mnemonic, SDPatternOperator Op> {
   def : SVE_2_Op_Imm_Pat<nxv4f32, Op, nxv4f32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
   def : SVE_2_Op_Imm_Pat<nxv2f64, Op, nxv2f64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
   def : SVE_2_Op_Imm_Pat<nxv8bf16, Op, nxv8bf16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
-
-  def : Pat<(nxv16i8 (AArch64duplaneq8 nxv16i8:$Op1, VectorIndexB32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _B) $Op1, $imm)>;
-  def : Pat<(nxv8i16 (AArch64duplaneq16 nxv8i16:$Op1, VectorIndexH32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
-  def : Pat<(nxv4i32 (AArch64duplaneq32 nxv4i32:$Op1, VectorIndexS32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
-  def : Pat<(nxv2i64 (AArch64duplaneq64 nxv2i64:$Op1, VectorIndexD32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
-  def : Pat<(nxv8f16 (AArch64duplaneq16 nxv8f16:$Op1, VectorIndexH32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
-  def : Pat<(nxv4f32 (AArch64duplaneq32 nxv4f32:$Op1, VectorIndexS32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
-  def : Pat<(nxv2f64 (AArch64duplaneq64 nxv2f64:$Op1, VectorIndexD32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
-  def : Pat<(nxv8bf16 (AArch64duplaneq16 nxv8bf16:$Op1, VectorIndexH32b_timm:$imm)),
-            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
 }
 
 

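The net effect of this commit: lowering now builds an INTRINSIC_WO_CHAIN node
for the existing aarch64_sve_dup_laneq intrinsic, which the SVE_2_Op_Imm_Pat
entries retained in sve2p1_dupq already select, so the custom SDNodes and the
extra patterns become dead weight and are deleted. Condensed shape of the node
construction (mirrors the diff above; the stray MVT::i64 result type is dropped
by a later commit in this series):

  SDValue IID =
      DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
  SDValue LaneImm = DAG.getConstant(*Lane, DL, MVT::i64, /*isTarget=*/true);
  // Selected by the existing dup-laneq patterns; no new SDNode required.
  SDValue Dup = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
                            {ContainerVT, MVT::i64}, {IID, Op1, LaneImm});
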
From 1fc5190829594e00233df8589249cccf657fe951 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 17 Jun 2025 12:36:06 +0000
Subject: [PATCH 5/8] Use end lane index in 8b test

---
 llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
index 61c3b76bd2466..40d4d0ff60148 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -5,12 +5,12 @@ define void @dupq_i8_256b(ptr %addr) #0 {
 ; CHECK-LABEL: dupq_i8_256b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    dupq z0.b, z0.b[11]
+; CHECK-NEXT:    dupq z0.b, z0.b[15]
 ; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
   %load = load <32 x i8>, ptr %addr
-  %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
-                                                                              i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
+  %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15,
+                                                                              i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   store <32 x i8> %splat.lanes, ptr %addr
   ret void
 }

From 31ee7259c42a4cb785e60c0e22b5484bc62e798c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 17 Jun 2025 12:37:14 +0000
Subject: [PATCH 6/8] Formatting

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 125852337c19a..279e6cf5f0549 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30039,14 +30039,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
 
     if (Subtarget->hasSVE2p1()) {
       if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
-        SDValue IID = DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL,
-                                      MVT::i64);
+        SDValue IID =
+            DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
         return convertFromScalableVector(
-          DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
-                               {ContainerVT, MVT::i64},
-                               {IID, Op1,
-                               DAG.getConstant(*Lane, DL, MVT::i64,
-                                               /*isTarget=*/true)}));
+            DAG, VT,
+            DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ContainerVT, MVT::i64},
+                        {IID, Op1,
+                         DAG.getConstant(*Lane, DL, MVT::i64,
+                                         /*isTarget=*/true)}));
       }
     }
   }

From 6b380b0b7812e33ab8de4979d724864ca648ba46 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 17 Jun 2025 14:49:05 +0000
Subject: [PATCH 7/8] Single loop

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 279e6cf5f0549..1e7e40ab663be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13436,19 +13436,20 @@ static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
   unsigned Lane = (unsigned)M[0];
   unsigned Segments = VT.getFixedSizeInBits() / 128;
   unsigned SegmentElts = VT.getVectorNumElements() / Segments;
+
+  // Make sure there are no size changes.
   if (SegmentElts * Segments != M.size())
     return std::nullopt;
 
-  for (unsigned I = 0; I < Segments; ++I) {
-    unsigned Broadcast = (unsigned)M[I * SegmentElts];
-    if (Broadcast - (I * SegmentElts) > SegmentElts)
+  // Check that the first index corresponds to one of the lanes in the first
+  // segment.
+  if ((unsigned)M[0] >= SegmentElts)
+    return std::nullopt;
+
+  // Check that all lanes match the first, adjusted for segment.
+  for (unsigned I = 0; I < M.size(); ++I)
+    if ((unsigned)M[I] != ((unsigned)M[0] + ((I / SegmentElts) * SegmentElts)))
       return std::nullopt;
-    for (unsigned J = 0; J < SegmentElts; ++J) {
-      int Idx = M[(I * SegmentElts) + J];
-      if ((unsigned)Idx != Broadcast)
-        return std::nullopt;
-    }
-  }
 
   return Lane;
 }
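
With the check reduced to a single loop it is easy to exercise stand-alone. A
minimal sketch using plain STL types (the function name, the EltBits parameter
and the driver are illustrative; the real code takes ArrayRef<int> and an EVT,
and undef lanes, encoded as -1, are rejected by the equality check after the
unsigned cast):

  #include <cstdio>
  #include <optional>
  #include <vector>

  // Mirrors the loop above: every index must equal M[0] shifted up by one
  // segment's worth of elements per 128-bit segment.
  std::optional<unsigned> isDUPQMaskSketch(const std::vector<int> &M,
                                           unsigned EltBits) {
    unsigned Segments = (M.size() * EltBits) / 128;
    if (Segments == 0 || Segments * (M.size() / Segments) != M.size())
      return std::nullopt;
    unsigned SegmentElts = M.size() / Segments;
    if ((unsigned)M[0] >= SegmentElts) // first index must be in segment 0
      return std::nullopt;
    for (unsigned I = 0; I != M.size(); ++I)
      if ((unsigned)M[I] != (unsigned)M[0] + (I / SegmentElts) * SegmentElts)
        return std::nullopt;
    return (unsigned)M[0];
  }

  int main() {
    // dupq_i64_256b: <0,0,2,2> over i64 lanes -> splat of lane 0.
    if (auto Lane = isDUPQMaskSketch({0, 0, 2, 2}, 64))
      std::printf("lane %u\n", *Lane); // prints "lane 0"
    // Segments broadcasting different relative lanes are rejected.
    if (!isDUPQMaskSketch({0, 0, 3, 3}, 64))
      std::printf("rejected\n");
  }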

From ddcf5d5f66649b37be0d7aeccb9604984ec180f5 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 17 Jun 2025 16:17:34 +0000
Subject: [PATCH 8/8] Drop extra VT

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1e7e40ab663be..b4779a1e2b604 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30044,7 +30044,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
             DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
         return convertFromScalableVector(
             DAG, VT,
-            DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ContainerVT, MVT::i64},
+            DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
                         {IID, Op1,
                          DAG.getConstant(*Lane, DL, MVT::i64,
                                          /*isTarget=*/true)}));


