[llvm] [AArch64] Eliminate XTN/SSHLL for vector splats (PR #180913)

Guy David via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 11 07:51:31 PST 2026


https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/180913

>From 2a0b530660d9deff9e04cb4ddc31969f896fa174 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 11 Feb 2026 11:18:27 +0200
Subject: [PATCH 1/2] [AArch64] Eliminate XTN/SSHLL for vector splats

Combine:
  sext(duplane(insert_subvector(undef, trunc(X), 0), idx))
Into:
  duplane(X, idx)

This avoids XTN/SSHLL instruction sequences that occur when splatting
elements from boolean vectors after type legalization, which is common
when using shufflevector with comparison results.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 56 +++++++++++
 .../AArch64/vec-combine-dup-trunc-sext.ll     | 93 +++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d130ca7b56ec0..576a582445bd4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24022,6 +24022,59 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
 }
 
+// Combine:
+//   sext(duplane(insert_subvector(undef, trunc(X), 0), idx))
+// Into:
+//   duplane(X, idx)
+// This eliminates XTN/SSHLL sequences when splatting from boolean vectors.
+static SDValue performSExtDuplaneTruncCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Dup = N->getOperand(0);
+  unsigned DupOpc = Dup.getOpcode();
+  if (!Dup->hasOneUse() ||
+      (DupOpc != AArch64ISD::DUPLANE8 && DupOpc != AArch64ISD::DUPLANE16 &&
+       DupOpc != AArch64ISD::DUPLANE32))
+    return SDValue();
+
+  SDValue Insert = Dup.getOperand(0);
+  if (!Insert.hasOneUse() || Insert.getOpcode() != ISD::INSERT_SUBVECTOR ||
+      !Insert.getOperand(0).isUndef() || !isNullConstant(Insert.getOperand(2)))
+    return SDValue();
+
+  SDValue Trunc = Insert.getOperand(1);
+  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  SDValue Src = Trunc.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = N->getValueType(0);
+  if (SrcVT != DstVT || !SrcVT.isFixedLengthVector())
+    return SDValue();
+
+  unsigned NewDupOpc;
+  switch (SrcVT.getScalarSizeInBits()) {
+  case 16:
+    NewDupOpc = AArch64ISD::DUPLANE16;
+    break;
+  case 32:
+    NewDupOpc = AArch64ISD::DUPLANE32;
+    break;
+  case 64:
+    NewDupOpc = AArch64ISD::DUPLANE64;
+    break;
+  default:
+    return SDValue();
+  }
+
+  SDLoc DL(N);
+  EVT WideVT = SrcVT.is64BitVector()
+                   ? SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext())
+                   : SrcVT;
+  SDValue WideSrc =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT), Src,
+                  DAG.getConstant(0, DL, MVT::i64));
+  return DAG.getNode(NewDupOpc, DL, DstVT, WideSrc, Dup.getOperand(1));
+}
+
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -24071,6 +24124,9 @@ static SDValue performExtendCombine(SDNode *N,
                        NewAnyExtend);
   }
 
+  if (SDValue R = performSExtDuplaneTruncCombine(N, DAG))
+    return R;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
new file mode 100644
index 0000000000000..521539e5e41ac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <4 x i32> @dup_trunc_sext_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    dup v0.4s, v0.s[2]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %a, %b
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @dup_trunc_sext_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i64> %a, %b
+  %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 0, i32 0>
+  %sel = select <2 x i1> %splat, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+define <4 x float> @dup_trunc_sext_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    dup v0.4s, v0.s[3]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = fcmp ole <4 x float> %a, %b
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %sel = select <4 x i1> %splat, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %sel
+}
+
+define <2 x double> @dup_trunc_sext_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = fcmp ole <2 x double> %a, %b
+  %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 1, i32 1>
+  %sel = select <2 x i1> %splat, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %sel
+}
+
+define <4 x i32> @dup_trunc_sext_v4i32_idx0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4i32_idx0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %a, %b
+  %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> zeroinitializer
+  %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i32> @dup_trunc_sext_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    dup v0.2s, v0.s[1]
+; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i32> %a, %b
+  %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 1, i32 1>
+  %sel = select <2 x i1> %splat, <2 x i32> %x, <2 x i32> %y
+  ret <2 x i32> %sel
+}
+
+define <8 x i16> @dup_trunc_sext_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: dup_trunc_sext_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    dup v0.8h, v0.h[2]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <8 x i16> %a, %b
+  %splat = shufflevector <8 x i1> %cmp, <8 x i1> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %sel = select <8 x i1> %splat, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %sel
+}

>From f1f77e4d1e963d5bed198bb812a36640a5cdb8c5 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 11 Feb 2026 17:40:15 +0200
Subject: [PATCH 2/2] Address comments

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  |  7 +++++++
 .../AArch64/vec-combine-dup-trunc-sext.ll        | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 576a582445bd4..a019baea207d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24050,6 +24050,13 @@ static SDValue performSExtDuplaneTruncCombine(SDNode *N, SelectionDAG &DAG) {
   if (SrcVT != DstVT || !SrcVT.isFixedLengthVector())
     return SDValue();
 
+  // Verify that Src already equals the sign-extension of its low TruncBits bits, i.e. sext(trunc(Src)) == Src.
+  EVT TruncVT = Trunc.getValueType();
+  unsigned SrcBits = SrcVT.getScalarSizeInBits();
+  unsigned TruncBits = TruncVT.getScalarSizeInBits();
+  if (DAG.ComputeNumSignBits(Src) <= SrcBits - TruncBits)
+    return SDValue();
+
   unsigned NewDupOpc;
   switch (SrcVT.getScalarSizeInBits()) {
   case 16:
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
index 521539e5e41ac..d593724c09b63 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
@@ -91,3 +91,19 @@ define <8 x i16> @dup_trunc_sext_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x,
   %sel = select <8 x i1> %splat, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %sel
 }
+
+define <4 x i32> @negative_arbitrary_input(<4 x i32> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: negative_arbitrary_input:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %trunc = trunc <4 x i32> %mask to <4 x i1>
+  %splat = shufflevector <4 x i1> %trunc, <4 x i1> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %sel
+}



More information about the llvm-commits mailing list