[llvm] [AArch64] Eliminate XTN/SSHLL for vector splats (PR #180913)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 11 07:51:31 PST 2026
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/180913
>From 2a0b530660d9deff9e04cb4ddc31969f896fa174 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 11 Feb 2026 11:18:27 +0200
Subject: [PATCH 1/2] [AArch64] Eliminate XTN/SSHLL for vector splats
Combine:
sext(duplane(insert_subvector(undef, trunc(X), 0), idx))
Into:
duplane(X, idx)
This avoids XTN/SSHLL instruction sequences that occur when splatting
elements from boolean vectors after type legalization, which is common
when using shufflevector with comparison results.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 56 +++++++++++
.../AArch64/vec-combine-dup-trunc-sext.ll | 93 +++++++++++++++++++
2 files changed, 149 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d130ca7b56ec0..576a582445bd4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24022,6 +24022,59 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}
+// Fold sext(duplane(insert_subvector(undef, trunc(X), 0), idx)) into
+// duplane(X, idx). X must be a fixed-length vector with the same type as the
+// sext result, and the duplane, insert_subvector and trunc must each have one
+// use. A 64-bit X is first inserted into an undef vector with twice the lanes.
+// This eliminates XTN/SSHLL sequences when splatting from boolean vectors.
+static SDValue performSExtDuplaneTruncCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Dup = N->getOperand(0);
+ unsigned DupOpc = Dup.getOpcode();
+ if (!Dup->hasOneUse() ||
+ (DupOpc != AArch64ISD::DUPLANE8 && DupOpc != AArch64ISD::DUPLANE16 &&
+ DupOpc != AArch64ISD::DUPLANE32))
+ return SDValue();
+
+ SDValue Insert = Dup.getOperand(0);
+ if (!Insert.hasOneUse() || Insert.getOpcode() != ISD::INSERT_SUBVECTOR ||
+ !Insert.getOperand(0).isUndef() || !isNullConstant(Insert.getOperand(2)))
+ return SDValue();
+
+ SDValue Trunc = Insert.getOperand(1);
+ if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ SDValue Src = Trunc.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = N->getValueType(0);
+ if (SrcVT != DstVT || !SrcVT.isFixedLengthVector())
+ return SDValue();
+
+ unsigned NewDupOpc;
+ switch (SrcVT.getScalarSizeInBits()) {
+ case 16:
+ NewDupOpc = AArch64ISD::DUPLANE16;
+ break;
+ case 32:
+ NewDupOpc = AArch64ISD::DUPLANE32;
+ break;
+ case 64:
+ NewDupOpc = AArch64ISD::DUPLANE64;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ EVT WideVT = SrcVT.is64BitVector()
+ ? SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext())
+ : SrcVT;
+ SDValue WideSrc =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT), Src,
+ DAG.getConstant(0, DL, MVT::i64));
+ return DAG.getNode(NewDupOpc, DL, DstVT, WideSrc, Dup.getOperand(1));
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -24071,6 +24124,9 @@ static SDValue performExtendCombine(SDNode *N,
NewAnyExtend);
}
+ if (SDValue R = performSExtDuplaneTruncCombine(N, DAG))
+ return R;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
new file mode 100644
index 0000000000000..521539e5e41ac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <4 x i32> @dup_trunc_sext_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: dup v0.4s, v0.s[2]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <4 x i32> %a, %b
+ %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @dup_trunc_sext_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <2 x i64> %a, %b
+ %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 0, i32 0>
+ %sel = select <2 x i1> %splat, <2 x i64> %x, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+define <4 x float> @dup_trunc_sext_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: dup v0.4s, v0.s[3]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = fcmp ole <4 x float> %a, %b
+ %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %sel = select <4 x i1> %splat, <4 x float> %x, <4 x float> %y
+ ret <4 x float> %sel
+}
+
+define <2 x double> @dup_trunc_sext_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v0.2d, v0.d[1]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = fcmp ole <2 x double> %a, %b
+ %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 1, i32 1>
+ %sel = select <2 x i1> %splat, <2 x double> %x, <2 x double> %y
+ ret <2 x double> %sel
+}
+
+define <4 x i32> @dup_trunc_sext_v4i32_idx0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v4i32_idx0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <4 x i32> %a, %b
+ %splat = shufflevector <4 x i1> %cmp, <4 x i1> poison, <4 x i32> zeroinitializer
+ %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i32> @dup_trunc_sext_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: dup_trunc_sext_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: dup v0.2s, v0.s[1]
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <2 x i32> %a, %b
+ %splat = shufflevector <2 x i1> %cmp, <2 x i1> poison, <2 x i32> <i32 1, i32 1>
+ %sel = select <2 x i1> %splat, <2 x i32> %x, <2 x i32> %y
+ ret <2 x i32> %sel
+}
+
+define <8 x i16> @dup_trunc_sext_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: dup_trunc_sext_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: dup v0.8h, v0.h[2]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <8 x i16> %a, %b
+ %splat = shufflevector <8 x i1> %cmp, <8 x i1> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %sel = select <8 x i1> %splat, <8 x i16> %x, <8 x i16> %y
+ ret <8 x i16> %sel
+}
>From f1f77e4d1e963d5bed198bb812a36640a5cdb8c5 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 11 Feb 2026 17:40:15 +0200
Subject: [PATCH 2/2] Address comments
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 7 +++++++
.../AArch64/vec-combine-dup-trunc-sext.ll | 16 ++++++++++++++++
2 files changed, 23 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 576a582445bd4..a019baea207d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24050,6 +24050,13 @@ static SDValue performSExtDuplaneTruncCombine(SDNode *N, SelectionDAG &DAG) {
if (SrcVT != DstVT || !SrcVT.isFixedLengthVector())
return SDValue();
+ // Verify that Src is already sign-extended from the truncated bit width.
+ EVT TruncVT = Trunc.getValueType();
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ unsigned TruncBits = TruncVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(Src) <= SrcBits - TruncBits)
+ return SDValue();
+
unsigned NewDupOpc;
switch (SrcVT.getScalarSizeInBits()) {
case 16:
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
index 521539e5e41ac..d593724c09b63 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-dup-trunc-sext.ll
@@ -91,3 +91,19 @@ define <8 x i16> @dup_trunc_sext_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %x,
%sel = select <8 x i1> %splat, <8 x i16> %x, <8 x i16> %y
ret <8 x i16> %sel
}
+
+define <4 x i32> @negative_arbitrary_input(<4 x i32> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: negative_arbitrary_input:
+; CHECK: // %bb.0:
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: dup v0.4h, v0.h[2]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+ %trunc = trunc <4 x i32> %mask to <4 x i1>
+ %splat = shufflevector <4 x i1> %trunc, <4 x i1> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %sel = select <4 x i1> %splat, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %sel
+}
More information about the llvm-commits
mailing list