[llvm] [AArch64] Improve select dagcombine (PR #169925)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 09:25:09 PST 2025
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/169925
>From 1262bd31f1fe83ba633d5e9cada988d7d17193a8 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 28 Nov 2025 14:27:48 +0000
Subject: [PATCH 1/4] [AArch64] Codegen test for select from canonical
fixed-width AnyOf
---
llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 145 ++++++++++++++++++
1 file changed, 145 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
new file mode 100644
index 0000000000000..5a5b92feb2415
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is canonicalized by InstCombine to
+;; a bitcast to an integer whose bitwidth equals the number of lanes being
+;; reduced, followed by a compare against zero. To select between vectors for
+;; NEON we then need to broadcast the result, but we must be careful when the
+;; bitwidth of the scalar result is smaller than the element size of the
+;; vectors being selected, or we end up with scalarization.
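+;;
+;; For reference, a minimal sketch of the pre-InstCombine form (value names
+;; are illustrative, not taken from this patch):
+;;   %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+;;   %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp)
+;;   %res = select i1 %any, <4 x i32> %b, <4 x i32> %a
+;; InstCombine folds the i1 reduce.or into the bitcast-and-compare pattern
+;; tested below.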
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: movi d3, #0000000000000000
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: movi v4.16b, #15
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: dup v0.16b, v3.b[0]
+; CHECK-NEXT: umov w8, v3.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: and x8, x8, #0xf
+; CHECK-NEXT: bfi x8, x9, #4, #4
+; CHECK-NEXT: umov w9, v0.b[3]
+; CHECK-NEXT: bfi x8, x10, #8, #4
+; CHECK-NEXT: umov w10, v0.b[4]
+; CHECK-NEXT: bfi x8, x9, #12, #4
+; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: bfi x8, x10, #16, #4
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: bfi x8, x9, #20, #4
+; CHECK-NEXT: umov w9, v0.b[8]
+; CHECK-NEXT: bfi x8, x10, #24, #4
+; CHECK-NEXT: lsl w10, w11, #28
+; CHECK-NEXT: umov w11, v0.b[9]
+; CHECK-NEXT: orr x8, x8, x10
+; CHECK-NEXT: and w9, w9, #0xf
+; CHECK-NEXT: umov w10, v0.b[10]
+; CHECK-NEXT: orr x8, x8, x9, lsl #32
+; CHECK-NEXT: and w9, w11, #0xf
+; CHECK-NEXT: umov w11, v0.b[11]
+; CHECK-NEXT: orr x8, x8, x9, lsl #36
+; CHECK-NEXT: and w9, w10, #0xf
+; CHECK-NEXT: umov w10, v0.b[12]
+; CHECK-NEXT: orr x8, x8, x9, lsl #40
+; CHECK-NEXT: and w9, w11, #0xf
+; CHECK-NEXT: umov w11, v0.b[13]
+; CHECK-NEXT: orr x8, x8, x9, lsl #44
+; CHECK-NEXT: and w9, w10, #0xf
+; CHECK-NEXT: umov w10, v0.b[14]
+; CHECK-NEXT: orr x8, x8, x9, lsl #48
+; CHECK-NEXT: and w9, w11, #0xf
+; CHECK-NEXT: orr x8, x8, x9, lsl #52
+; CHECK-NEXT: umov w9, v0.b[15]
+; CHECK-NEXT: and w10, w10, #0xf
+; CHECK-NEXT: orr x8, x8, x10, lsl #56
+; CHECK-NEXT: orr x8, x8, x9, lsl #60
+; CHECK-NEXT: dup v0.2d, x8
+; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+ %cmp.bc = bitcast <4 x i1> %cmp to i4
+ %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: movi d3, #0000000000000000
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: movi v4.16b, #3
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: dup v0.16b, v3.b[0]
+; CHECK-NEXT: umov w8, v3.b[0]
+; CHECK-NEXT: umov w9, v0.b[1]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: umov w12, v0.b[8]
+; CHECK-NEXT: and w8, w8, #0x3
+; CHECK-NEXT: umov w13, v0.b[3]
+; CHECK-NEXT: umov w14, v0.b[4]
+; CHECK-NEXT: umov w15, v0.b[10]
+; CHECK-NEXT: umov w16, v0.b[5]
+; CHECK-NEXT: bfi w8, w9, #2, #2
+; CHECK-NEXT: umov w9, v0.b[9]
+; CHECK-NEXT: ubfiz w11, w11, #14, #2
+; CHECK-NEXT: ubfiz w12, w12, #16, #2
+; CHECK-NEXT: bfi w8, w10, #4, #2
+; CHECK-NEXT: umov w10, v0.b[11]
+; CHECK-NEXT: ubfiz w15, w15, #20, #2
+; CHECK-NEXT: orr w11, w11, w12
+; CHECK-NEXT: umov w12, v0.b[13]
+; CHECK-NEXT: bfi w8, w13, #6, #2
+; CHECK-NEXT: umov w13, v0.b[12]
+; CHECK-NEXT: ubfiz w9, w9, #18, #2
+; CHECK-NEXT: bfi w8, w14, #8, #2
+; CHECK-NEXT: umov w14, v0.b[14]
+; CHECK-NEXT: orr w9, w11, w9
+; CHECK-NEXT: umov w11, v0.b[6]
+; CHECK-NEXT: ubfiz w10, w10, #22, #2
+; CHECK-NEXT: orr w9, w9, w15
+; CHECK-NEXT: ubfiz w13, w13, #24, #2
+; CHECK-NEXT: bfi w8, w16, #10, #2
+; CHECK-NEXT: orr w9, w9, w10
+; CHECK-NEXT: ubfiz w10, w12, #26, #2
+; CHECK-NEXT: orr w9, w9, w13
+; CHECK-NEXT: ubfiz w12, w14, #28, #2
+; CHECK-NEXT: umov w13, v0.b[15]
+; CHECK-NEXT: bfi w8, w11, #12, #2
+; CHECK-NEXT: orr w9, w9, w10
+; CHECK-NEXT: orr w9, w9, w12
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w8, w8, w13, lsl #30
+; CHECK-NEXT: orr x8, x8, x8, lsl #32
+; CHECK-NEXT: dup v0.2d, x8
+; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+ %cmp.bc = bitcast <2 x i1> %cmp to i2
+ %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %res
+}
>From 7c8b20f18b3d692d57b60506fa967ffce221f9aa Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 28 Nov 2025 14:42:38 +0000
Subject: [PATCH 2/4] Avoid choosing a bad ElementCount for splatting the
condition
---
.../Target/AArch64/AArch64ISelLowering.cpp | 5 +
llvm/test/CodeGen/AArch64/expand-select.ll | 50 +++-----
llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 121 ++----------------
3 files changed, 36 insertions(+), 140 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6072fd9d8f242..e6872dfe995d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N,
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
+ // Avoid creating vectors with excessive VFs for small types.
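+ // (For example, an i4 condition from a <4 x i1> AnyOf bitcast selecting
+ // between <4 x i32> vectors; reusing the result's lane count keeps the
+ // mask vector from using more lanes than the result.)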
+ if (DCI.isBeforeLegalize() &&
+ SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
+ NumMaskElts = ResVT.getVectorNumElements();
+
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ca4719d9b6bf..8ad9ea3b7a8d5 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x11, [sp]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x8, x10, [sp, #8]
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: csel x9, x4, x11, ne
-; CHECK-NEXT: stp x9, x8, [x10, #16]
-; CHECK-NEXT: csel x8, x3, x7, ne
-; CHECK-NEXT: csel x9, x2, x6, ne
-; CHECK-NEXT: stp x9, x8, [x10]
+; CHECK-NEXT: ldp x8, x9, [sp, #8]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x10, [sp]
+; CHECK-NEXT: csel x8, x5, x8, eq
+; CHECK-NEXT: csel x10, x4, x10, eq
+; CHECK-NEXT: stp x10, x8, [x9, #16]
+; CHECK-NEXT: csel x8, x3, x7, eq
+; CHECK-NEXT: csel x10, x2, x6, eq
+; CHECK-NEXT: stp x10, x8, [x9]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0x1
-; CHECK-NEXT: ldr x10, [sp, #16]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: ldp x8, x9, [sp]
-; CHECK-NEXT: csel x11, x2, x6, ne
-; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x8, x4, x8, ne
-; CHECK-NEXT: stur x8, [x10, #12]
-; CHECK-NEXT: csel x8, x5, x9, ne
-; CHECK-NEXT: csel x9, x3, x7, ne
-; CHECK-NEXT: str w8, [x10, #20]
-; CHECK-NEXT: str w9, [x10, #8]
+; CHECK-NEXT: ldp x8, x10, [sp]
+; CHECK-NEXT: tst w0, #0x1
+; CHECK-NEXT: ldr x9, [sp, #16]
+; CHECK-NEXT: csel x11, x2, x6, eq
+; CHECK-NEXT: csel x8, x4, x8, eq
+; CHECK-NEXT: str x11, [x9]
+; CHECK-NEXT: stur x8, [x9, #12]
+; CHECK-NEXT: csel x8, x5, x10, eq
+; CHECK-NEXT: csel x10, x3, x7, eq
+; CHECK-NEXT: str w8, [x9, #20]
+; CHECK-NEXT: str w10, [x9, #8]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
index 5a5b92feb2415..43abb6ac9b944 100644
--- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -12,62 +12,13 @@ target triple = "aarch64-linux-gnu"
define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: any_of_select_vf4:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
-; CHECK-NEXT: movi d3, #0000000000000000
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: movi v4.16b, #15
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: dup v0.16b, v3.b[0]
-; CHECK-NEXT: umov w8, v3.b[0]
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[7]
-; CHECK-NEXT: and x8, x8, #0xf
-; CHECK-NEXT: bfi x8, x9, #4, #4
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: bfi x8, x10, #8, #4
-; CHECK-NEXT: umov w10, v0.b[4]
-; CHECK-NEXT: bfi x8, x9, #12, #4
-; CHECK-NEXT: umov w9, v0.b[5]
-; CHECK-NEXT: bfi x8, x10, #16, #4
-; CHECK-NEXT: umov w10, v0.b[6]
-; CHECK-NEXT: bfi x8, x9, #20, #4
-; CHECK-NEXT: umov w9, v0.b[8]
-; CHECK-NEXT: bfi x8, x10, #24, #4
-; CHECK-NEXT: lsl w10, w11, #28
-; CHECK-NEXT: umov w11, v0.b[9]
-; CHECK-NEXT: orr x8, x8, x10
-; CHECK-NEXT: and w9, w9, #0xf
-; CHECK-NEXT: umov w10, v0.b[10]
-; CHECK-NEXT: orr x8, x8, x9, lsl #32
-; CHECK-NEXT: and w9, w11, #0xf
-; CHECK-NEXT: umov w11, v0.b[11]
-; CHECK-NEXT: orr x8, x8, x9, lsl #36
-; CHECK-NEXT: and w9, w10, #0xf
-; CHECK-NEXT: umov w10, v0.b[12]
-; CHECK-NEXT: orr x8, x8, x9, lsl #40
-; CHECK-NEXT: and w9, w11, #0xf
-; CHECK-NEXT: umov w11, v0.b[13]
-; CHECK-NEXT: orr x8, x8, x9, lsl #44
-; CHECK-NEXT: and w9, w10, #0xf
-; CHECK-NEXT: umov w10, v0.b[14]
-; CHECK-NEXT: orr x8, x8, x9, lsl #48
-; CHECK-NEXT: and w9, w11, #0xf
-; CHECK-NEXT: orr x8, x8, x9, lsl #52
-; CHECK-NEXT: umov w9, v0.b[15]
-; CHECK-NEXT: and w10, w10, #0xf
-; CHECK-NEXT: orr x8, x8, x10, lsl #56
-; CHECK-NEXT: orr x8, x8, x9, lsl #60
-; CHECK-NEXT: dup v0.2d, x8
-; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp slt <4 x i32> %mask, zeroinitializer
%cmp.bc = bitcast <4 x i1> %cmp to i4
@@ -79,63 +30,13 @@ define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b)
define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: any_of_select_vf2:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
-; CHECK-NEXT: movi d3, #0000000000000000
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: movi v4.16b, #3
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: dup v0.16b, v3.b[0]
-; CHECK-NEXT: umov w8, v3.b[0]
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[7]
-; CHECK-NEXT: umov w12, v0.b[8]
-; CHECK-NEXT: and w8, w8, #0x3
-; CHECK-NEXT: umov w13, v0.b[3]
-; CHECK-NEXT: umov w14, v0.b[4]
-; CHECK-NEXT: umov w15, v0.b[10]
-; CHECK-NEXT: umov w16, v0.b[5]
-; CHECK-NEXT: bfi w8, w9, #2, #2
-; CHECK-NEXT: umov w9, v0.b[9]
-; CHECK-NEXT: ubfiz w11, w11, #14, #2
-; CHECK-NEXT: ubfiz w12, w12, #16, #2
-; CHECK-NEXT: bfi w8, w10, #4, #2
-; CHECK-NEXT: umov w10, v0.b[11]
-; CHECK-NEXT: ubfiz w15, w15, #20, #2
-; CHECK-NEXT: orr w11, w11, w12
-; CHECK-NEXT: umov w12, v0.b[13]
-; CHECK-NEXT: bfi w8, w13, #6, #2
-; CHECK-NEXT: umov w13, v0.b[12]
-; CHECK-NEXT: ubfiz w9, w9, #18, #2
-; CHECK-NEXT: bfi w8, w14, #8, #2
-; CHECK-NEXT: umov w14, v0.b[14]
-; CHECK-NEXT: orr w9, w11, w9
-; CHECK-NEXT: umov w11, v0.b[6]
-; CHECK-NEXT: ubfiz w10, w10, #22, #2
-; CHECK-NEXT: orr w9, w9, w15
-; CHECK-NEXT: ubfiz w13, w13, #24, #2
-; CHECK-NEXT: bfi w8, w16, #10, #2
-; CHECK-NEXT: orr w9, w9, w10
-; CHECK-NEXT: ubfiz w10, w12, #26, #2
-; CHECK-NEXT: orr w9, w9, w13
-; CHECK-NEXT: ubfiz w12, w14, #28, #2
-; CHECK-NEXT: umov w13, v0.b[15]
-; CHECK-NEXT: bfi w8, w11, #12, #2
-; CHECK-NEXT: orr w9, w9, w10
-; CHECK-NEXT: orr w9, w9, w12
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: orr w8, w8, w13, lsl #30
-; CHECK-NEXT: orr x8, x8, x8, lsl #32
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: dup v0.2d, x8
-; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp slt <2 x i64> %mask, zeroinitializer
%cmp.bc = bitcast <2 x i1> %cmp to i2
>From dce36a4908bf03b2684425ce78afa12ecfb715c9 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 28 Nov 2025 16:56:43 +0000
Subject: [PATCH 3/4] Add suggested wider-than-legal test
---
llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 33 +++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
index 43abb6ac9b944..b74a2c66108d0 100644
--- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -44,3 +44,36 @@ define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b)
%res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %res
}
+
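+;; A wider-than-legal case: here the bitcast condition type (i32) is already
+;; wider than the i8 elements of the vectors being selected, so an
+;; element-size comparison alone would not clamp the mask's lane count.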
+define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: any_of_select_vf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: movi d6, #0000000000000000
+; CHECK-NEXT: and v1.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v7.16b
+; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v16.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v7.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: bfi w9, w8, #16, #16
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v6.4s
+; CHECK-NEXT: dup v1.4s, v0.s[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: bsl v1.16b, v3.16b, v5.16b
+; CHECK-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: ret
+ %cmp = icmp slt <32 x i8> %mask, zeroinitializer
+ %cmp.bc = bitcast <32 x i1> %cmp to i32
+ %cmp.bc.not = icmp eq i32 %cmp.bc, 0
+ %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
+ ret <32 x i8> %res
+}
>From 3be56c05cfdf13edbc47a462495ca12157cf17f9 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 28 Nov 2025 17:01:12 +0000
Subject: [PATCH 4/4] Always use ResVT's element count before legalization
---
.../Target/AArch64/AArch64ISelLowering.cpp | 5 ++--
llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 28 ++++++-------------
2 files changed, 10 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e6872dfe995d8..d2c377b24ca2e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,9 +26965,8 @@ static SDValue performSelectCombine(SDNode *N,
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
- // Avoid creating vectors with excessive VFs for small types.
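- // (For example, an i4 condition from a <4 x i1> AnyOf bitcast selecting
- // between <4 x i32> vectors; reusing the result's lane count keeps the
- // mask vector from using more lanes than the result.)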
- if (DCI.isBeforeLegalize() &&
- SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
+ // Avoid creating vectors with excessive VFs before legalization.
+ if (DCI.isBeforeLegalize())
NumMaskElts = ResVT.getVectorNumElements();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
index b74a2c66108d0..dedd4323f1519 100644
--- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -48,28 +48,16 @@ define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b)
define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: any_of_select_vf32:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT: movi d6, #0000000000000000
-; CHECK-NEXT: and v1.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v7.16b
-; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v16.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v7.16b
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: bfi w9, w8, #16, #16
-; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v6.4s
-; CHECK-NEXT: dup v1.4s, v0.s[0]
+; CHECK-NEXT: umaxv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: dup v1.16b, w8
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: bsl v1.16b, v3.16b, v5.16b
-; CHECK-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: bsl v1.16b, v5.16b, v3.16b
+; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b
; CHECK-NEXT: ret
%cmp = icmp slt <32 x i8> %mask, zeroinitializer
%cmp.bc = bitcast <32 x i1> %cmp to i32