[llvm] [AArch64] Avoid using NEON BSL for streaming[-compatible] functions (PR #95803)

Wed Jun 19 03:27:29 PDT 2024

sdesmalen-arm (https://github.com/sdesmalen-arm) updated the pull request https://github.com/llvm/llvm-project/pull/95803

>From c877c7f23a222209873826d850e999cb43dcecbe Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 29 Apr 2024 14:23:02 +0100
Subject: [PATCH 1/2] [AArch64] Avoid using NEON BSL for streaming[-compatible]
 functions

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  9 +-
 ...e-streaming-mode-fixed-length-bitselect.ll | 99 ++++++++++++-------
 2 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c5c3ef02115ec..9ecd93f4b8cdb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18147,9 +18147,12 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   if (!VT.isVector())
     return SDValue();
 
-  // The combining code works for NEON, SVE2 and SME.
-  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
-      (VT.isScalableVector() && !Subtarget.hasSVE2()))
+  if (VT.isScalableVector() && !Subtarget.hasSVE2())
+    return SDValue();
+
+  if (VT.isFixedLengthVector() &&
+      (!Subtarget.isNeonAvailable() ||
+       TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable())))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index b908dd61f2401..d65e87d5b9756 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -34,39 +34,72 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ;
 ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
-; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    neg w8, w8
-; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
-; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
-; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x2]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w14, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w4, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w13, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    neg w3, w8
+; NONEON-NOSVE-NEXT:    neg w15, w14
+; NONEON-NOSVE-NEXT:    str q4, [sp, #32]
+; NONEON-NOSVE-NEXT:    and w9, w3, w9
+; NONEON-NOSVE-NEXT:    and w15, w15, w4
+; NONEON-NOSVE-NEXT:    str q5, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp w5, w3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp w16, w12, [sp]
+; NONEON-NOSVE-NEXT:    neg w4, w11
+; NONEON-NOSVE-NEXT:    neg w2, w13
+; NONEON-NOSVE-NEXT:    sub w11, w11, #1
+; NONEON-NOSVE-NEXT:    and w3, w4, w3
+; NONEON-NOSVE-NEXT:    and w2, w2, w5
+; NONEON-NOSVE-NEXT:    sub w13, w13, #1
+; NONEON-NOSVE-NEXT:    ldp w6, w4, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    neg w1, w16
+; NONEON-NOSVE-NEXT:    neg w0, w12
+; NONEON-NOSVE-NEXT:    sub w16, w16, #1
+; NONEON-NOSVE-NEXT:    and w1, w1, w6
+; NONEON-NOSVE-NEXT:    and w0, w0, w4
+; NONEON-NOSVE-NEXT:    sub w12, w12, #1
+; NONEON-NOSVE-NEXT:    ldp w5, w6, [sp, #24]
+; NONEON-NOSVE-NEXT:    neg w18, w17
+; NONEON-NOSVE-NEXT:    neg w4, w10
+; NONEON-NOSVE-NEXT:    sub w17, w17, #1
+; NONEON-NOSVE-NEXT:    sub w10, w10, #1
+; NONEON-NOSVE-NEXT:    sub w14, w14, #1
+; NONEON-NOSVE-NEXT:    sub w8, w8, #1
+; NONEON-NOSVE-NEXT:    and w4, w4, w5
+; NONEON-NOSVE-NEXT:    and w18, w18, w6
+; NONEON-NOSVE-NEXT:    ldp w5, w6, [sp, #32]
+; NONEON-NOSVE-NEXT:    and w16, w16, w5
+; NONEON-NOSVE-NEXT:    and w12, w12, w6
+; NONEON-NOSVE-NEXT:    ldp w5, w6, [sp, #40]
+; NONEON-NOSVE-NEXT:    and w10, w10, w5
+; NONEON-NOSVE-NEXT:    and w17, w17, w6
+; NONEON-NOSVE-NEXT:    orr w17, w17, w18
+; NONEON-NOSVE-NEXT:    orr w10, w10, w4
+; NONEON-NOSVE-NEXT:    ldp w18, w4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w5, w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, w17, [sp, #104]
+; NONEON-NOSVE-NEXT:    orr w10, w12, w0
+; NONEON-NOSVE-NEXT:    orr w12, w16, w1
+; NONEON-NOSVE-NEXT:    and w11, w11, w4
+; NONEON-NOSVE-NEXT:    stp w12, w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    and w10, w13, w18
+; NONEON-NOSVE-NEXT:    orr w11, w11, w3
+; NONEON-NOSVE-NEXT:    and w12, w14, w6
+; NONEON-NOSVE-NEXT:    orr w10, w10, w2
+; NONEON-NOSVE-NEXT:    and w8, w8, w5
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    orr w10, w12, w15
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr

>From 1e1d2d41318cc81876e90cf72ca6dbf3e2c918c7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 19 Jun 2024 11:26:30 +0100
Subject: [PATCH 2/2] Simplify expression

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9ecd93f4b8cdb..885581375eb3b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18151,8 +18151,7 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     return SDValue();
 
   if (VT.isFixedLengthVector() &&
-      (!Subtarget.isNeonAvailable() ||
-       TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable())))
+      (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);