[llvm] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for add/sub. (PR #88413)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 17 14:33:04 PDT 2024


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/88413

>From 1f98cf82c3b9cbfb12d9ed5f88d62b986518f0d8 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Thu, 11 Apr 2024 16:34:42 +0000
Subject: [PATCH 1/3] [AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for
 add/sub.

Allow the or/and/and pattern to fold to a BSL instruction for scalable vectors.
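
For illustration, this is the scalable-vector pattern the combine now recognises
(taken from the new tests added below):

  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
  %left_bits = and <vscale x 4 x i32> %neg_cond, %left
  %right_bits = and <vscale x 4 x i32> %min_cond, %right
  %bsl = or <vscale x 4 x i32> %right_bits, %left_bits

With +sve2 the and/and/or sequence now selects a single bsl (the subr for the
negation of the condition remains), as shown by the CHECK lines in the tests.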
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   6 +-
 llvm/test/CodeGen/AArch64/sve2-bitselect.ll   | 254 ++++++++++++++++++
 2 files changed, 258 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-bitselect.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80181a77c9d238..d9aabb64125a4f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17942,11 +17942,13 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       } else
         continue;
 
-      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
+      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()) &&
+          !ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
         continue;
 
       // Constant ones is always righthand operand of the Add.
-      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
+      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()) &&
+          !ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
         continue;
 
       if (Sub.getOperand(1) != Add.getOperand(0))
diff --git a/llvm/test/CodeGen/AArch64/sve2-bitselect.ll b/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
new file mode 100644
index 00000000000000..9ceeffc2e5d2ab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; Check that an expanded vbsl(vneg(pre_cond), left, right) lowers to a VBSL
+; during ISEL.
+;
+; Subtly different from a plain vector bit select: operand representing the
+; condition has been negated (-v, not to be confused with bitwise_not(v)).
+
+; Each vbsl_neg_cond_xxxx tests one of the 16 permutations of the operands.
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0000 = or <vscale x 4 x i32> %right_bits_0, %left_bits_0
+  ret <vscale x 4 x i32> %bsl0000
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0001 = or <vscale x 4 x i32> %right_bits_0, %left_bits_1
+  ret <vscale x 4 x i32> %bsl0001
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0010 = or <vscale x 4 x i32> %right_bits_1, %left_bits_0
+  ret <vscale x 4 x i32> %bsl0010
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0011 = or <vscale x 4 x i32> %right_bits_1, %left_bits_1
+  ret <vscale x 4 x i32> %bsl0011
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0100 = or <vscale x 4 x i32> %left_bits_0, %right_bits_0
+  ret <vscale x 4 x i32> %bsl0100
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0101 = or <vscale x 4 x i32> %left_bits_0, %right_bits_1
+  ret <vscale x 4 x i32> %bsl0101
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0110 = or <vscale x 4 x i32> %left_bits_1, %right_bits_0
+  ret <vscale x 4 x i32> %bsl0110
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_0111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
+  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
+  %bsl0111 = or <vscale x 4 x i32> %left_bits_1, %right_bits_1
+  ret <vscale x 4 x i32> %bsl0111
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1000 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_0
+  ret <vscale x 4 x i32> %bsl1000
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1001 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_1
+  ret <vscale x 4 x i32> %bsl1001
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1010 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_0
+  ret <vscale x 4 x i32> %bsl1010
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1011 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_1
+  ret <vscale x 4 x i32> %bsl1011
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1100 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_0
+  ret <vscale x 4 x i32> %bsl1100
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1101 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_1
+  ret <vscale x 4 x i32> %bsl1101
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
+  %bsl1110 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_0
+  ret <vscale x 4 x i32> %bsl1110
+}
+
+define <vscale x 4 x i32> @vbsl_neg_cond_1111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
+  %bsl1111 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_1
+  ret <vscale x 4 x i32> %bsl1111
+}
+
+attributes #0 = { "target-features"="+sve2" }

>From 24b4ad67f1773516471a7d5919243aabcdd1b94f Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 15 Apr 2024 11:07:53 +0000
Subject: [PATCH 2/3] Resolved remarks.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +-
 llvm/test/CodeGen/AArch64/sve2-bitselect.ll   | 254 ------------------
 llvm/test/CodeGen/AArch64/sve2-bsl.ll         |  15 ++
 3 files changed, 17 insertions(+), 256 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/sve2-bitselect.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d9aabb64125a4f..828869dfd3c222 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17942,12 +17942,12 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       } else
         continue;
 
-      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()) &&
+      if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()) &&
           !ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
         continue;
 
       // Constant ones is always righthand operand of the Add.
-      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()) &&
+      if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()) &&
           !ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
         continue;
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-bitselect.ll b/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
deleted file mode 100644
index 9ceeffc2e5d2ab..00000000000000
--- a/llvm/test/CodeGen/AArch64/sve2-bitselect.ll
+++ /dev/null
@@ -1,254 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s | FileCheck %s
-
-target triple = "aarch64"
-
-; Check that an expanded vbsl(vneg(pre_cond), left, right) lowers to a VBSL
-; during ISEL.
-;
-; Subtly different from a plain vector bit select: operand representing the
-; condition has been negated (-v, not to be confused with bitwise_not(v)).
-
-; Each vbsl_neg_cond_xxxx tests one of the 16 permutations of the operands.
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0000:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
-  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
-  %bsl0000 = or <vscale x 4 x i32> %right_bits_0, %left_bits_0
-  ret <vscale x 4 x i32> %bsl0000
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0001:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
-  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
-  %bsl0001 = or <vscale x 4 x i32> %right_bits_0, %left_bits_1
-  ret <vscale x 4 x i32> %bsl0001
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0010:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
-  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
-  %bsl0010 = or <vscale x 4 x i32> %right_bits_1, %left_bits_0
-  ret <vscale x 4 x i32> %bsl0010
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0011:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
-  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
-  %bsl0011 = or <vscale x 4 x i32> %right_bits_1, %left_bits_1
-  ret <vscale x 4 x i32> %bsl0011
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0100:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
-  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
-  %bsl0100 = or <vscale x 4 x i32> %left_bits_0, %right_bits_0
-  ret <vscale x 4 x i32> %bsl0100
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0101:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
-  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
-  %bsl0101 = or <vscale x 4 x i32> %left_bits_0, %right_bits_1
-  ret <vscale x 4 x i32> %bsl0101
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0110:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
-  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
-  %bsl0110 = or <vscale x 4 x i32> %left_bits_1, %right_bits_0
-  ret <vscale x 4 x i32> %bsl0110
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_0111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_0111:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %left_bits_1 = and <vscale x 4 x i32> %left, %neg_cond
-  %right_bits_1 = and <vscale x 4 x i32> %right, %min_cond
-  %bsl0111 = or <vscale x 4 x i32> %left_bits_1, %right_bits_1
-  ret <vscale x 4 x i32> %bsl0111
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1000(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1000:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
-  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
-  %bsl1000 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_0
-  ret <vscale x 4 x i32> %bsl1000
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1001(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1001:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
-  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
-  %bsl1001 = or <vscale x 4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_1
-  ret <vscale x 4 x i32> %bsl1001
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1010(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1010:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
-  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
-  %bsl1010 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_0
-  ret <vscale x 4 x i32> %bsl1010
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1011(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1011:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
-  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
-  %bsl1011 = or <vscale x 4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_1
-  ret <vscale x 4 x i32> %bsl1011
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1100(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1100:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
-  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
-  %bsl1100 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_0
-  ret <vscale x 4 x i32> %bsl1100
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1101(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1101:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_0 = and <vscale x 4 x i32> %min_cond, %left
-  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
-  %bsl1101 = or <vscale x 4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_1
-  ret <vscale x 4 x i32> %bsl1101
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1110(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1110:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
-  %flip_cond_right_bits_0 = and <vscale x 4 x i32> %neg_cond, %right
-  %bsl1110 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_0
-  ret <vscale x 4 x i32> %bsl1110
-}
-
-define <vscale x 4 x i32> @vbsl_neg_cond_1111(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
-; CHECK-LABEL: vbsl_neg_cond_1111:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
-  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
-  %flip_cond_left_bits_1 = and <vscale x 4 x i32> %left, %min_cond
-  %flip_cond_right_bits_1 = and <vscale x 4 x i32> %right, %neg_cond
-  %bsl1111 = or <vscale x 4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_1
-  ret <vscale x 4 x i32> %bsl1111
-}
-
-attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 11f67634a3fb2c..23b2622f5f5863 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -13,6 +13,21 @@ define <vscale x 4 x i32> @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
   ret <vscale x 4 x i32> %c
 }
 
+define <vscale x 4 x i32> @bsl_add_sub(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 {
+; CHECK-LABEL: bsl_add_sub:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1)
+  %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left
+  %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right
+  %bsl0000 = or <vscale x 4 x i32> %right_bits_0, %left_bits_0
+  ret <vscale x 4 x i32> %bsl0000
+}
+
 ; we are not expecting bsl instruction here. the constants do not match to fold to bsl.
 define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: no_bsl_fold:

>From dac6e754e18c902ceb2e3d7f8823b88416605a17 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Wed, 17 Apr 2024 21:30:51 +0000
Subject: [PATCH 3/3] Removed isConstantSplatVector duplicates.

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 828869dfd3c222..03aaafce02e041 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17942,13 +17942,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       } else
         continue;
 
-      if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()) &&
-          !ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
+      if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
         continue;
 
       // Constant ones is always righthand operand of the Add.
-      if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()) &&
-          !ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
+      if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
         continue;
 
       if (Sub.getOperand(1) != Add.getOperand(0))


