[llvm] [AArch64] Use SVE2 bit-sel instructions for some binary patterns. (PR #147544)

Ricardo Jesus via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 8 08:12:35 PDT 2025


https://github.com/rj-jesus created https://github.com/llvm/llvm-project/pull/147544

We can use NBSL/BSL2N to implement the following operations via the
corresponding identities:
* EON(a, b) = BSL2N(a, a, b) = BSL2N(b, b, a)
* NAND(a, b) = NBSL(a, b, b) = NBSL(b, a, a)
* NOR(a, b) = NBSL(a, b, a) = NBSL(b, a, b)
* ORN(a, b) = BSL2N(a, b, a)

Most of these operations are currently lowered into at least two instructions
because we don't have dedicated Neon/SVE instructions for them. With the
appropriate NBSL/BSL2N pattern, we can lower them to a single instruction.

P.S. We can also use NBSL to implement an unpredicated NOT(a) =
NBSL(a, a, a). However, because of the tied register constraint, this
may not always be profitable.
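
For reference, here is a minimal C sketch (not part of the patch) that checks
the identities above bit-for-bit, assuming the usual SVE bit-select semantics
BSL(a, b, c) = (a & c) | (b & ~c), BSL2N(a, b, c) = (a & c) | (~b & ~c), and
NBSL = NOT(BSL):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed per-bit semantics: BSL takes the first operand where the control
   bit (third operand) is set, the second operand otherwise.              */
static uint64_t bsl(uint64_t a, uint64_t b, uint64_t c)   { return (a & c) | (b & ~c); }
static uint64_t bsl2n(uint64_t a, uint64_t b, uint64_t c) { return (a & c) | (~b & ~c); }
static uint64_t nbsl(uint64_t a, uint64_t b, uint64_t c)  { return ~bsl(a, b, c); }

int main(void) {
  uint64_t a = 0xF0F0A5A5C3C33C3CULL, b = 0xFF00FF0055AA55AAULL;
  assert(bsl2n(a, a, b) == ~(a ^ b)); /* EON(a, b)  = BSL2N(a, a, b) */
  assert(bsl2n(b, b, a) == ~(a ^ b)); /* EON(a, b)  = BSL2N(b, b, a) */
  assert(nbsl(a, b, b)  == ~(a & b)); /* NAND(a, b) = NBSL(a, b, b)  */
  assert(nbsl(a, b, a)  == ~(a | b)); /* NOR(a, b)  = NBSL(a, b, a)  */
  assert(bsl2n(a, b, a) == (a | ~b)); /* ORN(a, b)  = BSL2N(a, b, a) */
  assert(nbsl(a, a, a)  == ~a);       /* NOT(a)     = NBSL(a, a, a)  */
  puts("identities hold");
  return 0;
}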


From 46785cbdf84d42328ec56ed88c170ca8cc546f2f Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 8 Jul 2025 04:19:01 -0700
Subject: [PATCH 1/2] Add tests.

---
 llvm/test/CodeGen/AArch64/bsl.ll      | 85 +++++++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/eor3.ll     | 48 ++++++++++++++-
 llvm/test/CodeGen/AArch64/sve2-bsl.ll | 64 ++++++++++++++++++++
 3 files changed, 196 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/bsl.ll b/llvm/test/CodeGen/AArch64/bsl.ll
index 5a270bc71cfc1..8e402ed23d085 100644
--- a/llvm/test/CodeGen/AArch64/bsl.ll
+++ b/llvm/test/CodeGen/AArch64/bsl.ll
@@ -431,3 +431,88 @@ define <4 x i8> @bsl2n_v4i8(<4 x i8> %0, <4 x i8> %1, <4 x i8> %2) {
   %7 = or <4 x i8> %4, %6
   ret <4 x i8> %7
 }
+
+; NOT (a) has a dedicated instruction (MVN).
+define <2 x i64> @not_q(<2 x i64> %0) #0 {
+; NEON-LABEL: not_q:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mvn v0.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; SVE2-LABEL: not_q:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    ret
+  %2 = xor <2 x i64> %0, splat (i64 -1)
+  ret <2 x i64> %2
+}
+
+; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
+define <2 x i64> @nand_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: nand_q:
+; NEON:       // %bb.0:
+; NEON-NEXT:    and v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    mvn v0.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; SVE2-LABEL: nand_q:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    and v0.16b, v1.16b, v0.16b
+; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    ret
+  %3 = and <2 x i64> %1, %0
+  %4 = xor <2 x i64> %3, splat (i64 -1)
+  ret <2 x i64> %4
+}
+
+; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
+define <2 x i64> @nor_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: nor_q:
+; NEON:       // %bb.0:
+; NEON-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    mvn v0.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; SVE2-LABEL: nor_q:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    orr v0.16b, v1.16b, v0.16b
+; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    ret
+  %3 = or <2 x i64> %1, %0
+  %4 = xor <2 x i64> %3, splat (i64 -1)
+  ret <2 x i64> %4
+}
+
+; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
+define <2 x i64> @eon_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: eon_q:
+; NEON:       // %bb.0:
+; NEON-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NEON-NEXT:    mvn v0.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; SVE2-LABEL: eon_q:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
+; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    ret
+  %3 = xor <2 x i64> %0, %1
+  %4 = xor <2 x i64> %3, splat (i64 -1)
+  ret <2 x i64> %4
+}
+
+; ORN (a, b) has a dedicated instruction (ORN).
+define <2 x i64> @orn_q(<2 x i64> %0, <2 x i64> %1) #0 {
+; NEON-LABEL: orn_q:
+; NEON:       // %bb.0:
+; NEON-NEXT:    orn v0.16b, v0.16b, v1.16b
+; NEON-NEXT:    ret
+;
+; SVE2-LABEL: orn_q:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    orn v0.16b, v0.16b, v1.16b
+; SVE2-NEXT:    ret
+  %3 = xor <2 x i64> %1, splat (i64 -1)
+  %4 = or <2 x i64> %0, %3
+  ret <2 x i64> %4
+}
diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll
index b89d9d608575c..a2631681847dc 100644
--- a/llvm/test/CodeGen/AArch64/eor3.ll
+++ b/llvm/test/CodeGen/AArch64/eor3.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
 ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
 ; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3,+sve2 < %s | FileCheck --check-prefix=SHA3-SVE2 %s
 
 define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
 ; SHA3-LABEL: eor3_16x8_left:
@@ -24,6 +24,11 @@ define <16 x i8> @eor3_16x8_left(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
 ; SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    mov v0.16b, v2.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_16x8_left:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <16 x i8> %0, %1
   %5 = xor <16 x i8> %2, %4
   ret <16 x i8> %5
@@ -49,6 +54,11 @@ define <16 x i8> @eor3_16x8_right(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
 ; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    mov v0.16b, v1.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_16x8_right:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <16 x i8> %1, %2
   %5 = xor <16 x i8> %4, %0
   ret <16 x i8> %5
@@ -74,6 +84,11 @@ define <8 x i16> @eor3_8x16_left(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
 ; SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    mov v0.16b, v2.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_8x16_left:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <8 x i16> %0, %1
   %5 = xor <8 x i16> %2, %4
   ret <8 x i16> %5
@@ -99,6 +114,11 @@ define <8 x i16> @eor3_8x16_right(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2) {
 ; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    mov v0.16b, v1.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_8x16_right:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <8 x i16> %1, %2
   %5 = xor <8 x i16> %4, %0
   ret <8 x i16> %5
@@ -124,6 +144,11 @@ define <4 x i32> @eor3_4x32_left(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
 ; SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    mov v0.16b, v2.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_4x32_left:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <4 x i32> %0, %1
   %5 = xor <4 x i32> %2, %4
   ret <4 x i32> %5
@@ -149,6 +174,11 @@ define <4 x i32> @eor3_4x32_right(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) {
 ; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    mov v0.16b, v1.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_4x32_right:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <4 x i32> %1, %2
   %5 = xor <4 x i32> %4, %0
   ret <4 x i32> %5
@@ -174,6 +204,11 @@ define <2 x i64> @eor3_2x64_left(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
 ; SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    mov v0.16b, v2.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_2x64_left:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v0.16b, v1.16b, v2.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <2 x i64> %0, %1
   %5 = xor <2 x i64> %2, %4
   ret <2 x i64> %5
@@ -199,6 +234,11 @@ define <2 x i64> @eor3_2x64_right(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
 ; SVE2-NEXT:    eor3 z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    mov v0.16b, v1.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_2x64_right:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor3 v0.16b, v1.16b, v2.16b, v0.16b
+; SHA3-SVE2-NEXT:    ret
   %4 = xor <2 x i64> %1, %2
   %5 = xor <2 x i64> %4, %0
   ret <2 x i64> %5
@@ -222,6 +262,12 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) {
 ; SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; SVE2-NEXT:    mvn v0.16b, v0.16b
 ; SVE2-NEXT:    ret
+;
+; SHA3-SVE2-LABEL: eor3_vnot:
+; SHA3-SVE2:       // %bb.0:
+; SHA3-SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
+; SHA3-SVE2-NEXT:    mvn v0.16b, v0.16b
+; SHA3-SVE2-NEXT:    ret
   %3 = xor <2 x i64> %0, <i64 -1, i64 -1>
   %4 = xor <2 x i64> %3, %1
   ret <2 x i64> %4
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 8aedeac18f64a..660d0c85a4b7c 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -312,3 +312,67 @@ entry:
   %t3 = xor <vscale x 4 x i32> %t2, %b
   ret <vscale x 4 x i32> %t3
 }
+
+; NOT (a) = NBSL (a, a, a).
+; We don't have a pattern for this right now because the tied register
+; constraint can lead to worse code gen.
+define <vscale x 2 x i64> @not(<vscale x 2 x i64> %0) #0 {
+; CHECK-LABEL: not:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %2 = xor <vscale x 2 x i64> %0, splat (i64 -1)
+  ret <vscale x 2 x i64> %2
+}
+
+; NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a).
+define <vscale x 2 x i64> @nand(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: nand:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z0.d, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %3 = and <vscale x 2 x i64> %1, %0
+  %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+  ret <vscale x 2 x i64> %4
+}
+
+; NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b).
+define <vscale x 2 x i64> @nor(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: nor:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z0.d, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %3 = or <vscale x 2 x i64> %1, %0
+  %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+  ret <vscale x 2 x i64> %4
+}
+
+; EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a).
+define <vscale x 2 x i64> @eon(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: eon:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor3 z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %3 = xor <vscale x 2 x i64> %0, %1
+  %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
+  ret <vscale x 2 x i64> %4
+}
+
+; ORN (a, b) = BSL2N (a, b, a).
+define <vscale x 2 x i64> @orn(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-LABEL: orn:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %3 = xor <vscale x 2 x i64> %1, splat (i64 -1)
+  %4 = or <vscale x 2 x i64> %0, %3
+  ret <vscale x 2 x i64> %4
+}

From 7f5c069de223eedf28f2f9efa603287194bc27b4 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 8 Jul 2025 04:19:17 -0700
Subject: [PATCH 2/2] [AArch64] Use SVE2 bit-sel instructions for some binary
 patterns.

We can use NBSL/BSL2N to implement the following operations via the
corresponding identities:
* EON(a, b) = BSL2N(a, a, b)
* NAND(a, b) = NBSL(a, b, b) = NBSL(b, a, a)
* NOR(a, b) = NBSL(a, b, a) = NBSL(b, a, b)
* ORN(a, b) = BSL2N(a, b, a)

These operations are currently lowered into at least two instructions
because we don't have dedicated Neon/SVE instructions for them. With the
appropriate NBSL/BSL2N pattern, we can lower them to a single instruction.

P.S. We can also use NBSL to implement an unpredicated NOT(a) =
NBSL(a, a, a). However, because of the tied register constraint, this
may not always be profitable.
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 30 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/bsl.ll              | 18 +++++++----
 llvm/test/CodeGen/AArch64/eor3.ll             | 12 +++++---
 .../test/CodeGen/AArch64/sve-pred-selectop.ll | 12 ++------
 llvm/test/CodeGen/AArch64/sve2-bsl.ll         | 15 +++-------
 5 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 261df563bb2a9..8f02fc0b647ac 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4034,6 +4034,36 @@ let Predicates = [HasSVE2_or_SME] in {
   defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", AArch64bsl2n>;
   defm NBSL_ZZZZ  : sve2_int_bitwise_ternary_op<0b111, "nbsl",  AArch64nbsl>;
 
+  multiclass binary_bitwise<ValueType VT, SDPatternOperator InOp, SDPatternOperator OutOp> {
+    def : Pat<(InOp VT:$op1, VT:$op2), (OutOp $op1, $op2)>;
+
+    def : Pat<(SVEType<VT>.DSub (InOp V64:$op1, V64:$op2)),
+              (EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op1), dsub),
+                                     (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.DSub $op2), dsub)), dsub)>;
+
+    def : Pat<(SVEType<VT>.ZSub (InOp V128:$op1, V128:$op2)),
+              (EXTRACT_SUBREG (OutOp (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op1), zsub),
+                                     (INSERT_SUBREG (IMPLICIT_DEF), (SVEType<VT>.ZSub $op2), zsub)), zsub)>;
+  }
+
+  foreach VT = [nxv16i8, nxv8i16, nxv4i32, nxv2i64] in {
+    // EON (a, b) = BSL2N (a, a, b) = BSL2N (b, b, a)
+    defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (xor node:$op1, node:$op2))>,
+                              OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op1, $op2)>>;
+
+    // NAND (a, b) = NBSL (a, b, b) = NBSL (b, a, a)
+    defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (and node:$op1, node:$op2))>,
+                              OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op1)>>;
+
+    // NOR (a, b) = NBSL (a, b, a) = NBSL (b, a, b)
+    defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (vnot (or node:$op1, node:$op2))>,
+                              OutPatFrag<(ops node:$op1, node:$op2), (NBSL_ZZZZ $op2, $op1, $op2)>>;
+
+    // ORN (a, b) = BSL2N (a, b, a)
+    defm : binary_bitwise<VT, PatFrag<(ops node:$op1, node:$op2), (or node:$op1, (vnot node:$op2))>,
+                              OutPatFrag<(ops node:$op1, node:$op2), (BSL2N_ZZZZ $op1, $op2, $op1)>>;
+  }
+
   // SVE2 bitwise xor and rotate right by immediate
   defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
 
diff --git a/llvm/test/CodeGen/AArch64/bsl.ll b/llvm/test/CodeGen/AArch64/bsl.ll
index 8e402ed23d085..df6b6f75b8935 100644
--- a/llvm/test/CodeGen/AArch64/bsl.ll
+++ b/llvm/test/CodeGen/AArch64/bsl.ll
@@ -457,8 +457,10 @@ define <2 x i64> @nand_q(<2 x i64> %0, <2 x i64> %1) #0 {
 ;
 ; SVE2-LABEL: nand_q:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    and v0.16b, v1.16b, v0.16b
-; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    nbsl z0.d, z0.d, z1.d, z1.d
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
   %3 = and <2 x i64> %1, %0
   %4 = xor <2 x i64> %3, splat (i64 -1)
@@ -475,8 +477,10 @@ define <2 x i64> @nor_q(<2 x i64> %0, <2 x i64> %1) #0 {
 ;
 ; SVE2-LABEL: nor_q:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    orr v0.16b, v1.16b, v0.16b
-; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    nbsl z0.d, z0.d, z1.d, z0.d
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
   %3 = or <2 x i64> %1, %0
   %4 = xor <2 x i64> %3, splat (i64 -1)
@@ -493,8 +497,10 @@ define <2 x i64> @eon_q(<2 x i64> %0, <2 x i64> %1) #0 {
 ;
 ; SVE2-LABEL: eon_q:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
-; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    bsl2n z0.d, z0.d, z0.d, z1.d
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
   %3 = xor <2 x i64> %0, %1
   %4 = xor <2 x i64> %3, splat (i64 -1)
diff --git a/llvm/test/CodeGen/AArch64/eor3.ll b/llvm/test/CodeGen/AArch64/eor3.ll
index a2631681847dc..eccd09131b525 100644
--- a/llvm/test/CodeGen/AArch64/eor3.ll
+++ b/llvm/test/CodeGen/AArch64/eor3.ll
@@ -259,14 +259,18 @@ define <2 x i64> @eor3_vnot(<2 x i64> %0, <2 x i64> %1) {
 ;
 ; SVE2-LABEL: eor3_vnot:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
-; SVE2-NEXT:    mvn v0.16b, v0.16b
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    bsl2n z0.d, z0.d, z0.d, z1.d
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
 ;
 ; SHA3-SVE2-LABEL: eor3_vnot:
 ; SHA3-SVE2:       // %bb.0:
-; SHA3-SVE2-NEXT:    eor v0.16b, v0.16b, v1.16b
-; SHA3-SVE2-NEXT:    mvn v0.16b, v0.16b
+; SHA3-SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SHA3-SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SHA3-SVE2-NEXT:    bsl2n z0.d, z0.d, z0.d, z1.d
+; SHA3-SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SHA3-SVE2-NEXT:    ret
   %3 = xor <2 x i64> %0, <i64 -1, i64 -1>
   %4 = xor <2 x i64> %3, %1
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
index 30ec2de2bd9cc..9a78726c450d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -322,11 +322,9 @@ entry:
 define <vscale x 4 x i32> @ornot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
 ; CHECK-LABEL: ornot_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    bsl2n z1.d, z1.d, z2.d, z1.d
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    orr z1.d, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -340,11 +338,9 @@ entry:
 define <vscale x 8 x i16> @ornot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
 ; CHECK-LABEL: ornot_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    bsl2n z1.d, z1.d, z2.d, z1.d
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    orr z1.d, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -358,11 +354,9 @@ entry:
 define <vscale x 16 x i8> @ornot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
 ; CHECK-LABEL: ornot_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.b, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    bsl2n z1.d, z1.d, z2.d, z1.d
 ; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    orr z1.d, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 660d0c85a4b7c..6cfe66eb8e633 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -330,9 +330,7 @@ define <vscale x 2 x i64> @not(<vscale x 2 x i64> %0) #0 {
 define <vscale x 2 x i64> @nand(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: nand:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    and z0.d, z1.d, z0.d
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z1.d
 ; CHECK-NEXT:    ret
   %3 = and <vscale x 2 x i64> %1, %0
   %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
@@ -343,9 +341,7 @@ define <vscale x 2 x i64> @nand(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0
 define <vscale x 2 x i64> @nor(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: nor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    orr z0.d, z1.d, z0.d
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
   %3 = or <vscale x 2 x i64> %1, %0
   %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
@@ -356,8 +352,7 @@ define <vscale x 2 x i64> @nor(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0
 define <vscale x 2 x i64> @eon(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: eon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    eor3 z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    bsl2n z0.d, z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %3 = xor <vscale x 2 x i64> %0, %1
   %4 = xor <vscale x 2 x i64> %3, splat (i64 -1)
@@ -368,9 +363,7 @@ define <vscale x 2 x i64> @eon(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0
 define <vscale x 2 x i64> @orn(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
 ; CHECK-LABEL: orn:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    bsl2n z0.d, z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
   %3 = xor <vscale x 2 x i64> %1, splat (i64 -1)
   %4 = or <vscale x 2 x i64> %0, %3


