[llvm] [AArch64] Combine (and/or X, (dup (not Y))) -> (bic/orn X, (dup Y)) (PR #175739)

Piotr Fusik via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 2 00:32:54 PST 2026


https://github.com/pfusik updated https://github.com/llvm/llvm-project/pull/175739

>From 8b2e680a59e0e055b29a267242c224760a890d51 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Tue, 13 Jan 2026 11:39:21 +0100
Subject: [PATCH 1/4] [AArch64][test] Combine (and/or X, (dup (not Y))) ->
 (bic/orn X, (dup Y))

---
 .../AArch64/neon-bitwise-instructions.ll      | 364 ++++++++++++++++++
 1 file changed, 364 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 01aea72d77114..3aebedd2bdb1c 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -2868,3 +2868,367 @@ entry:
   %vqaddq_v2.i26515 = tail call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1 >, <8 x i16> %vbsl5.i)
   ret <8 x i16> %vqaddq_v2.i26515
 }
+
+define <8 x i8> @and_dup_not_v8i8(<8 x i8> %a, i8 %m) {
+; CHECK-LABEL: and_dup_not_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8b, w8
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <8 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %and = and <8 x i8> %a, %shuffle
+  ret <8 x i8> %and
+}
+
+define <8 x i8> @and_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
+; CHECK-LABEL: and_dup_not_v8i8_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8b, w8
+; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <8 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %and = and <8 x i8> %shuffle, %a
+  ret <8 x i8> %and
+}
+
+define <16 x i8> @and_dup_not_v16i8(<16 x i8> %a, i8 %m) {
+; CHECK-LABEL: and_dup_not_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <16 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+  %and = and <16 x i8> %a, %shuffle
+  ret <16 x i8> %and
+}
+
+define <16 x i8> @and_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
+; CHECK-LABEL: and_dup_not_v16i8_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <16 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+  %and = and <16 x i8> %shuffle, %a
+  ret <16 x i8> %and
+}
+
+define <4 x i16> @and_dup_not_v4i16(<4 x i16> %a, i16 %m) {
+; CHECK-LABEL: and_dup_not_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4h, w8
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <4 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
+  %and = and <4 x i16> %a, %shuffle
+  ret <4 x i16> %and
+}
+
+define <4 x i16> @and_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
+; CHECK-LABEL: and_dup_not_v4i16_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4h, w8
+; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <4 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
+  %and = and <4 x i16> %shuffle, %a
+  ret <4 x i16> %and
+}
+
+define <8 x i16> @and_dup_not_v8i16(<8 x i16> %a, i16 %m) {
+; CHECK-LABEL: and_dup_not_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <8 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %and = and <8 x i16> %a, %shuffle
+  ret <8 x i16> %and
+}
+
+define <8 x i16> @and_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
+; CHECK-LABEL: and_dup_not_v8i16_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <8 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %and = and <8 x i16> %shuffle, %a
+  ret <8 x i16> %and
+}
+
+define <2 x i32> @and_dup_not_v2i32(<2 x i32> %a, i32 %m) {
+; CHECK-LABEL: and_dup_not_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <2 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
+  %and = and <2 x i32> %a, %shuffle
+  ret <2 x i32> %and
+}
+
+define <2 x i32> @and_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
+; CHECK-LABEL: and_dup_not_v2i32_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <2 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
+  %and = and <2 x i32> %shuffle, %a
+  ret <2 x i32> %and
+}
+
+define <4 x i32> @and_dup_not_v4i32(<4 x i32> %a, i32 %m) {
+; CHECK-LABEL: and_dup_not_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <4 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %and = and <4 x i32> %a, %shuffle
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @and_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
+; CHECK-LABEL: and_dup_not_v4i32_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <4 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %and = and <4 x i32> %shuffle, %a
+  ret <4 x i32> %and
+}
+
+define <2 x i64> @and_dup_not_v2i64(<2 x i64> %a, i64 %m) {
+; CHECK-LABEL: and_dup_not_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i64 %m, -1
+  %insert = insertelement <2 x i64> poison, i64 %not, i64 0
+  %shuffle = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer
+  %and = and <2 x i64> %a, %shuffle
+  ret <2 x i64> %and
+}
+
+define <8 x i8> @or_dup_not_v8i8(<8 x i8> %a, i8 %m) {
+; CHECK-LABEL: or_dup_not_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8b, w8
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <8 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %or = or <8 x i8> %a, %shuffle
+  ret <8 x i8> %or
+}
+
+define <8 x i8> @or_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
+; CHECK-LABEL: or_dup_not_v8i8_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8b, w8
+; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <8 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %or = or <8 x i8> %shuffle, %a
+  ret <8 x i8> %or
+}
+
+define <16 x i8> @or_dup_not_v16i8(<16 x i8> %a, i8 %m) {
+; CHECK-LABEL: or_dup_not_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <16 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+  %or = or <16 x i8> %a, %shuffle
+  ret <16 x i8> %or
+}
+
+define <16 x i8> @or_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
+; CHECK-LABEL: or_dup_not_v16i8_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i8 %m, -1
+  %insert = insertelement <16 x i8> poison, i8 %not, i64 0
+  %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+  %or = or <16 x i8> %shuffle, %a
+  ret <16 x i8> %or
+}
+
+define <4 x i16> @or_dup_not_v4i16(<4 x i16> %a, i16 %m) {
+; CHECK-LABEL: or_dup_not_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4h, w8
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <4 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
+  %or = or <4 x i16> %a, %shuffle
+  ret <4 x i16> %or
+}
+
+define <4 x i16> @or_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
+; CHECK-LABEL: or_dup_not_v4i16_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4h, w8
+; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <4 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
+  %or = or <4 x i16> %shuffle, %a
+  ret <4 x i16> %or
+}
+
+define <8 x i16> @or_dup_not_v8i16(<8 x i16> %a, i16 %m) {
+; CHECK-LABEL: or_dup_not_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <8 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %or = or <8 x i16> %a, %shuffle
+  ret <8 x i16> %or
+}
+
+define <8 x i16> @or_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
+; CHECK-LABEL: or_dup_not_v8i16_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i16 %m, -1
+  %insert = insertelement <8 x i16> poison, i16 %not, i64 0
+  %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %or = or <8 x i16> %shuffle, %a
+  ret <8 x i16> %or
+}
+
+define <2 x i32> @or_dup_not_v2i32(<2 x i32> %a, i32 %m) {
+; CHECK-LABEL: or_dup_not_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <2 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
+  %or = or <2 x i32> %a, %shuffle
+  ret <2 x i32> %or
+}
+
+define <2 x i32> @or_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
+; CHECK-LABEL: or_dup_not_v2i32_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <2 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
+  %or = or <2 x i32> %shuffle, %a
+  ret <2 x i32> %or
+}
+
+define <4 x i32> @or_dup_not_v4i32(<4 x i32> %a, i32 %m) {
+; CHECK-LABEL: or_dup_not_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <4 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %or = or <4 x i32> %a, %shuffle
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @or_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
+; CHECK-LABEL: or_dup_not_v4i32_swapped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %not = xor i32 %m, -1
+  %insert = insertelement <4 x i32> poison, i32 %not, i64 0
+  %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %or = or <4 x i32> %shuffle, %a
+  ret <4 x i32> %or
+}
+
+define <2 x i64> @or_dup_not_v2i64(<2 x i64> %a, i64 %m) {
+; CHECK-LABEL: or_dup_not_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not = xor i64 %m, -1
+  %insert = insertelement <2 x i64> poison, i64 %not, i64 0
+  %shuffle = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer
+  %or = or <2 x i64> %a, %shuffle
+  ret <2 x i64> %or
+}

>From 03191e24d780c9a6a4f986faa24def4edcb9a53f Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Tue, 13 Jan 2026 11:39:40 +0100
Subject: [PATCH 2/4] [AArch64] Combine (and/or X, (dup (not Y))) -> (bic/orn
 X, (dup Y))

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  17 +
 .../AArch64/neon-bitwise-instructions.ll      | 338 +++++++++++-------
 2 files changed, 229 insertions(+), 126 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ace85b04595b8..8dccc892bf2dd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7856,6 +7856,23 @@ def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
 def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
+multiclass BicOrnDupGpr<ValueType vectype, RegisterClass regtype> { // Select (and/or X, (dup (not Y))) as (bic/orn X, (dup Y)) for a GPR scalar Y.
+  defvar vecreg = !cast<RegisterOperand>("V" # vectype.Size); // Register operand named after the vector width in bits ("V" # Size).
+  defvar bvectype = "v" # !srl(vectype.Size, 3) # "i8"; // Byte-vector type name with Size / 8 lanes; picks the BIC/ORN variant below.
+  def : Pat<(vectype (and vecreg:$Rn, (AArch64dup (not regtype:$Rm)))), // AND with a splat of ~Rm becomes BIC of Rn with a splat of Rm.
+            (!cast<Instruction>("BIC" # bvectype) vectype:$Rn, (!cast<Instruction>("DUP" # vectype # "gpr") regtype:$Rm))>;
+  def : Pat<(vectype (or  vecreg:$Rn, (AArch64dup (not regtype:$Rm)))), // OR with a splat of ~Rm becomes ORN of Rn with a splat of Rm.
+            (!cast<Instruction>("ORN" # bvectype) vectype:$Rn, (!cast<Instruction>("DUP" # vectype # "gpr") regtype:$Rm))>;
+}
+
+defm : BicOrnDupGpr<v8i8,  GPR32>; // Vectors with 8-, 16- and 32-bit elements take the scalar from a GPR32.
+defm : BicOrnDupGpr<v16i8, GPR32>;
+defm : BicOrnDupGpr<v4i16, GPR32>;
+defm : BicOrnDupGpr<v8i16, GPR32>;
+defm : BicOrnDupGpr<v2i32, GPR32>;
+defm : BicOrnDupGpr<v4i32, GPR32>;
+defm : BicOrnDupGpr<v2i64, GPR64>; // Only the 64-bit element type takes the scalar from a GPR64.
+
 // SMOV and UMOV definitions, with some extra patterns for convenience
 defm SMOV : SMov;
 defm UMOV : UMov;
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 3aebedd2bdb1c..576fe9d1181e4 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -2870,12 +2870,18 @@ entry:
 }
 
 define <8 x i8> @and_dup_not_v8i8(<8 x i8> %a, i8 %m) {
-; CHECK-LABEL: and_dup_not_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8b, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8b, w8
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <8 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -2884,12 +2890,18 @@ define <8 x i8> @and_dup_not_v8i8(<8 x i8> %a, i8 %m) {
 }
 
 define <8 x i8> @and_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
-; CHECK-LABEL: and_dup_not_v8i8_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v8i8_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8b, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v8i8_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8b, w8
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <8 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -2898,12 +2910,18 @@ define <8 x i8> @and_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
 }
 
 define <16 x i8> @and_dup_not_v16i8(<16 x i8> %a, i8 %m) {
-; CHECK-LABEL: and_dup_not_v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.16b, w8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <16 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -2912,12 +2930,18 @@ define <16 x i8> @and_dup_not_v16i8(<16 x i8> %a, i8 %m) {
 }
 
 define <16 x i8> @and_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
-; CHECK-LABEL: and_dup_not_v16i8_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v16i8_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v16i8_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.16b, w8
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <16 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -2926,12 +2950,18 @@ define <16 x i8> @and_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
 }
 
 define <4 x i16> @and_dup_not_v4i16(<4 x i16> %a, i16 %m) {
-; CHECK-LABEL: and_dup_not_v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4h, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4h, w8
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <4 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -2940,12 +2970,18 @@ define <4 x i16> @and_dup_not_v4i16(<4 x i16> %a, i16 %m) {
 }
 
 define <4 x i16> @and_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
-; CHECK-LABEL: and_dup_not_v4i16_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v4i16_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4h, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v4i16_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4h, w8
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <4 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -2954,12 +2990,18 @@ define <4 x i16> @and_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
 }
 
 define <8 x i16> @and_dup_not_v8i16(<8 x i16> %a, i16 %m) {
-; CHECK-LABEL: and_dup_not_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8h, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8h, w8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <8 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -2968,12 +3010,18 @@ define <8 x i16> @and_dup_not_v8i16(<8 x i16> %a, i16 %m) {
 }
 
 define <8 x i16> @and_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
-; CHECK-LABEL: and_dup_not_v8i16_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v8i16_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8h, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v8i16_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8h, w8
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <8 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -2984,9 +3032,8 @@ define <8 x i16> @and_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
 define <2 x i32> @and_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 ; CHECK-LABEL: and_dup_not_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    bic v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
@@ -2998,9 +3045,8 @@ define <2 x i32> @and_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 define <2 x i32> @and_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 ; CHECK-LABEL: and_dup_not_v2i32_swapped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    bic v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
@@ -3012,9 +3058,8 @@ define <2 x i32> @and_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 define <4 x i32> @and_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 ; CHECK-LABEL: and_dup_not_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
@@ -3026,9 +3071,8 @@ define <4 x i32> @and_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 define <4 x i32> @and_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 ; CHECK-LABEL: and_dup_not_v4i32_swapped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
@@ -3040,9 +3084,8 @@ define <4 x i32> @and_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 define <2 x i64> @and_dup_not_v2i64(<2 x i64> %a, i64 %m) {
 ; CHECK-LABEL: and_dup_not_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x0
-; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i64 %m, -1
   %insert = insertelement <2 x i64> poison, i64 %not, i64 0
@@ -3052,12 +3095,18 @@ define <2 x i64> @and_dup_not_v2i64(<2 x i64> %a, i64 %m) {
 }
 
 define <8 x i8> @or_dup_not_v8i8(<8 x i8> %a, i8 %m) {
-; CHECK-LABEL: or_dup_not_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8b, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8b, w8
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <8 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -3066,12 +3115,18 @@ define <8 x i8> @or_dup_not_v8i8(<8 x i8> %a, i8 %m) {
 }
 
 define <8 x i8> @or_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
-; CHECK-LABEL: or_dup_not_v8i8_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8b, w8
-; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v8i8_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8b, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v8i8_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8b, w8
+; CHECK-GI-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <8 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -3080,12 +3135,18 @@ define <8 x i8> @or_dup_not_v8i8_swapped(<8 x i8> %a, i8 %m) {
 }
 
 define <16 x i8> @or_dup_not_v16i8(<16 x i8> %a, i8 %m) {
-; CHECK-LABEL: or_dup_not_v16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v16i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v16i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.16b, w8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <16 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -3094,12 +3155,18 @@ define <16 x i8> @or_dup_not_v16i8(<16 x i8> %a, i8 %m) {
 }
 
 define <16 x i8> @or_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
-; CHECK-LABEL: or_dup_not_v16i8_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.16b, w8
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v16i8_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v16i8_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.16b, w8
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i8 %m, -1
   %insert = insertelement <16 x i8> poison, i8 %not, i64 0
   %shuffle = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -3108,12 +3175,18 @@ define <16 x i8> @or_dup_not_v16i8_swapped(<16 x i8> %a, i8 %m) {
 }
 
 define <4 x i16> @or_dup_not_v4i16(<4 x i16> %a, i16 %m) {
-; CHECK-LABEL: or_dup_not_v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4h, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4h, w8
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <4 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -3122,12 +3195,18 @@ define <4 x i16> @or_dup_not_v4i16(<4 x i16> %a, i16 %m) {
 }
 
 define <4 x i16> @or_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
-; CHECK-LABEL: or_dup_not_v4i16_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v4i16_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4h, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v4i16_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4h, w8
+; CHECK-GI-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <4 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -3136,12 +3215,18 @@ define <4 x i16> @or_dup_not_v4i16_swapped(<4 x i16> %a, i16 %m) {
 }
 
 define <8 x i16> @or_dup_not_v8i16(<8 x i16> %a, i16 %m) {
-; CHECK-LABEL: or_dup_not_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v8i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8h, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8h, w8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <8 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -3150,12 +3235,18 @@ define <8 x i16> @or_dup_not_v8i16(<8 x i16> %a, i16 %m) {
 }
 
 define <8 x i16> @or_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
-; CHECK-LABEL: or_dup_not_v8i16_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v8i16_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.8h, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v8i16_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.8h, w8
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i16 %m, -1
   %insert = insertelement <8 x i16> poison, i16 %not, i64 0
   %shuffle = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -3166,9 +3257,8 @@ define <8 x i16> @or_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
 define <2 x i32> @or_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 ; CHECK-LABEL: or_dup_not_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    orn v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
@@ -3180,9 +3270,8 @@ define <2 x i32> @or_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 define <2 x i32> @or_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 ; CHECK-LABEL: or_dup_not_v2i32_swapped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    orn v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
@@ -3194,9 +3283,8 @@ define <2 x i32> @or_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 define <4 x i32> @or_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 ; CHECK-LABEL: or_dup_not_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
@@ -3208,9 +3296,8 @@ define <4 x i32> @or_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 define <4 x i32> @or_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 ; CHECK-LABEL: or_dup_not_v4i32_swapped:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
@@ -3222,9 +3309,8 @@ define <4 x i32> @or_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 define <2 x i64> @or_dup_not_v2i64(<2 x i64> %a, i64 %m) {
 ; CHECK-LABEL: or_dup_not_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x0
-; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %not = xor i64 %m, -1
   %insert = insertelement <2 x i64> poison, i64 %not, i64 0

>From e317495ea9bb9b5df8115ce32b2ddf7e88da551b Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Fri, 27 Feb 2026 14:45:45 +0100
Subject: [PATCH 3/4] [AArch64] Move the transform to AArch64DAGToDAGISel

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 26 ++++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 17 ------------
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 67f4e127b0c87..cedbc2b46dfd1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -15,6 +15,7 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Function.h" // To access function attributes.
 #include "llvm/IR/GlobalValue.h"
@@ -432,6 +433,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
                            const SDValue &OldBase, const SDValue &OldOffset,
                            unsigned Scale);
 
+  bool tryBicOrnDup(SDNode *N);
   bool tryBitfieldExtractOp(SDNode *N);
   bool tryBitfieldExtractOpFromSExt(SDNode *N);
   bool tryBitfieldInsertOp(SDNode *N);
@@ -2815,6 +2817,23 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
   return true;
 }
 
+bool AArch64DAGToDAGISel::tryBicOrnDup(SDNode *N) {
+  unsigned Opc = N->getOpcode();
+  assert(Opc == ISD::AND || Opc == ISD::OR);
+  using namespace SDPatternMatch;
+  SDValue X, Y;
+  if (!sd_match(N, m_c_BinOp(Opc, m_Node(AArch64ISD::DUP, m_Not(m_Value(X))),
+                             m_Value(Y))))
+    return false;
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue Dup = CurDAG->getNode(AArch64ISD::DUP, DL, VT, X);
+  SDValue Ops[] = {CurDAG->getNOT(DL, Dup, VT), Y};
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
+}
+
 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
   assert(N->getOpcode() == ISD::SIGN_EXTEND);
 
@@ -4870,8 +4889,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     break;
   }
 
-  case ISD::SRL:
   case ISD::AND:
+    if (tryBicOrnDup(Node))
+      return;
+    [[fallthrough]];
+  case ISD::SRL:
   case ISD::SRA:
   case ISD::SIGN_EXTEND_INREG:
     if (tryBitfieldExtractOp(Node))
@@ -4891,6 +4913,8 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::OR:
+    if (tryBicOrnDup(Node))
+      return;
     if (tryBitfieldInsertOp(Node))
       return;
     if (trySelectXAR(Node))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 8dccc892bf2dd..ace85b04595b8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7856,23 +7856,6 @@ def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
 def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
-multiclass BicOrnDupGpr<ValueType vectype, RegisterClass regtype> {
-  defvar vecreg = !cast<RegisterOperand>("V" # vectype.Size);
-  defvar bvectype = "v" # !srl(vectype.Size, 3) # "i8";
-  def : Pat<(vectype (and vecreg:$Rn, (AArch64dup (not regtype:$Rm)))),
-            (!cast<Instruction>("BIC" # bvectype) vectype:$Rn, (!cast<Instruction>("DUP" # vectype # "gpr") regtype:$Rm))>;
-  def : Pat<(vectype (or  vecreg:$Rn, (AArch64dup (not regtype:$Rm)))),
-            (!cast<Instruction>("ORN" # bvectype) vectype:$Rn, (!cast<Instruction>("DUP" # vectype # "gpr") regtype:$Rm))>;
-}
-
-defm : BicOrnDupGpr<v8i8,  GPR32>;
-defm : BicOrnDupGpr<v16i8, GPR32>;
-defm : BicOrnDupGpr<v4i16, GPR32>;
-defm : BicOrnDupGpr<v8i16, GPR32>;
-defm : BicOrnDupGpr<v2i32, GPR32>;
-defm : BicOrnDupGpr<v4i32, GPR32>;
-defm : BicOrnDupGpr<v2i64, GPR64>;
-
 // SMOV and UMOV definitions, with some extra patterns for convenience
 defm SMOV : SMov;
 defm UMOV : UMov;

>From 415c522df9600d78fb0d53dcc59921c93d7aeccd Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Mon, 2 Mar 2026 09:05:26 +0100
Subject: [PATCH 4/4] [AArch64] Move the transform to AArch64ISelLowering

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  26 +--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  25 +++
 .../AArch64/neon-bitwise-instructions.ll      | 170 ++++++++++++------
 3 files changed, 146 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index cedbc2b46dfd1..67f4e127b0c87 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -15,7 +15,6 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Function.h" // To access function attributes.
 #include "llvm/IR/GlobalValue.h"
@@ -433,7 +432,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
                            const SDValue &OldBase, const SDValue &OldOffset,
                            unsigned Scale);
 
-  bool tryBicOrnDup(SDNode *N);
   bool tryBitfieldExtractOp(SDNode *N);
   bool tryBitfieldExtractOpFromSExt(SDNode *N);
   bool tryBitfieldInsertOp(SDNode *N);
@@ -2817,23 +2815,6 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
   return true;
 }
 
-bool AArch64DAGToDAGISel::tryBicOrnDup(SDNode *N) {
-  unsigned Opc = N->getOpcode();
-  assert(Opc == ISD::AND || Opc == ISD::OR);
-  using namespace SDPatternMatch;
-  SDValue X, Y;
-  if (!sd_match(N, m_c_BinOp(Opc, m_Node(AArch64ISD::DUP, m_Not(m_Value(X))),
-                             m_Value(Y))))
-    return false;
-
-  EVT VT = N->getValueType(0);
-  SDLoc DL(N);
-  SDValue Dup = CurDAG->getNode(AArch64ISD::DUP, DL, VT, X);
-  SDValue Ops[] = {CurDAG->getNOT(DL, Dup, VT), Y};
-  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
-  return true;
-}
-
 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
   assert(N->getOpcode() == ISD::SIGN_EXTEND);
 
@@ -4889,11 +4870,8 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     break;
   }
 
-  case ISD::AND:
-    if (tryBicOrnDup(Node))
-      return;
-    [[fallthrough]];
   case ISD::SRL:
+  case ISD::AND:
   case ISD::SRA:
   case ISD::SIGN_EXTEND_INREG:
     if (tryBitfieldExtractOp(Node))
@@ -4913,8 +4891,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::OR:
-    if (tryBicOrnDup(Node))
-      return;
     if (tryBitfieldInsertOp(Node))
       return;
     if (trySelectXAR(Node))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb6e9146e3839..dc5756f950cc9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21024,6 +21024,25 @@ static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG,
                      DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
 }
 
+// (and/or X, (splat (not Y))) -> (and/or X, (not (splat Y))), for either
+// operand order, so that it gets selected as (bic/orn X, (dup Y))
+static SDValue performANDORDUPNOTCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opc = N->getOpcode();
+  assert(Opc == ISD::AND || Opc == ISD::OR);
+  using namespace llvm::SDPatternMatch;
+  SDValue X, Y; // X: the non-splat operand; Y: the scalar under the NOT.
+  if (!sd_match(N, m_c_BinOp(Opc, m_Value(X),
+                             m_Shuffle(m_InsertElt(m_Poison(),
+                                                   m_Not(m_Value(Y)), m_Zero()),
+                                       m_Poison()))))
+    return SDValue(); // No match: hand back an empty SDValue.
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue Not = DAG.getNOT(DL, DAG.getSplat(VT, DL, Y), VT); // NOT the splat.
+  return DAG.getNode(Opc, DL, VT, X, Not); // Same opcode, NOT now on vector.
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget,
                                 const AArch64TargetLowering &TLI) {
@@ -21035,6 +21054,9 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   if (SDValue R = tryCombineToREV(N, DAG, DCI))
     return R;
 
+  if (SDValue R = performANDORDUPNOTCombine(N, DAG))
+    return R;
+
   return SDValue();
 }
 
@@ -21241,6 +21263,9 @@ static SDValue performANDCombine(SDNode *N,
   if (SDValue R = performANDSETCCCombine(N,DCI))
     return R;
 
+  if (SDValue R = performANDORDUPNOTCombine(N, DAG))
+    return R;
+
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 576fe9d1181e4..ac7c6c00e0533 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -3030,11 +3030,18 @@ define <8 x i16> @and_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
 }
 
 define <2 x i32> @and_dup_not_v2i32(<2 x i32> %a, i32 %m) {
-; CHECK-LABEL: and_dup_not_v2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    bic v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v2i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v2i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.2s, w8
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3043,11 +3050,18 @@ define <2 x i32> @and_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 }
 
 define <2 x i32> @and_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
-; CHECK-LABEL: and_dup_not_v2i32_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    bic v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v2i32_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    bic v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v2i32_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.2s, w8
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3056,11 +3070,18 @@ define <2 x i32> @and_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 }
 
 define <4 x i32> @and_dup_not_v4i32(<4 x i32> %a, i32 %m) {
-; CHECK-LABEL: and_dup_not_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.4s, w0
-; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4s, w8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3069,11 +3090,18 @@ define <4 x i32> @and_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 }
 
 define <4 x i32> @and_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
-; CHECK-LABEL: and_dup_not_v4i32_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.4s, w0
-; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v4i32_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v4i32_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4s, w8
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3082,11 +3110,18 @@ define <4 x i32> @and_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 }
 
 define <2 x i64> @and_dup_not_v2i64(<2 x i64> %a, i64 %m) {
-; CHECK-LABEL: and_dup_not_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2d, x0
-; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: and_dup_not_v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2d, x0
+; CHECK-SD-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: and_dup_not_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn x8, x0
+; CHECK-GI-NEXT:    dup v1.2d, x8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i64 %m, -1
   %insert = insertelement <2 x i64> poison, i64 %not, i64 0
   %shuffle = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer
@@ -3255,11 +3290,18 @@ define <8 x i16> @or_dup_not_v8i16_swapped(<8 x i16> %a, i16 %m) {
 }
 
 define <2 x i32> @or_dup_not_v2i32(<2 x i32> %a, i32 %m) {
-; CHECK-LABEL: or_dup_not_v2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    orn v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v2i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v2i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.2s, w8
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3268,11 +3310,18 @@ define <2 x i32> @or_dup_not_v2i32(<2 x i32> %a, i32 %m) {
 }
 
 define <2 x i32> @or_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
-; CHECK-LABEL: or_dup_not_v2i32_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    orn v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v2i32_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    orn v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v2i32_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.2s, w8
+; CHECK-GI-NEXT:    orr v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <2 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer
@@ -3281,11 +3330,18 @@ define <2 x i32> @or_dup_not_v2i32_swapped(<2 x i32> %a, i32 %m) {
 }
 
 define <4 x i32> @or_dup_not_v4i32(<4 x i32> %a, i32 %m) {
-; CHECK-LABEL: or_dup_not_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.4s, w0
-; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4s, w8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3294,11 +3350,18 @@ define <4 x i32> @or_dup_not_v4i32(<4 x i32> %a, i32 %m) {
 }
 
 define <4 x i32> @or_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
-; CHECK-LABEL: or_dup_not_v4i32_swapped:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.4s, w0
-; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v4i32_swapped:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.4s, w0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v4i32_swapped:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn w8, w0
+; CHECK-GI-NEXT:    dup v1.4s, w8
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i32 %m, -1
   %insert = insertelement <4 x i32> poison, i32 %not, i64 0
   %shuffle = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -3307,11 +3370,18 @@ define <4 x i32> @or_dup_not_v4i32_swapped(<4 x i32> %a, i32 %m) {
 }
 
 define <2 x i64> @or_dup_not_v2i64(<2 x i64> %a, i64 %m) {
-; CHECK-LABEL: or_dup_not_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2d, x0
-; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: or_dup_not_v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2d, x0
+; CHECK-SD-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: or_dup_not_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvn x8, x0
+; CHECK-GI-NEXT:    dup v1.2d, x8
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %not = xor i64 %m, -1
   %insert = insertelement <2 x i64> poison, i64 %not, i64 0
   %shuffle = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer



More information about the llvm-commits mailing list