[llvm] [X86] Sink NOT to be folded into ANDN (PR #172329)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 06:51:03 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/172329
>From 532f68e510f7a48524fa41adcbcb08806927f597 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Mon, 15 Dec 2025 17:01:50 +0100
Subject: [PATCH 1/4] [X86][test] Add tests for sinking NOT to be folded into
ANDN
---
llvm/test/CodeGen/X86/andnot-sink-not.ll | 3149 ++++++++++++++++++++++
1 file changed, 3149 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/andnot-sink-not.ll
diff --git a/llvm/test/CodeGen/X86/andnot-sink-not.ll b/llvm/test/CodeGen/X86/andnot-sink-not.ll
new file mode 100644
index 0000000000000..4d6aa02c0fe72
--- /dev/null
+++ b/llvm/test/CodeGen/X86/andnot-sink-not.ll
@@ -0,0 +1,3149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s --check-prefixes=X86-NOBMI,X86
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi,+sse | FileCheck %s --check-prefixes=X86-NOBMI,X86-SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi,+sse2 | FileCheck %s --check-prefixes=X86-NOBMI,X86-SSE2
+; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86-BMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOAVX2,X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI,X64-NOAVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi,+avx2 | FileCheck %s --check-prefixes=X64,X64-BMI,X64-AVX2
+
+define i8 @and_sink_not_i8(i8 %x, i8 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i8:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB0_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notb %cl
+; X86-NOBMI-NEXT: andb %al, %cl
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: .LBB0_2: # %identity
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i8:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB0_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notb %cl
+; X86-BMI-NEXT: andb %al, %cl
+; X86-BMI-NEXT: movl %ecx, %eax
+; X86-BMI-NEXT: .LBB0_2: # %identity
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i8:
+; X64: # %bb.0:
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB0_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notb %sil
+; X64-NEXT: andb %dil, %sil
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB0_2: # %identity
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %a = xor i8 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i8 %a, %x
+ ret i8 %masked
+
+identity:
+ ret i8 %x
+}
+
+define i8 @and_sink_not_i8_swapped(i8 %x, i8 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i8_swapped:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB1_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notb %cl
+; X86-NOBMI-NEXT: andb %cl, %al
+; X86-NOBMI-NEXT: .LBB1_2: # %identity
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i8_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB1_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notb %cl
+; X86-BMI-NEXT: andb %cl, %al
+; X86-BMI-NEXT: .LBB1_2: # %identity
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i8_swapped:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB1_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notb %sil
+; X64-NEXT: andb %sil, %al
+; X64-NEXT: .LBB1_2: # %identity
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %a = xor i8 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i8 %x, %a
+ ret i8 %masked
+
+identity:
+ ret i8 %x
+}
+
+define i16 @and_sink_not_i16(i16 %x, i16 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i16:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB2_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %eax, %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: retl
+; X86-NOBMI-NEXT: .LBB2_2: # %identity
+; X86-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i16:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB2_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %eax, %ecx
+; X86-BMI-NEXT: movl %ecx, %eax
+; X86-BMI-NEXT: retl
+; X86-BMI-NEXT: .LBB2_2: # %identity
+; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i16:
+; X64: # %bb.0:
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB2_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notl %esi
+; X64-NEXT: andl %edi, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB2_2: # %identity
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %a = xor i16 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i16 %a, %x
+ ret i16 %masked
+
+identity:
+ ret i16 %x
+}
+
+define i16 @and_sink_not_i16_swapped(i16 %x, i16 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i16_swapped:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB3_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %ecx, %eax
+; X86-NOBMI-NEXT: .LBB3_2: # %identity
+; X86-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i16_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB3_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %ecx, %eax
+; X86-BMI-NEXT: .LBB3_2: # %identity
+; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i16_swapped:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB3_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notl %esi
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: .LBB3_2: # %identity
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %a = xor i16 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i16 %x, %a
+ ret i16 %masked
+
+identity:
+ ret i16 %x
+}
+
+define i32 @and_sink_not_i32(i32 %x, i32 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i32:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB4_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %eax, %ecx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: .LBB4_2: # %identity
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB4_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %eax, %ecx
+; X86-BMI-NEXT: movl %ecx, %eax
+; X86-BMI-NEXT: .LBB4_2: # %identity
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i32:
+; X64: # %bb.0:
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB4_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notl %esi
+; X64-NEXT: andl %edi, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB4_2: # %identity
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %a = xor i32 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i32 %a, %x
+ ret i32 %masked
+
+identity:
+ ret i32 %x
+}
+
+define i32 @and_sink_not_i32_swapped(i32 %x, i32 %m, i1 zeroext %cond) {
+; X86-NOBMI-LABEL: and_sink_not_i32_swapped:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB5_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %ecx, %eax
+; X86-NOBMI-NEXT: .LBB5_2: # %identity
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i32_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB5_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %ecx, %eax
+; X86-BMI-NEXT: .LBB5_2: # %identity
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i32_swapped:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB5_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notl %esi
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: .LBB5_2: # %identity
+; X64-NEXT: retq
+ %a = xor i32 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i32 %x, %a
+ ret i32 %masked
+
+identity:
+ ret i32 %x
+}
+
+define i64 @and_sink_not_i64(i64 %x, i64 %m, i1 zeroext %cond) nounwind {
+; X86-NOBMI-LABEL: and_sink_not_i64:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB6_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: notl %esi
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %eax, %ecx
+; X86-NOBMI-NEXT: andl %edx, %esi
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: movl %esi, %edx
+; X86-NOBMI-NEXT: .LBB6_2: # %identity
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i64:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB6_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %eax, %ecx
+; X86-BMI-NEXT: andl %edx, %esi
+; X86-BMI-NEXT: movl %ecx, %eax
+; X86-BMI-NEXT: movl %esi, %edx
+; X86-BMI-NEXT: .LBB6_2: # %identity
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i64:
+; X64: # %bb.0:
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB6_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notq %rsi
+; X64-NEXT: andq %rdi, %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: retq
+; X64-NEXT: .LBB6_2: # %identity
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %a = xor i64 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i64 %a, %x
+ ret i64 %masked
+
+identity:
+ ret i64 %x
+}
+
+define i64 @and_sink_not_i64_swapped(i64 %x, i64 %m, i1 zeroext %cond) nounwind {
+; X86-NOBMI-LABEL: and_sink_not_i64_swapped:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: je .LBB7_2
+; X86-NOBMI-NEXT: # %bb.1: # %mask
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: notl %esi
+; X86-NOBMI-NEXT: notl %ecx
+; X86-NOBMI-NEXT: andl %ecx, %eax
+; X86-NOBMI-NEXT: andl %esi, %edx
+; X86-NOBMI-NEXT: .LBB7_2: # %identity
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_i64_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB7_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %ecx, %eax
+; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: .LBB7_2: # %identity
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: retl
+;
+; X64-LABEL: and_sink_not_i64_swapped:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: je .LBB7_2
+; X64-NEXT: # %bb.1: # %mask
+; X64-NEXT: notq %rsi
+; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: .LBB7_2: # %identity
+; X64-NEXT: retq
+ %a = xor i64 %m, -1
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and i64 %x, %a
+ ret i64 %masked
+
+identity:
+ ret i64 %x
+}
+
+define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v8i8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB8_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: notb %dh
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %dl, %ch
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: notb %dl
+; X86-NEXT: andb %cl, %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: notb %cl
+; X86-NEXT: andb %bh, %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: notb %bh
+; X86-NEXT: andb %bl, %bh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: notb %bl
+; X86-NEXT: andb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: notb %al
+; X86-NEXT: andb %ah, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: notb %ah
+; X86-NEXT: andb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: movb %ah, (%esi)
+; X86-NEXT: movb %al, 1(%esi)
+; X86-NEXT: movb %bl, 2(%esi)
+; X86-NEXT: movb %bh, 3(%esi)
+; X86-NEXT: movb %cl, 4(%esi)
+; X86-NEXT: movb %dl, 5(%esi)
+; X86-NEXT: movb %ch, 6(%esi)
+; X86-NEXT: movb %dh, 7(%esi)
+; X86-NEXT: jmp .LBB8_3
+; X86-NEXT: .LBB8_2: # %identity
+; X86-NEXT: movb %al, (%esi)
+; X86-NEXT: movb %ah, 1(%esi)
+; X86-NEXT: movb %dh, 2(%esi)
+; X86-NEXT: movb %bl, 3(%esi)
+; X86-NEXT: movb %bh, 4(%esi)
+; X86-NEXT: movb %cl, 5(%esi)
+; X86-NEXT: movb %dl, 6(%esi)
+; X86-NEXT: movb %ch, 7(%esi)
+; X86-NEXT: .LBB8_3: # %identity
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v8i8:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB8_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: notb %dh
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %dl, %ch
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT: notb %dl
+; X86-SSE-NEXT: andb %cl, %dl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT: notb %cl
+; X86-SSE-NEXT: andb %bh, %cl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT: notb %bh
+; X86-SSE-NEXT: andb %bl, %bh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-SSE-NEXT: notb %bl
+; X86-SSE-NEXT: andb {{[0-9]+}}(%esp), %bl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-SSE-NEXT: notb %al
+; X86-SSE-NEXT: andb %ah, %al
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-SSE-NEXT: notb %ah
+; X86-SSE-NEXT: andb {{[0-9]+}}(%esp), %ah
+; X86-SSE-NEXT: movb %ah, (%esi)
+; X86-SSE-NEXT: movb %al, 1(%esi)
+; X86-SSE-NEXT: movb %bl, 2(%esi)
+; X86-SSE-NEXT: movb %bh, 3(%esi)
+; X86-SSE-NEXT: movb %cl, 4(%esi)
+; X86-SSE-NEXT: movb %dl, 5(%esi)
+; X86-SSE-NEXT: movb %ch, 6(%esi)
+; X86-SSE-NEXT: movb %dh, 7(%esi)
+; X86-SSE-NEXT: jmp .LBB8_3
+; X86-SSE-NEXT: .LBB8_2: # %identity
+; X86-SSE-NEXT: movb %al, (%esi)
+; X86-SSE-NEXT: movb %ah, 1(%esi)
+; X86-SSE-NEXT: movb %dh, 2(%esi)
+; X86-SSE-NEXT: movb %bl, 3(%esi)
+; X86-SSE-NEXT: movb %bh, 4(%esi)
+; X86-SSE-NEXT: movb %cl, 5(%esi)
+; X86-SSE-NEXT: movb %dl, 6(%esi)
+; X86-SSE-NEXT: movb %ch, 7(%esi)
+; X86-SSE-NEXT: .LBB8_3: # %identity
+; X86-SSE-NEXT: movl %esi, %eax
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v8i8:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB8_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB8_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v8i8:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB8_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: notb %dh
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %dl, %ch
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT: notb %dl
+; X86-BMI-NEXT: andb %cl, %dl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: notb %cl
+; X86-BMI-NEXT: andb %bh, %cl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT: notb %bh
+; X86-BMI-NEXT: andb %bl, %bh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bl
+; X86-BMI-NEXT: notb %bl
+; X86-BMI-NEXT: andb {{[0-9]+}}(%esp), %bl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-BMI-NEXT: notb %al
+; X86-BMI-NEXT: andb %ah, %al
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-BMI-NEXT: notb %ah
+; X86-BMI-NEXT: andb {{[0-9]+}}(%esp), %ah
+; X86-BMI-NEXT: movb %ah, (%esi)
+; X86-BMI-NEXT: movb %al, 1(%esi)
+; X86-BMI-NEXT: movb %bl, 2(%esi)
+; X86-BMI-NEXT: movb %bh, 3(%esi)
+; X86-BMI-NEXT: movb %cl, 4(%esi)
+; X86-BMI-NEXT: movb %dl, 5(%esi)
+; X86-BMI-NEXT: movb %ch, 6(%esi)
+; X86-BMI-NEXT: movb %dh, 7(%esi)
+; X86-BMI-NEXT: jmp .LBB8_3
+; X86-BMI-NEXT: .LBB8_2: # %identity
+; X86-BMI-NEXT: movb %al, (%esi)
+; X86-BMI-NEXT: movb %ah, 1(%esi)
+; X86-BMI-NEXT: movb %dh, 2(%esi)
+; X86-BMI-NEXT: movb %bl, 3(%esi)
+; X86-BMI-NEXT: movb %bh, 4(%esi)
+; X86-BMI-NEXT: movb %cl, 5(%esi)
+; X86-BMI-NEXT: movb %dl, 6(%esi)
+; X86-BMI-NEXT: movb %ch, 7(%esi)
+; X86-BMI-NEXT: .LBB8_3: # %identity
+; X86-BMI-NEXT: movl %esi, %eax
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v8i8:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB8_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
+; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB8_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v8i8:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB8_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: .LBB8_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <8 x i8> %m, splat (i8 -1)
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <8 x i8> %a, %x
+ ret <8 x i8> %masked
+
+identity:
+ ret <8 x i8> %x
+}
+
+define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v8i8_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB9_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movb %ch, %dh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: andb %ch, %ah
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %bl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %bh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: movb %dh, (%esi)
+; X86-NEXT: movb %dl, 1(%esi)
+; X86-NEXT: movb %cl, 2(%esi)
+; X86-NEXT: movb %bh, 3(%esi)
+; X86-NEXT: movb %bl, 4(%esi)
+; X86-NEXT: movb %al, 5(%esi)
+; X86-NEXT: movb %ah, 6(%esi)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 7(%esi)
+; X86-NEXT: jmp .LBB9_3
+; X86-NEXT: .LBB9_2: # %identity
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: movb %dh, (%esi)
+; X86-NEXT: movb %dl, 1(%esi)
+; X86-NEXT: movb %cl, 2(%esi)
+; X86-NEXT: movb %bh, 3(%esi)
+; X86-NEXT: movb %bl, 4(%esi)
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movb %cl, 5(%esi)
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movb %cl, 6(%esi)
+; X86-NEXT: movb %ch, 7(%esi)
+; X86-NEXT: .LBB9_3: # %identity
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v8i8_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB9_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movb %ch, %dh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-SSE-NEXT: andb %ch, %ah
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %al
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %bl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %bh
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %cl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %dl
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: movb %dh, (%esi)
+; X86-SSE-NEXT: movb %dl, 1(%esi)
+; X86-SSE-NEXT: movb %cl, 2(%esi)
+; X86-SSE-NEXT: movb %bh, 3(%esi)
+; X86-SSE-NEXT: movb %bl, 4(%esi)
+; X86-SSE-NEXT: movb %al, 5(%esi)
+; X86-SSE-NEXT: movb %ah, 6(%esi)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 7(%esi)
+; X86-SSE-NEXT: jmp .LBB9_3
+; X86-SSE-NEXT: .LBB9_2: # %identity
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: movb %dh, (%esi)
+; X86-SSE-NEXT: movb %dl, 1(%esi)
+; X86-SSE-NEXT: movb %cl, 2(%esi)
+; X86-SSE-NEXT: movb %bh, 3(%esi)
+; X86-SSE-NEXT: movb %bl, 4(%esi)
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT: movb %cl, 5(%esi)
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT: movb %cl, 6(%esi)
+; X86-SSE-NEXT: movb %ch, 7(%esi)
+; X86-SSE-NEXT: .LBB9_3: # %identity
+; X86-SSE-NEXT: movl %esi, %eax
+; X86-SSE-NEXT: addl $4, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v8i8_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB9_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB9_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v8i8_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: pushl %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB9_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movb %ch, %dh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-BMI-NEXT: andb %ch, %ah
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %al
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %bl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %bh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %cl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %dl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: movb %dh, (%esi)
+; X86-BMI-NEXT: movb %dl, 1(%esi)
+; X86-BMI-NEXT: movb %cl, 2(%esi)
+; X86-BMI-NEXT: movb %bh, 3(%esi)
+; X86-BMI-NEXT: movb %bl, 4(%esi)
+; X86-BMI-NEXT: movb %al, 5(%esi)
+; X86-BMI-NEXT: movb %ah, 6(%esi)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 7(%esi)
+; X86-BMI-NEXT: jmp .LBB9_3
+; X86-BMI-NEXT: .LBB9_2: # %identity
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movb %dh, (%esi)
+; X86-BMI-NEXT: movb %dl, 1(%esi)
+; X86-BMI-NEXT: movb %cl, 2(%esi)
+; X86-BMI-NEXT: movb %bh, 3(%esi)
+; X86-BMI-NEXT: movb %bl, 4(%esi)
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: movb %cl, 5(%esi)
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: movb %cl, 6(%esi)
+; X86-BMI-NEXT: movb %ch, 7(%esi)
+; X86-BMI-NEXT: .LBB9_3: # %identity
+; X86-BMI-NEXT: movl %esi, %eax
+; X86-BMI-NEXT: addl $4, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v8i8_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB9_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
+; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB9_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v8i8_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB9_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: .LBB9_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <8 x i8> %m, splat (i8 -1)
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <8 x i8> %x, %a
+ ret <8 x i8> %masked
+
+identity:
+ ret <8 x i8> %x
+}
+
+define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB10_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: jmp .LBB10_3
+; X86-NEXT: .LBB10_2: # %identity
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: .LBB10_3: # %identity
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v4i32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $64, %esp
+; X86-SSE-NEXT: movl 8(%ebp), %eax
+; X86-SSE-NEXT: movl 24(%ebp), %ecx
+; X86-SSE-NEXT: movl 20(%ebp), %edx
+; X86-SSE-NEXT: movl 16(%ebp), %esi
+; X86-SSE-NEXT: movzbl 44(%ebp), %ebx
+; X86-SSE-NEXT: testb %bl, %bl
+; X86-SSE-NEXT: movl 12(%ebp), %edi
+; X86-SSE-NEXT: movups 28(%ebp), %xmm0
+; X86-SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB10_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl 16(%ebp), %edi
+; X86-SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl 20(%ebp), %edi
+; X86-SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl 24(%ebp), %edi
+; X86-SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %edx, (%esp)
+; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: andps %xmm0, %xmm1
+; X86-SSE-NEXT: movaps %xmm1, (%eax)
+; X86-SSE-NEXT: jmp .LBB10_3
+; X86-SSE-NEXT: .LBB10_2: # %identity
+; X86-SSE-NEXT: movl %edi, (%eax)
+; X86-SSE-NEXT: movl %esi, 4(%eax)
+; X86-SSE-NEXT: movl %edx, 8(%eax)
+; X86-SSE-NEXT: movl %ecx, 12(%eax)
+; X86-SSE-NEXT: .LBB10_3: # %identity
+; X86-SSE-NEXT: leal -12(%ebp), %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v4i32:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB10_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB10_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v4i32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB10_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: notl %edi
+; X86-BMI-NEXT: andl %ebx, %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %esi, %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %edx, %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: notl %edx
+; X86-BMI-NEXT: andl %ecx, %edx
+; X86-BMI-NEXT: movl %edx, (%eax)
+; X86-BMI-NEXT: movl %esi, 4(%eax)
+; X86-BMI-NEXT: movl %ebx, 8(%eax)
+; X86-BMI-NEXT: movl %edi, 12(%eax)
+; X86-BMI-NEXT: jmp .LBB10_3
+; X86-BMI-NEXT: .LBB10_2: # %identity
+; X86-BMI-NEXT: movl %ecx, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %esi, 8(%eax)
+; X86-BMI-NEXT: movl %ebx, 12(%eax)
+; X86-BMI-NEXT: .LBB10_3: # %identity
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v4i32:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB10_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
+; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB10_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v4i32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB10_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: .LBB10_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <4 x i32> %m, splat (i32 -1) ; invert the mask in the entry block, before the branch
+ br i1 %cond, label %mask, label %identity ; the inverted mask is live across this branch
+
+mask: ; only block that uses %a - candidate for sinking the NOT here
+ %masked = and <4 x i32> %a, %x ; inverted-mask operand first
+ ret <4 x i32> %masked
+
+identity: ; cold path: %a is dead here
+ ret <4 x i32> %x
+}
+
+define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v4i32_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB11_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: .LBB11_2: # %identity
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v4i32_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: movl %esp, %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: andl $-16, %esp
+; X86-SSE-NEXT: subl $64, %esp
+; X86-SSE-NEXT: movl 8(%ebp), %eax
+; X86-SSE-NEXT: movl 24(%ebp), %ecx
+; X86-SSE-NEXT: movl 20(%ebp), %edx
+; X86-SSE-NEXT: movl 16(%ebp), %esi
+; X86-SSE-NEXT: movzbl 44(%ebp), %ebx
+; X86-SSE-NEXT: testb %bl, %bl
+; X86-SSE-NEXT: movl 12(%ebp), %edi
+; X86-SSE-NEXT: movups 28(%ebp), %xmm0
+; X86-SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB11_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %edi, (%esp)
+; X86-SSE-NEXT: movl 16(%ebp), %ecx
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl 20(%ebp), %ecx
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl 24(%ebp), %ecx
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: andps %xmm0, %xmm1
+; X86-SSE-NEXT: movaps %xmm1, (%eax)
+; X86-SSE-NEXT: jmp .LBB11_3
+; X86-SSE-NEXT: .LBB11_2: # %identity
+; X86-SSE-NEXT: movl %edi, (%eax)
+; X86-SSE-NEXT: movl %esi, 4(%eax)
+; X86-SSE-NEXT: movl %edx, 8(%eax)
+; X86-SSE-NEXT: movl %ecx, 12(%eax)
+; X86-SSE-NEXT: .LBB11_3: # %identity
+; X86-SSE-NEXT: leal -12(%ebp), %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v4i32_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB11_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB11_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v4i32_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB11_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %edi
+; X86-BMI-NEXT: .LBB11_2: # %identity
+; X86-BMI-NEXT: movl %edi, (%eax)
+; X86-BMI-NEXT: movl %esi, 4(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v4i32_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB11_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
+; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB11_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v4i32_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB11_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: .LBB11_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <4 x i32> %m, splat (i32 -1) ; invert the mask before the branch
+ br i1 %cond, label %mask, label %identity ; inverted mask is live across the branch
+
+mask: ; only user of %a
+ %masked = and <4 x i32> %x, %a ; same as the non-swapped test but with the NOT as the second operand
+ ret <4 x i32> %masked
+
+identity: ; %a unused on this path
+ ret <4 x i32> %x
+}
+
+define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v4i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: je .LBB12_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %ecx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl %ebp, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: notl %ebp
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, (%edx)
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl %edi, 8(%edx)
+; X86-NEXT: movl %ebp, 12(%edx)
+; X86-NEXT: movl %ecx, 16(%edx)
+; X86-NEXT: movl %esi, 20(%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: jmp .LBB12_3
+; X86-NEXT: .LBB12_2: # %identity
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %ebp, 16(%eax)
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl %edx, 24(%eax)
+; X86-NEXT: movl %esi, 28(%eax)
+; X86-NEXT: .LBB12_3: # %identity
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v4i64:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $8, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: je .LBB12_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: notl %edi
+; X86-SSE-NEXT: andl %esi, %edi
+; X86-SSE-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %edx, %esi
+; X86-SSE-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %ecx, %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: notl %ecx
+; X86-SSE-NEXT: andl %ebp, %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: notl %ebp
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: notl %edi
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: movl %eax, %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: notl %eax
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: notl %ebx
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl %ebx, (%edx)
+; X86-SSE-NEXT: movl %eax, 4(%edx)
+; X86-SSE-NEXT: movl %edi, 8(%edx)
+; X86-SSE-NEXT: movl %ebp, 12(%edx)
+; X86-SSE-NEXT: movl %ecx, 16(%edx)
+; X86-SSE-NEXT: movl %esi, 20(%edx)
+; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SSE-NEXT: movl %eax, 24(%edx)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE-NEXT: movl %eax, 28(%edx)
+; X86-SSE-NEXT: movl %edx, %eax
+; X86-SSE-NEXT: jmp .LBB12_3
+; X86-SSE-NEXT: .LBB12_2: # %identity
+; X86-SSE-NEXT: movl %ebx, (%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl %ebx, 4(%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl %ebx, 8(%eax)
+; X86-SSE-NEXT: movl %edi, 12(%eax)
+; X86-SSE-NEXT: movl %ebp, 16(%eax)
+; X86-SSE-NEXT: movl %ecx, 20(%eax)
+; X86-SSE-NEXT: movl %edx, 24(%eax)
+; X86-SSE-NEXT: movl %esi, 28(%eax)
+; X86-SSE-NEXT: .LBB12_3: # %identity
+; X86-SSE-NEXT: addl $8, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v4i64:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: cmpb $0, 24(%ebp)
+; X86-SSE2-NEXT: je .LBB12_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor 8(%ebp), %xmm3
+; X86-SSE2-NEXT: pand %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: .LBB12_2: # %identity
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v4i64:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: subl $8, %esp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: je .LBB12_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: notl %edi
+; X86-BMI-NEXT: andl %esi, %edi
+; X86-BMI-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %edx, %esi
+; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %ecx, %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %ebp, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: notl %ebp
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: notl %edi
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: notl %eax
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl %ebx, (%edx)
+; X86-BMI-NEXT: movl %eax, 4(%edx)
+; X86-BMI-NEXT: movl %edi, 8(%edx)
+; X86-BMI-NEXT: movl %ebp, 12(%edx)
+; X86-BMI-NEXT: movl %ecx, 16(%edx)
+; X86-BMI-NEXT: movl %esi, 20(%edx)
+; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-BMI-NEXT: movl %eax, 24(%edx)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-BMI-NEXT: movl %eax, 28(%edx)
+; X86-BMI-NEXT: movl %edx, %eax
+; X86-BMI-NEXT: jmp .LBB12_3
+; X86-BMI-NEXT: .LBB12_2: # %identity
+; X86-BMI-NEXT: movl %ebx, (%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl %ebx, 4(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl %ebx, 8(%eax)
+; X86-BMI-NEXT: movl %edi, 12(%eax)
+; X86-BMI-NEXT: movl %ebp, 16(%eax)
+; X86-BMI-NEXT: movl %ecx, 20(%eax)
+; X86-BMI-NEXT: movl %edx, 24(%eax)
+; X86-BMI-NEXT: movl %esi, 28(%eax)
+; X86-BMI-NEXT: .LBB12_3: # %identity
+; X86-BMI-NEXT: addl $8, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v4i64:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB12_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm4, %xmm4
+; X64-NOAVX2-NEXT: pxor %xmm4, %xmm3
+; X64-NOAVX2-NEXT: pxor %xmm4, %xmm2
+; X64-NOAVX2-NEXT: pand %xmm0, %xmm2
+; X64-NOAVX2-NEXT: pand %xmm1, %xmm3
+; X64-NOAVX2-NEXT: movdqa %xmm2, %xmm0
+; X64-NOAVX2-NEXT: movdqa %xmm3, %xmm1
+; X64-NOAVX2-NEXT: .LBB12_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v4i64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB12_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: .LBB12_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <4 x i64> %m, splat (i64 -1) ; 256-bit variant: invert the mask before the branch
+ br i1 %cond, label %mask, label %identity ; inverted mask is live across the branch
+
+mask: ; only user of %a
+ %masked = and <4 x i64> %a, %x ; inverted-mask operand first
+ ret <4 x i64> %masked
+
+identity: ; %a unused on this path
+ ret <4 x i64> %x
+}
+
+define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_v4i64_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB13_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl %eax, (%ebx)
+; X86-NEXT: movl %edx, 4(%ebx)
+; X86-NEXT: movl %ecx, 8(%ebx)
+; X86-NEXT: movl %ebp, 12(%ebx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%ebx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%ebx)
+; X86-NEXT: movl %edi, 24(%ebx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%ebx)
+; X86-NEXT: jmp .LBB13_3
+; X86-NEXT: .LBB13_2: # %identity
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, (%ebx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, 4(%ebx)
+; X86-NEXT: movl %ecx, 8(%ebx)
+; X86-NEXT: movl %ebp, 12(%ebx)
+; X86-NEXT: movl %eax, 16(%ebx)
+; X86-NEXT: movl %edi, 20(%ebx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 24(%ebx)
+; X86-NEXT: movl %esi, 28(%ebx)
+; X86-NEXT: .LBB13_3: # %identity
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_v4i64_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $12, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB13_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl %esi, %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %edx
+; X86-SSE-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: movl %edi, %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: andl %esi, %edi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %edx
+; X86-SSE-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %eax
+; X86-SSE-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %ebp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: andl %esi, %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, %eax
+; X86-SSE-NEXT: movl %eax, (%ebx)
+; X86-SSE-NEXT: movl %edx, 4(%ebx)
+; X86-SSE-NEXT: movl %ecx, 8(%ebx)
+; X86-SSE-NEXT: movl %ebp, 12(%ebx)
+; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SSE-NEXT: movl %eax, 16(%ebx)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE-NEXT: movl %eax, 20(%ebx)
+; X86-SSE-NEXT: movl %edi, 24(%ebx)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE-NEXT: movl %eax, 28(%ebx)
+; X86-SSE-NEXT: jmp .LBB13_3
+; X86-SSE-NEXT: .LBB13_2: # %identity
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl %edx, (%ebx)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl %edx, 4(%ebx)
+; X86-SSE-NEXT: movl %ecx, 8(%ebx)
+; X86-SSE-NEXT: movl %ebp, 12(%ebx)
+; X86-SSE-NEXT: movl %eax, 16(%ebx)
+; X86-SSE-NEXT: movl %edi, 20(%ebx)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, 24(%ebx)
+; X86-SSE-NEXT: movl %esi, 28(%ebx)
+; X86-SSE-NEXT: .LBB13_3: # %identity
+; X86-SSE-NEXT: movl %ebx, %eax
+; X86-SSE-NEXT: addl $12, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_v4i64_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: cmpb $0, 24(%ebp)
+; X86-SSE2-NEXT: je .LBB13_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor 8(%ebp), %xmm3
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: .LBB13_2: # %identity
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_v4i64_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: subl $12, %esp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB13_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl %esi, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: movl %edi, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: andl %esi, %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %eax
+; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, %eax
+; X86-BMI-NEXT: movl %eax, (%ebx)
+; X86-BMI-NEXT: movl %edx, 4(%ebx)
+; X86-BMI-NEXT: movl %ecx, 8(%ebx)
+; X86-BMI-NEXT: movl %ebp, 12(%ebx)
+; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-BMI-NEXT: movl %eax, 16(%ebx)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-BMI-NEXT: movl %eax, 20(%ebx)
+; X86-BMI-NEXT: movl %edi, 24(%ebx)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-BMI-NEXT: movl %eax, 28(%ebx)
+; X86-BMI-NEXT: jmp .LBB13_3
+; X86-BMI-NEXT: .LBB13_2: # %identity
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, (%ebx)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 4(%ebx)
+; X86-BMI-NEXT: movl %ecx, 8(%ebx)
+; X86-BMI-NEXT: movl %ebp, 12(%ebx)
+; X86-BMI-NEXT: movl %eax, 16(%ebx)
+; X86-BMI-NEXT: movl %edi, 20(%ebx)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 24(%ebx)
+; X86-BMI-NEXT: movl %esi, 28(%ebx)
+; X86-BMI-NEXT: .LBB13_3: # %identity
+; X86-BMI-NEXT: movl %ebx, %eax
+; X86-BMI-NEXT: addl $12, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_v4i64_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %edi, %edi
+; X64-NOAVX2-NEXT: je .LBB13_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: pcmpeqd %xmm4, %xmm4
+; X64-NOAVX2-NEXT: pxor %xmm4, %xmm3
+; X64-NOAVX2-NEXT: pxor %xmm4, %xmm2
+; X64-NOAVX2-NEXT: pand %xmm2, %xmm0
+; X64-NOAVX2-NEXT: pand %xmm3, %xmm1
+; X64-NOAVX2-NEXT: .LBB13_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_v4i64_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %edi, %edi
+; X64-AVX2-NEXT: je .LBB13_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: .LBB13_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor <4 x i64> %m, splat (i64 -1) ; 256-bit variant: invert the mask before the branch
+ br i1 %cond, label %mask, label %identity ; inverted mask is live across the branch
+
+mask: ; only user of %a
+ %masked = and <4 x i64> %x, %a ; same as the non-swapped test but with the NOT as the second operand
+ ret <4 x i64> %masked
+
+identity: ; %a unused on this path
+ ret <4 x i64> %x
+}
+
+define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v8i8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB14_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movb %dl, %ch
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: notb %dl
+; X86-NEXT: andb %dl, %ch
+; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: andb %dl, %ch
+; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: andb %dl, %ch
+; X86-NEXT: andb %dl, %dh
+; X86-NEXT: andb %dl, %bl
+; X86-NEXT: andb %dl, %bh
+; X86-NEXT: andb %dl, %cl
+; X86-NEXT: andb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: movb %dl, (%eax)
+; X86-NEXT: movb %cl, 1(%eax)
+; X86-NEXT: movb %bh, 2(%eax)
+; X86-NEXT: movb %bl, 3(%eax)
+; X86-NEXT: movb %dh, 4(%eax)
+; X86-NEXT: movb %ch, 5(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 6(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 7(%eax)
+; X86-NEXT: jmp .LBB14_3
+; X86-NEXT: .LBB14_2: # %identity
+; X86-NEXT: movb %ch, (%eax)
+; X86-NEXT: movb %cl, 1(%eax)
+; X86-NEXT: movb %bh, 2(%eax)
+; X86-NEXT: movb %bl, 3(%eax)
+; X86-NEXT: movb %dh, 4(%eax)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb %cl, 5(%eax)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb %cl, 6(%eax)
+; X86-NEXT: movb %dl, 7(%eax)
+; X86-NEXT: .LBB14_3: # %identity
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v8i8:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB14_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movb %dl, %ch
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT: notb %dl
+; X86-SSE-NEXT: andb %dl, %ch
+; X86-SSE-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: andb %dl, %ch
+; X86-SSE-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: andb %dl, %ch
+; X86-SSE-NEXT: andb %dl, %dh
+; X86-SSE-NEXT: andb %dl, %bl
+; X86-SSE-NEXT: andb %dl, %bh
+; X86-SSE-NEXT: andb %dl, %cl
+; X86-SSE-NEXT: andb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT: movb %dl, (%eax)
+; X86-SSE-NEXT: movb %cl, 1(%eax)
+; X86-SSE-NEXT: movb %bh, 2(%eax)
+; X86-SSE-NEXT: movb %bl, 3(%eax)
+; X86-SSE-NEXT: movb %dh, 4(%eax)
+; X86-SSE-NEXT: movb %ch, 5(%eax)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 6(%eax)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 7(%eax)
+; X86-SSE-NEXT: jmp .LBB14_3
+; X86-SSE-NEXT: .LBB14_2: # %identity
+; X86-SSE-NEXT: movb %ch, (%eax)
+; X86-SSE-NEXT: movb %cl, 1(%eax)
+; X86-SSE-NEXT: movb %bh, 2(%eax)
+; X86-SSE-NEXT: movb %bl, 3(%eax)
+; X86-SSE-NEXT: movb %dh, 4(%eax)
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb %cl, 5(%eax)
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb %cl, 6(%eax)
+; X86-SSE-NEXT: movb %dl, 7(%eax)
+; X86-SSE-NEXT: .LBB14_3: # %identity
+; X86-SSE-NEXT: addl $4, %esp
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v8i8:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB14_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: notb %al
+; X86-SSE2-NEXT: movzbl %al, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB14_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v8i8:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB14_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movb %dl, %ch
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT: notb %dl
+; X86-BMI-NEXT: andb %dl, %ch
+; X86-BMI-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: andb %dl, %ch
+; X86-BMI-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: andb %dl, %ch
+; X86-BMI-NEXT: andb %dl, %dh
+; X86-BMI-NEXT: andb %dl, %bl
+; X86-BMI-NEXT: andb %dl, %bh
+; X86-BMI-NEXT: andb %dl, %cl
+; X86-BMI-NEXT: andb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT: movb %dl, (%eax)
+; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %bh, 2(%eax)
+; X86-BMI-NEXT: movb %bl, 3(%eax)
+; X86-BMI-NEXT: movb %dh, 4(%eax)
+; X86-BMI-NEXT: movb %ch, 5(%eax)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 6(%eax)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 7(%eax)
+; X86-BMI-NEXT: jmp .LBB14_3
+; X86-BMI-NEXT: .LBB14_2: # %identity
+; X86-BMI-NEXT: movb %ch, (%eax)
+; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %bh, 2(%eax)
+; X86-BMI-NEXT: movb %bl, 3(%eax)
+; X86-BMI-NEXT: movb %dh, 4(%eax)
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb %cl, 5(%eax)
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb %cl, 6(%eax)
+; X86-BMI-NEXT: movb %dl, 7(%eax)
+; X86-BMI-NEXT: .LBB14_3: # %identity
+; X86-BMI-NEXT: addl $4, %esp
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v8i8:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB14_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notb %dil
+; X64-NOAVX2-NEXT: movzbl %dil, %eax
+; X64-NOAVX2-NEXT: movd %eax, %xmm1
+; X64-NOAVX2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NOAVX2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB14_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v8i8:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB14_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notb %dil
+; X64-AVX2-NEXT: vmovd %edi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: .LBB14_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i8 %m, -1
+ %head = insertelement <8 x i8> poison, i8 %a, i8 0
+ %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <8 x i8> %splat, %x
+ ret <8 x i8> %masked
+
+identity:
+ ret <8 x i8> %x
+}
+
+define <8 x i8> @and_sink_not_splat_v8i8_swapped(<8 x i8> %x, i8 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v8i8_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB15_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: notb %ch
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: andb %ch, %dh
+; X86-NEXT: andb %ch, %bl
+; X86-NEXT: andb %ch, %bh
+; X86-NEXT: andb %ch, %cl
+; X86-NEXT: andb %ch, %dl
+; X86-NEXT: movb %dl, (%eax)
+; X86-NEXT: movb %cl, 1(%eax)
+; X86-NEXT: movb %bh, 2(%eax)
+; X86-NEXT: movb %bl, 3(%eax)
+; X86-NEXT: movb %dh, 4(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 5(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 6(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 7(%eax)
+; X86-NEXT: jmp .LBB15_3
+; X86-NEXT: .LBB15_2: # %identity
+; X86-NEXT: movb %dl, (%eax)
+; X86-NEXT: movb %cl, 1(%eax)
+; X86-NEXT: movb %bh, 2(%eax)
+; X86-NEXT: movb %bl, 3(%eax)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb %cl, 4(%eax)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb %cl, 5(%eax)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb %cl, 6(%eax)
+; X86-NEXT: movb %dh, 7(%eax)
+; X86-NEXT: .LBB15_3: # %identity
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v8i8_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB15_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT: notb %ch
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE-NEXT: andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-SSE-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT: andb %ch, %dh
+; X86-SSE-NEXT: andb %ch, %bl
+; X86-SSE-NEXT: andb %ch, %bh
+; X86-SSE-NEXT: andb %ch, %cl
+; X86-SSE-NEXT: andb %ch, %dl
+; X86-SSE-NEXT: movb %dl, (%eax)
+; X86-SSE-NEXT: movb %cl, 1(%eax)
+; X86-SSE-NEXT: movb %bh, 2(%eax)
+; X86-SSE-NEXT: movb %bl, 3(%eax)
+; X86-SSE-NEXT: movb %dh, 4(%eax)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 5(%eax)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 6(%eax)
+; X86-SSE-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT: movb %cl, 7(%eax)
+; X86-SSE-NEXT: jmp .LBB15_3
+; X86-SSE-NEXT: .LBB15_2: # %identity
+; X86-SSE-NEXT: movb %dl, (%eax)
+; X86-SSE-NEXT: movb %cl, 1(%eax)
+; X86-SSE-NEXT: movb %bh, 2(%eax)
+; X86-SSE-NEXT: movb %bl, 3(%eax)
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb %cl, 4(%eax)
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb %cl, 5(%eax)
+; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movb %cl, 6(%eax)
+; X86-SSE-NEXT: movb %dh, 7(%eax)
+; X86-SSE-NEXT: .LBB15_3: # %identity
+; X86-SSE-NEXT: addl $4, %esp
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v8i8_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB15_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: notb %al
+; X86-SSE2-NEXT: movzbl %al, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB15_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v8i8_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB15_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: notb %ch
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-BMI-NEXT: andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: andb %ch, %dh
+; X86-BMI-NEXT: andb %ch, %bl
+; X86-BMI-NEXT: andb %ch, %bh
+; X86-BMI-NEXT: andb %ch, %cl
+; X86-BMI-NEXT: andb %ch, %dl
+; X86-BMI-NEXT: movb %dl, (%eax)
+; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %bh, 2(%eax)
+; X86-BMI-NEXT: movb %bl, 3(%eax)
+; X86-BMI-NEXT: movb %dh, 4(%eax)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 5(%eax)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 6(%eax)
+; X86-BMI-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT: movb %cl, 7(%eax)
+; X86-BMI-NEXT: jmp .LBB15_3
+; X86-BMI-NEXT: .LBB15_2: # %identity
+; X86-BMI-NEXT: movb %dl, (%eax)
+; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %bh, 2(%eax)
+; X86-BMI-NEXT: movb %bl, 3(%eax)
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb %cl, 4(%eax)
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb %cl, 5(%eax)
+; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movb %cl, 6(%eax)
+; X86-BMI-NEXT: movb %dh, 7(%eax)
+; X86-BMI-NEXT: .LBB15_3: # %identity
+; X86-BMI-NEXT: addl $4, %esp
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v8i8_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB15_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notb %dil
+; X64-NOAVX2-NEXT: movzbl %dil, %eax
+; X64-NOAVX2-NEXT: movd %eax, %xmm1
+; X64-NOAVX2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NOAVX2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB15_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v8i8_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB15_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notb %dil
+; X64-AVX2-NEXT: vmovd %edi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: .LBB15_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i8 %m, -1
+ %head = insertelement <8 x i8> poison, i8 %a, i8 0
+ %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <8 x i8> %x, %splat
+ ret <8 x i8> %masked
+
+identity:
+ ret <8 x i8> %x
+}
+
+define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v4i32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB16_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: jmp .LBB16_3
+; X86-NEXT: .LBB16_2: # %identity
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: .LBB16_3: # %identity
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v4i32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $32, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB16_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: notl %ebx
+; X86-SSE-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, (%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: andps %xmm0, %xmm1
+; X86-SSE-NEXT: movaps %xmm1, (%eax)
+; X86-SSE-NEXT: jmp .LBB16_3
+; X86-SSE-NEXT: .LBB16_2: # %identity
+; X86-SSE-NEXT: movl %edi, (%eax)
+; X86-SSE-NEXT: movl %esi, 4(%eax)
+; X86-SSE-NEXT: movl %edx, 8(%eax)
+; X86-SSE-NEXT: movl %ecx, 12(%eax)
+; X86-SSE-NEXT: .LBB16_3: # %identity
+; X86-SSE-NEXT: addl $32, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v4i32:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB16_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: notl %eax
+; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB16_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v4i32:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB16_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %ecx
+; X86-BMI-NEXT: andl %ebx, %edx
+; X86-BMI-NEXT: andl %ebx, %esi
+; X86-BMI-NEXT: andl %edi, %ebx
+; X86-BMI-NEXT: movl %ebx, (%eax)
+; X86-BMI-NEXT: jmp .LBB16_3
+; X86-BMI-NEXT: .LBB16_2: # %identity
+; X86-BMI-NEXT: movl %edi, (%eax)
+; X86-BMI-NEXT: .LBB16_3: # %identity
+; X86-BMI-NEXT: movl %esi, 4(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v4i32:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB16_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notl %edi
+; X64-NOAVX2-NEXT: movd %edi, %xmm1
+; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB16_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v4i32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB16_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notl %edi
+; X64-AVX2-NEXT: vmovd %edi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: .LBB16_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i32 %m, -1
+ %head = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <4 x i32> %splat, %x
+ ret <4 x i32> %masked
+
+identity:
+ ret <4 x i32> %x
+}
+
+define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v4i32_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB17_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: .LBB17_2: # %identity
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v4i32_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $32, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB17_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: notl %ebx
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %edi, (%esp)
+; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE-NEXT: andps %xmm0, %xmm1
+; X86-SSE-NEXT: movaps %xmm1, (%eax)
+; X86-SSE-NEXT: jmp .LBB17_3
+; X86-SSE-NEXT: .LBB17_2: # %identity
+; X86-SSE-NEXT: movl %edi, (%eax)
+; X86-SSE-NEXT: movl %esi, 4(%eax)
+; X86-SSE-NEXT: movl %edx, 8(%eax)
+; X86-SSE-NEXT: movl %ecx, 12(%eax)
+; X86-SSE-NEXT: .LBB17_3: # %identity
+; X86-SSE-NEXT: addl $32, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v4i32_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB17_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: notl %eax
+; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: .LBB17_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v4i32_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB17_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: notl %ebx
+; X86-BMI-NEXT: andl %ebx, %ecx
+; X86-BMI-NEXT: andl %ebx, %edx
+; X86-BMI-NEXT: andl %ebx, %esi
+; X86-BMI-NEXT: andl %ebx, %edi
+; X86-BMI-NEXT: .LBB17_2: # %identity
+; X86-BMI-NEXT: movl %edi, (%eax)
+; X86-BMI-NEXT: movl %esi, 4(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v4i32_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB17_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notl %edi
+; X64-NOAVX2-NEXT: movd %edi, %xmm1
+; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: .LBB17_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v4i32_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB17_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notl %edi
+; X64-AVX2-NEXT: vmovd %edi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: .LBB17_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i32 %m, -1
+ %head = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <4 x i32> %x, %splat
+ ret <4 x i32> %masked
+
+identity:
+ ret <4 x i32> %x
+}
+
+define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v4i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB18_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %esi
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %edx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: andl %edx, %edi
+; X86-NEXT: andl %edx, %ebp
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edi, 16(%eax)
+; X86-NEXT: movl %esi, 20(%eax)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: jmp .LBB18_3
+; X86-NEXT: .LBB18_2: # %identity
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edi, 16(%eax)
+; X86-NEXT: movl %esi, 20(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %edx, 28(%eax)
+; X86-NEXT: .LBB18_3: # %identity
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v4i64:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $8, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: je .LBB18_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: notl %ecx
+; X86-SSE-NEXT: andl %ecx, %edx
+; X86-SSE-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: andl %ecx, %esi
+; X86-SSE-NEXT: andl %ecx, %ebx
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: notl %edx
+; X86-SSE-NEXT: andl %edx, (%esp) # 4-byte Folded Spill
+; X86-SSE-NEXT: andl %edx, %edi
+; X86-SSE-NEXT: andl %edx, %ebp
+; X86-SSE-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl %edx, (%eax)
+; X86-SSE-NEXT: movl %ecx, 4(%eax)
+; X86-SSE-NEXT: movl %ebp, 8(%eax)
+; X86-SSE-NEXT: movl %ebx, 12(%eax)
+; X86-SSE-NEXT: movl %edi, 16(%eax)
+; X86-SSE-NEXT: movl %esi, 20(%eax)
+; X86-SSE-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE-NEXT: movl %ecx, 24(%eax)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE-NEXT: movl %ecx, 28(%eax)
+; X86-SSE-NEXT: jmp .LBB18_3
+; X86-SSE-NEXT: .LBB18_2: # %identity
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, (%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, 4(%eax)
+; X86-SSE-NEXT: movl %ebp, 8(%eax)
+; X86-SSE-NEXT: movl %ebx, 12(%eax)
+; X86-SSE-NEXT: movl %edi, 16(%eax)
+; X86-SSE-NEXT: movl %esi, 20(%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, 24(%eax)
+; X86-SSE-NEXT: movl %edx, 28(%eax)
+; X86-SSE-NEXT: .LBB18_3: # %identity
+; X86-SSE-NEXT: addl $8, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v4i64:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB18_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-SSE2-NEXT: pxor %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: .LBB18_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v4i64:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: subl $8, %esp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: je .LBB18_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: notl %ecx
+; X86-BMI-NEXT: andl %ecx, %edx
+; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: andl %ecx, %esi
+; X86-BMI-NEXT: andl %ecx, %ebx
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: notl %edx
+; X86-BMI-NEXT: andl %edx, (%esp) # 4-byte Folded Spill
+; X86-BMI-NEXT: andl %edx, %edi
+; X86-BMI-NEXT: andl %edx, %ebp
+; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, (%eax)
+; X86-BMI-NEXT: movl %ecx, 4(%eax)
+; X86-BMI-NEXT: movl %ebp, 8(%eax)
+; X86-BMI-NEXT: movl %ebx, 12(%eax)
+; X86-BMI-NEXT: movl %edi, 16(%eax)
+; X86-BMI-NEXT: movl %esi, 20(%eax)
+; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
+; X86-BMI-NEXT: jmp .LBB18_3
+; X86-BMI-NEXT: .LBB18_2: # %identity
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, (%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 4(%eax)
+; X86-BMI-NEXT: movl %ebp, 8(%eax)
+; X86-BMI-NEXT: movl %ebx, 12(%eax)
+; X86-BMI-NEXT: movl %edi, 16(%eax)
+; X86-BMI-NEXT: movl %esi, 20(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl %edx, 28(%eax)
+; X86-BMI-NEXT: .LBB18_3: # %identity
+; X86-BMI-NEXT: addl $8, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v4i64:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB18_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notq %rdi
+; X64-NOAVX2-NEXT: movq %rdi, %xmm2
+; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X64-NOAVX2-NEXT: pand %xmm2, %xmm0
+; X64-NOAVX2-NEXT: pand %xmm2, %xmm1
+; X64-NOAVX2-NEXT: .LBB18_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v4i64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB18_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notq %rdi
+; X64-AVX2-NEXT: vmovq %rdi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: .LBB18_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i64 %m, -1
+ %head = insertelement <4 x i64> poison, i64 %a, i64 0
+ %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <4 x i64> %splat, %x
+ ret <4 x i64> %masked
+
+identity:
+ ret <4 x i64> %x
+}
+
+define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zeroext %cond) nounwind {
+; X86-LABEL: and_sink_not_splat_v4i64_swapped:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: je .LBB19_2
+; X86-NEXT: # %bb.1: # %mask
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, (%esp) # 4-byte Folded Spill
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: andl %esi, %ebp
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ebx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl %edi, 24(%eax)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: jmp .LBB19_3
+; X86-NEXT: .LBB19_2: # %identity
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl %edi, 20(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %esi, 28(%eax)
+; X86-NEXT: .LBB19_3: # %identity
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X86-SSE-LABEL: and_sink_not_splat_v4i64_swapped:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %ebp
+; X86-SSE-NEXT: pushl %ebx
+; X86-SSE-NEXT: pushl %edi
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: subl $12, %esp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: je .LBB19_2
+; X86-SSE-NEXT: # %bb.1: # %mask
+; X86-SSE-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: andl %esi, (%esp) # 4-byte Folded Spill
+; X86-SSE-NEXT: andl %esi, %edi
+; X86-SSE-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: andl %esi, %ebx
+; X86-SSE-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT: andl %esi, %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT: notl %esi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT: andl %esi, %edi
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT: andl %esi, %ebx
+; X86-SSE-NEXT: andl %esi, %ebp
+; X86-SSE-NEXT: andl %esi, %edx
+; X86-SSE-NEXT: movl %edx, (%eax)
+; X86-SSE-NEXT: movl %ecx, 4(%eax)
+; X86-SSE-NEXT: movl %ebp, 8(%eax)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE-NEXT: movl %ecx, 12(%eax)
+; X86-SSE-NEXT: movl %ebx, 16(%eax)
+; X86-SSE-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE-NEXT: movl %ecx, 20(%eax)
+; X86-SSE-NEXT: movl %edi, 24(%eax)
+; X86-SSE-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE-NEXT: movl %ecx, 28(%eax)
+; X86-SSE-NEXT: jmp .LBB19_3
+; X86-SSE-NEXT: .LBB19_2: # %identity
+; X86-SSE-NEXT: movl %edx, (%eax)
+; X86-SSE-NEXT: movl %ecx, 4(%eax)
+; X86-SSE-NEXT: movl %ebp, 8(%eax)
+; X86-SSE-NEXT: movl %ebx, 12(%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, 16(%eax)
+; X86-SSE-NEXT: movl %edi, 20(%eax)
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl %ecx, 24(%eax)
+; X86-SSE-NEXT: movl %esi, 28(%eax)
+; X86-SSE-NEXT: .LBB19_3: # %identity
+; X86-SSE-NEXT: addl $12, %esp
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: popl %edi
+; X86-SSE-NEXT: popl %ebx
+; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: retl $4
+;
+; X86-SSE2-LABEL: and_sink_not_splat_v4i64_swapped:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: je .LBB19_2
+; X86-SSE2-NEXT: # %bb.1: # %mask
+; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: .LBB19_2: # %identity
+; X86-SSE2-NEXT: retl
+;
+; X86-BMI-LABEL: and_sink_not_splat_v4i64_swapped:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: subl $12, %esp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: je .LBB19_2
+; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: andl %esi, (%esp) # 4-byte Folded Spill
+; X86-BMI-NEXT: andl %esi, %edi
+; X86-BMI-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: andl %esi, %ebx
+; X86-BMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: andl %esi, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: notl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: andl %esi, %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: andl %esi, %ebx
+; X86-BMI-NEXT: andl %esi, %ebp
+; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: movl %edx, (%eax)
+; X86-BMI-NEXT: movl %ecx, 4(%eax)
+; X86-BMI-NEXT: movl %ebp, 8(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %ebx, 16(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 20(%eax)
+; X86-BMI-NEXT: movl %edi, 24(%eax)
+; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
+; X86-BMI-NEXT: jmp .LBB19_3
+; X86-BMI-NEXT: .LBB19_2: # %identity
+; X86-BMI-NEXT: movl %edx, (%eax)
+; X86-BMI-NEXT: movl %ecx, 4(%eax)
+; X86-BMI-NEXT: movl %ebp, 8(%eax)
+; X86-BMI-NEXT: movl %ebx, 12(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 16(%eax)
+; X86-BMI-NEXT: movl %edi, 20(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl %esi, 28(%eax)
+; X86-BMI-NEXT: .LBB19_3: # %identity
+; X86-BMI-NEXT: addl $12, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl $4
+;
+; X64-NOAVX2-LABEL: and_sink_not_splat_v4i64_swapped:
+; X64-NOAVX2: # %bb.0:
+; X64-NOAVX2-NEXT: testl %esi, %esi
+; X64-NOAVX2-NEXT: je .LBB19_2
+; X64-NOAVX2-NEXT: # %bb.1: # %mask
+; X64-NOAVX2-NEXT: notq %rdi
+; X64-NOAVX2-NEXT: movq %rdi, %xmm2
+; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X64-NOAVX2-NEXT: pand %xmm2, %xmm0
+; X64-NOAVX2-NEXT: pand %xmm2, %xmm1
+; X64-NOAVX2-NEXT: .LBB19_2: # %identity
+; X64-NOAVX2-NEXT: retq
+;
+; X64-AVX2-LABEL: and_sink_not_splat_v4i64_swapped:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: testl %esi, %esi
+; X64-AVX2-NEXT: je .LBB19_2
+; X64-AVX2-NEXT: # %bb.1: # %mask
+; X64-AVX2-NEXT: notq %rdi
+; X64-AVX2-NEXT: vmovq %rdi, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: .LBB19_2: # %identity
+; X64-AVX2-NEXT: retq
+ %a = xor i64 %m, -1
+ %head = insertelement <4 x i64> poison, i64 %a, i64 0
+ %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
+ br i1 %cond, label %mask, label %identity
+
+mask:
+ %masked = and <4 x i64> %x, %splat
+ ret <4 x i64> %masked
+
+identity:
+ ret <4 x i64> %x
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-BMI: {{.*}}
+; X64-NOBMI: {{.*}}
>From 5139a905974d2a5a649aaaf74023540309d6292a Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Mon, 15 Dec 2025 17:02:03 +0100
Subject: [PATCH 2/4] [X86] Sink NOT to be folded into ANDN
Undoes a negation being hoisted out of a loop, so that it can be folded
into an inverted bitwise operation in the loop.
Implements #108840 on X86
---
.../lib/Target/X86/X86TargetTransformInfo.cpp | 22 +
llvm/test/CodeGen/X86/andnot-sink-not.ll | 407 +++++++++---------
2 files changed, 235 insertions(+), 194 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9fb97918cb71a..b90232b25088b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -7198,6 +7198,28 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
using namespace llvm::PatternMatch;
+ if (I->getOpcode() == Instruction::And &&
+ (I->getType()->isVectorTy() ? ST->hasSSE2() : ST->hasBMI())) {
+ for (auto &Op : I->operands()) {
+ // (and X, (not Y)) -> (andn X, Y)
+ if (match(Op.get(), m_Not(m_Value()))) {
+ Ops.push_back(&Op);
+ return true;
+ }
+ // (and X, (splat (not Y))) -> (andn X, (splat Y))
+ if (match(Op.get(),
+ m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()), m_ZeroInt()),
+ m_Value(), m_ZeroMask()))) {
+ Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
+ Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
+ Ops.push_back(&Not);
+ Ops.push_back(&InsertElt);
+ Ops.push_back(&Op);
+ return true;
+ }
+ }
+ }
+
FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
if (!VTy)
return false;
diff --git a/llvm/test/CodeGen/X86/andnot-sink-not.ll b/llvm/test/CodeGen/X86/andnot-sink-not.ll
index 4d6aa02c0fe72..8463b8ae4e750 100644
--- a/llvm/test/CodeGen/X86/andnot-sink-not.ll
+++ b/llvm/test/CodeGen/X86/andnot-sink-not.ll
@@ -126,27 +126,36 @@ define i16 @and_sink_not_i16(i16 %x, i16 %m, i1 zeroext %cond) {
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB2_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %eax, %ecx
-; X86-BMI-NEXT: movl %ecx, %eax
-; X86-BMI-NEXT: retl
+; X86-BMI-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: andnl %eax, %ecx, %eax
; X86-BMI-NEXT: .LBB2_2: # %identity
; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i16:
-; X64: # %bb.0:
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB2_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notl %esi
-; X64-NEXT: andl %edi, %esi
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB2_2: # %identity
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i16:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB2_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notl %esi
+; X64-NOBMI-NEXT: andl %edi, %esi
+; X64-NOBMI-NEXT: movl %esi, %eax
+; X64-NOBMI-NEXT: retq
+; X64-NOBMI-NEXT: .LBB2_2: # %identity
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i16:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB2_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnl %edi, %esi, %eax
+; X64-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB2_2: # %identity
+; X64-BMI-NEXT: movl %edi, %eax
+; X64-BMI-NEXT: retq
%a = xor i16 %m, -1
br i1 %cond, label %mask, label %identity
@@ -178,24 +187,35 @@ define i16 @and_sink_not_i16_swapped(i16 %x, i16 %m, i1 zeroext %cond) {
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB3_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %ecx, %eax
+; X86-BMI-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: andnl %eax, %ecx, %eax
; X86-BMI-NEXT: .LBB3_2: # %identity
; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i16_swapped:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB3_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notl %esi
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: .LBB3_2: # %identity
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i16_swapped:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB3_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notl %esi
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: .LBB3_2: # %identity
+; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i16_swapped:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB3_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnl %edi, %esi, %eax
+; X64-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB3_2: # %identity
+; X64-BMI-NEXT: movl %edi, %eax
+; X64-BMI-NEXT: retq
%a = xor i16 %m, -1
br i1 %cond, label %mask, label %identity
@@ -228,24 +248,33 @@ define i32 @and_sink_not_i32(i32 %x, i32 %m, i1 zeroext %cond) {
; X86-BMI-NEXT: je .LBB4_2
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %eax, %ecx
-; X86-BMI-NEXT: movl %ecx, %eax
+; X86-BMI-NEXT: andnl %eax, %ecx, %eax
; X86-BMI-NEXT: .LBB4_2: # %identity
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i32:
-; X64: # %bb.0:
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB4_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notl %esi
-; X64-NEXT: andl %edi, %esi
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: retq
-; X64-NEXT: .LBB4_2: # %identity
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i32:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB4_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notl %esi
+; X64-NOBMI-NEXT: andl %edi, %esi
+; X64-NOBMI-NEXT: movl %esi, %eax
+; X64-NOBMI-NEXT: retq
+; X64-NOBMI-NEXT: .LBB4_2: # %identity
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i32:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB4_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnl %edi, %esi, %eax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB4_2: # %identity
+; X64-BMI-NEXT: movl %edi, %eax
+; X64-BMI-NEXT: retq
%a = xor i32 %m, -1
br i1 %cond, label %mask, label %identity
@@ -277,21 +306,31 @@ define i32 @and_sink_not_i32_swapped(i32 %x, i32 %m, i1 zeroext %cond) {
; X86-BMI-NEXT: je .LBB5_2
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %ecx, %eax
+; X86-BMI-NEXT: andnl %eax, %ecx, %eax
; X86-BMI-NEXT: .LBB5_2: # %identity
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i32_swapped:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB5_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notl %esi
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: .LBB5_2: # %identity
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i32_swapped:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movl %edi, %eax
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB5_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notl %esi
+; X64-NOBMI-NEXT: andl %esi, %eax
+; X64-NOBMI-NEXT: .LBB5_2: # %identity
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i32_swapped:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB5_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnl %edi, %esi, %eax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB5_2: # %identity
+; X64-BMI-NEXT: movl %edi, %eax
+; X64-BMI-NEXT: retq
%a = xor i32 %m, -1
br i1 %cond, label %mask, label %identity
@@ -334,28 +373,35 @@ define i64 @and_sink_not_i64(i64 %x, i64 %m, i1 zeroext %cond) nounwind {
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %eax, %ecx
-; X86-BMI-NEXT: andl %edx, %esi
-; X86-BMI-NEXT: movl %ecx, %eax
-; X86-BMI-NEXT: movl %esi, %edx
+; X86-BMI-NEXT: andnl %eax, %esi, %eax
+; X86-BMI-NEXT: andnl %edx, %ecx, %edx
; X86-BMI-NEXT: .LBB6_2: # %identity
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i64:
-; X64: # %bb.0:
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB6_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notq %rsi
-; X64-NEXT: andq %rdi, %rsi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: retq
-; X64-NEXT: .LBB6_2: # %identity
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i64:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB6_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notq %rsi
+; X64-NOBMI-NEXT: andq %rdi, %rsi
+; X64-NOBMI-NEXT: movq %rsi, %rax
+; X64-NOBMI-NEXT: retq
+; X64-NOBMI-NEXT: .LBB6_2: # %identity
+; X64-NOBMI-NEXT: movq %rdi, %rax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i64:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB6_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnq %rdi, %rsi, %rax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB6_2: # %identity
+; X64-BMI-NEXT: movq %rdi, %rax
+; X64-BMI-NEXT: retq
%a = xor i64 %m, -1
br i1 %cond, label %mask, label %identity
@@ -396,24 +442,33 @@ define i64 @and_sink_not_i64_swapped(i64 %x, i64 %m, i1 zeroext %cond) nounwind
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %ecx, %eax
-; X86-BMI-NEXT: andl %esi, %edx
+; X86-BMI-NEXT: andnl %eax, %esi, %eax
+; X86-BMI-NEXT: andnl %edx, %ecx, %edx
; X86-BMI-NEXT: .LBB7_2: # %identity
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: retl
;
-; X64-LABEL: and_sink_not_i64_swapped:
-; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: testl %edx, %edx
-; X64-NEXT: je .LBB7_2
-; X64-NEXT: # %bb.1: # %mask
-; X64-NEXT: notq %rsi
-; X64-NEXT: andq %rsi, %rax
-; X64-NEXT: .LBB7_2: # %identity
-; X64-NEXT: retq
+; X64-NOBMI-LABEL: and_sink_not_i64_swapped:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movq %rdi, %rax
+; X64-NOBMI-NEXT: testl %edx, %edx
+; X64-NOBMI-NEXT: je .LBB7_2
+; X64-NOBMI-NEXT: # %bb.1: # %mask
+; X64-NOBMI-NEXT: notq %rsi
+; X64-NOBMI-NEXT: andq %rsi, %rax
+; X64-NOBMI-NEXT: .LBB7_2: # %identity
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: and_sink_not_i64_swapped:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: testl %edx, %edx
+; X64-BMI-NEXT: je .LBB7_2
+; X64-BMI-NEXT: # %bb.1: # %mask
+; X64-BMI-NEXT: andnq %rdi, %rsi, %rax
+; X64-BMI-NEXT: retq
+; X64-BMI-NEXT: .LBB7_2: # %identity
+; X64-BMI-NEXT: movq %rdi, %rax
+; X64-BMI-NEXT: retq
%a = xor i64 %m, -1
br i1 %cond, label %mask, label %identity
@@ -559,10 +614,8 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB8_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pand %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: andnps %xmm0, %xmm1
+; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB8_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -635,10 +688,8 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB8_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
-; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
-; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movaps %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB8_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -647,9 +698,7 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB8_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vandnps %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB8_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <8 x i8> %m, splat (i8 -1)
@@ -813,9 +862,8 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB9_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: andnps %xmm0, %xmm1
+; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB9_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -896,9 +944,8 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB9_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
-; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movaps %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB9_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -907,9 +954,7 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB9_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vandnps %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB9_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <8 x i8> %m, splat (i8 -1)
@@ -1036,10 +1081,8 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB10_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pand %xmm0, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: andnps %xmm0, %xmm1
+; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB10_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -1089,10 +1132,8 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB10_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
-; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
-; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movaps %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB10_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -1101,9 +1142,7 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB10_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vandnps %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB10_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <4 x i32> %m, splat (i32 -1)
@@ -1224,9 +1263,8 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB11_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm2, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: andnps %xmm0, %xmm1
+; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB11_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -1270,9 +1308,8 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB11_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NOAVX2-NEXT: pxor %xmm2, %xmm1
-; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movaps %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB11_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -1281,9 +1318,7 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB11_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vandnps %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB11_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <4 x i32> %m, splat (i32 -1)
@@ -1457,13 +1492,11 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
; X86-SSE2-NEXT: cmpb $0, 24(%ebp)
; X86-SSE2-NEXT: je .LBB12_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; X86-SSE2-NEXT: pxor %xmm3, %xmm2
-; X86-SSE2-NEXT: pxor 8(%ebp), %xmm3
-; X86-SSE2-NEXT: pand %xmm0, %xmm2
-; X86-SSE2-NEXT: pand %xmm1, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: movaps 8(%ebp), %xmm3
+; X86-SSE2-NEXT: andnps %xmm0, %xmm2
+; X86-SSE2-NEXT: andnps %xmm1, %xmm3
+; X86-SSE2-NEXT: movaps %xmm2, %xmm0
+; X86-SSE2-NEXT: movaps %xmm3, %xmm1
; X86-SSE2-NEXT: .LBB12_2: # %identity
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
@@ -1549,13 +1582,10 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB12_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm4, %xmm4
-; X64-NOAVX2-NEXT: pxor %xmm4, %xmm3
-; X64-NOAVX2-NEXT: pxor %xmm4, %xmm2
-; X64-NOAVX2-NEXT: pand %xmm0, %xmm2
-; X64-NOAVX2-NEXT: pand %xmm1, %xmm3
-; X64-NOAVX2-NEXT: movdqa %xmm2, %xmm0
-; X64-NOAVX2-NEXT: movdqa %xmm3, %xmm1
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm2
+; X64-NOAVX2-NEXT: andnps %xmm1, %xmm3
+; X64-NOAVX2-NEXT: movaps %xmm2, %xmm0
+; X64-NOAVX2-NEXT: movaps %xmm3, %xmm1
; X64-NOAVX2-NEXT: .LBB12_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -1564,9 +1594,7 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB12_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vandnps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: .LBB12_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <4 x i64> %m, splat (i64 -1)
@@ -1752,11 +1780,11 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
; X86-SSE2-NEXT: cmpb $0, 24(%ebp)
; X86-SSE2-NEXT: je .LBB13_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; X86-SSE2-NEXT: pxor %xmm3, %xmm2
-; X86-SSE2-NEXT: pxor 8(%ebp), %xmm3
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
-; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: movaps 8(%ebp), %xmm3
+; X86-SSE2-NEXT: andnps %xmm0, %xmm2
+; X86-SSE2-NEXT: andnps %xmm1, %xmm3
+; X86-SSE2-NEXT: movaps %xmm2, %xmm0
+; X86-SSE2-NEXT: movaps %xmm3, %xmm1
; X86-SSE2-NEXT: .LBB13_2: # %identity
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
@@ -1848,11 +1876,10 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
; X64-NOAVX2-NEXT: testl %edi, %edi
; X64-NOAVX2-NEXT: je .LBB13_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: pcmpeqd %xmm4, %xmm4
-; X64-NOAVX2-NEXT: pxor %xmm4, %xmm3
-; X64-NOAVX2-NEXT: pxor %xmm4, %xmm2
-; X64-NOAVX2-NEXT: pand %xmm2, %xmm0
-; X64-NOAVX2-NEXT: pand %xmm3, %xmm1
+; X64-NOAVX2-NEXT: andnps %xmm0, %xmm2
+; X64-NOAVX2-NEXT: andnps %xmm1, %xmm3
+; X64-NOAVX2-NEXT: movaps %xmm2, %xmm0
+; X64-NOAVX2-NEXT: movaps %xmm3, %xmm1
; X64-NOAVX2-NEXT: .LBB13_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -1861,9 +1888,7 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
; X64-AVX2-NEXT: testl %edi, %edi
; X64-AVX2-NEXT: je .LBB13_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandnps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: .LBB13_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor <4 x i64> %m, splat (i64 -1)
@@ -2420,11 +2445,9 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB16_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: notl %eax
-; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB16_2: # %identity
; X86-SSE2-NEXT: retl
@@ -2466,10 +2489,9 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
; X64-NOAVX2-NEXT: testl %esi, %esi
; X64-NOAVX2-NEXT: je .LBB16_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: notl %edi
; X64-NOAVX2-NEXT: movd %edi, %xmm1
; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NOAVX2-NEXT: pand %xmm0, %xmm1
+; X64-NOAVX2-NEXT: pandn %xmm0, %xmm1
; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB16_2: # %identity
; X64-NOAVX2-NEXT: retq
@@ -2479,10 +2501,9 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
; X64-AVX2-NEXT: testl %esi, %esi
; X64-AVX2-NEXT: je .LBB16_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: notl %edi
; X64-AVX2-NEXT: vmovd %edi, %xmm1
; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB16_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor i32 %m, -1
@@ -2586,11 +2607,10 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB17_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: notl %eax
-; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
; X86-SSE2-NEXT: .LBB17_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -2628,10 +2648,10 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
; X64-NOAVX2-NEXT: testl %esi, %esi
; X64-NOAVX2-NEXT: je .LBB17_2
; X64-NOAVX2-NEXT: # %bb.1: # %mask
-; X64-NOAVX2-NEXT: notl %edi
; X64-NOAVX2-NEXT: movd %edi, %xmm1
; X64-NOAVX2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NOAVX2-NEXT: pand %xmm1, %xmm0
+; X64-NOAVX2-NEXT: pandn %xmm0, %xmm1
+; X64-NOAVX2-NEXT: movdqa %xmm1, %xmm0
; X64-NOAVX2-NEXT: .LBB17_2: # %identity
; X64-NOAVX2-NEXT: retq
;
@@ -2640,10 +2660,9 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
; X64-AVX2-NEXT: testl %esi, %esi
; X64-AVX2-NEXT: je .LBB17_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: notl %edi
; X64-AVX2-NEXT: vmovd %edi, %xmm1
; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT: .LBB17_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor i32 %m, -1
@@ -2789,12 +2808,14 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB18_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm3, %xmm2
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: .LBB18_2: # %identity
; X86-SSE2-NEXT: retl
@@ -2879,10 +2900,9 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
; X64-AVX2-NEXT: testl %esi, %esi
; X64-AVX2-NEXT: je .LBB18_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: notq %rdi
; X64-AVX2-NEXT: vmovq %rdi, %xmm1
; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; X64-AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: .LBB18_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor i64 %m, -1
@@ -3034,12 +3054,15 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
; X86-SSE2-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: je .LBB19_2
; X86-SSE2-NEXT: # %bb.1: # %mask
-; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; X86-SSE2-NEXT: pxor %xmm2, %xmm3
-; X86-SSE2-NEXT: pand %xmm3, %xmm0
-; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
; X86-SSE2-NEXT: .LBB19_2: # %identity
; X86-SSE2-NEXT: retl
;
@@ -3126,10 +3149,9 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
; X64-AVX2-NEXT: testl %esi, %esi
; X64-AVX2-NEXT: je .LBB19_2
; X64-AVX2-NEXT: # %bb.1: # %mask
-; X64-AVX2-NEXT: notq %rdi
; X64-AVX2-NEXT: vmovq %rdi, %xmm1
; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: .LBB19_2: # %identity
; X64-AVX2-NEXT: retq
%a = xor i64 %m, -1
@@ -3144,6 +3166,3 @@ mask:
identity:
ret <4 x i64> %x
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-BMI: {{.*}}
-; X64-NOBMI: {{.*}}
>From 91550c69d756d0cd1a33b0014099e097c7326232 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Wed, 17 Dec 2025 08:46:16 +0100
Subject: [PATCH 3/4] [X86] Use BMI for vectors, if vector instructions absent
---
.../lib/Target/X86/X86TargetTransformInfo.cpp | 2 +-
llvm/test/CodeGen/X86/andnot-sink-not.ll | 432 ++++++++----------
2 files changed, 190 insertions(+), 244 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b90232b25088b..7f3206aeeed6a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -7199,7 +7199,7 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
using namespace llvm::PatternMatch;
if (I->getOpcode() == Instruction::And &&
- (I->getType()->isVectorTy() ? ST->hasSSE2() : ST->hasBMI())) {
+ (ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
for (auto &Op : I->operands()) {
// (and X, (not Y)) -> (andn X, Y)
if (match(Op.get(), m_Not(m_Value()))) {
diff --git a/llvm/test/CodeGen/X86/andnot-sink-not.ll b/llvm/test/CodeGen/X86/andnot-sink-not.ll
index 8463b8ae4e750..fefbdc84699f4 100644
--- a/llvm/test/CodeGen/X86/andnot-sink-not.ll
+++ b/llvm/test/CodeGen/X86/andnot-sink-not.ll
@@ -624,17 +624,20 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X86-BMI-NEXT: pushl %ebx
; X86-BMI-NEXT: pushl %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bl
-; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ah
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB8_2
; X86-BMI-NEXT: # %bb.1: # %mask
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT: notb %cl
+; X86-BMI-NEXT: andb %dh, %cl
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
; X86-BMI-NEXT: notb %dh
; X86-BMI-NEXT: andb %ch, %dh
@@ -643,10 +646,7 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X86-BMI-NEXT: andb %dl, %ch
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-BMI-NEXT: notb %dl
-; X86-BMI-NEXT: andb %cl, %dl
-; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT: notb %cl
-; X86-BMI-NEXT: andb %bh, %cl
+; X86-BMI-NEXT: andb %bh, %dl
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-BMI-NEXT: notb %bh
; X86-BMI-NEXT: andb %bl, %bh
@@ -663,20 +663,20 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
; X86-BMI-NEXT: movb %al, 1(%esi)
; X86-BMI-NEXT: movb %bl, 2(%esi)
; X86-BMI-NEXT: movb %bh, 3(%esi)
-; X86-BMI-NEXT: movb %cl, 4(%esi)
-; X86-BMI-NEXT: movb %dl, 5(%esi)
-; X86-BMI-NEXT: movb %ch, 6(%esi)
-; X86-BMI-NEXT: movb %dh, 7(%esi)
+; X86-BMI-NEXT: movb %dl, 4(%esi)
+; X86-BMI-NEXT: movb %ch, 5(%esi)
+; X86-BMI-NEXT: movb %dh, 6(%esi)
+; X86-BMI-NEXT: movb %cl, 7(%esi)
; X86-BMI-NEXT: jmp .LBB8_3
; X86-BMI-NEXT: .LBB8_2: # %identity
; X86-BMI-NEXT: movb %al, (%esi)
; X86-BMI-NEXT: movb %ah, 1(%esi)
-; X86-BMI-NEXT: movb %dh, 2(%esi)
+; X86-BMI-NEXT: movb %cl, 2(%esi)
; X86-BMI-NEXT: movb %bl, 3(%esi)
; X86-BMI-NEXT: movb %bh, 4(%esi)
-; X86-BMI-NEXT: movb %cl, 5(%esi)
-; X86-BMI-NEXT: movb %dl, 6(%esi)
-; X86-BMI-NEXT: movb %ch, 7(%esi)
+; X86-BMI-NEXT: movb %dl, 5(%esi)
+; X86-BMI-NEXT: movb %ch, 6(%esi)
+; X86-BMI-NEXT: movb %dh, 7(%esi)
; X86-BMI-NEXT: .LBB8_3: # %identity
; X86-BMI-NEXT: movl %esi, %eax
; X86-BMI-NEXT: popl %esi
@@ -878,6 +878,7 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB9_2
; X86-BMI-NEXT: # %bb.1: # %mask
@@ -921,7 +922,6 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
; X86-BMI-NEXT: movb %cl, 7(%esi)
; X86-BMI-NEXT: jmp .LBB9_3
; X86-BMI-NEXT: .LBB9_2: # %identity
-; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
; X86-BMI-NEXT: movb %dh, (%esi)
; X86-BMI-NEXT: movb %dl, 1(%esi)
; X86-BMI-NEXT: movb %cl, 2(%esi)
@@ -1092,36 +1092,26 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
; X86-BMI-NEXT: pushl %edi
; X86-BMI-NEXT: pushl %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB10_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: notl %edi
-; X86-BMI-NEXT: andl %ebx, %edi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %esi, %ebx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %edx, %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: notl %edx
-; X86-BMI-NEXT: andl %ecx, %edx
-; X86-BMI-NEXT: movl %edx, (%eax)
-; X86-BMI-NEXT: movl %esi, 4(%eax)
-; X86-BMI-NEXT: movl %ebx, 8(%eax)
-; X86-BMI-NEXT: movl %edi, 12(%eax)
-; X86-BMI-NEXT: jmp .LBB10_3
+; X86-BMI-NEXT: andnl %edi, %ebx, %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: andnl %esi, %ebx, %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: andnl %edx, %ebx, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: andnl %ecx, %ebx, %ecx
; X86-BMI-NEXT: .LBB10_2: # %identity
; X86-BMI-NEXT: movl %ecx, (%eax)
; X86-BMI-NEXT: movl %edx, 4(%eax)
; X86-BMI-NEXT: movl %esi, 8(%eax)
-; X86-BMI-NEXT: movl %ebx, 12(%eax)
-; X86-BMI-NEXT: .LBB10_3: # %identity
+; X86-BMI-NEXT: movl %edi, 12(%eax)
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
; X86-BMI-NEXT: popl %ebx
@@ -1274,30 +1264,26 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
; X86-BMI-NEXT: pushl %edi
; X86-BMI-NEXT: pushl %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB11_2
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %ecx
+; X86-BMI-NEXT: andnl %edi, %ebx, %edi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %edx
+; X86-BMI-NEXT: andnl %esi, %ebx, %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %esi
+; X86-BMI-NEXT: andnl %edx, %ebx, %edx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %edi
+; X86-BMI-NEXT: andnl %ecx, %ebx, %ecx
; X86-BMI-NEXT: .LBB11_2: # %identity
-; X86-BMI-NEXT: movl %edi, (%eax)
-; X86-BMI-NEXT: movl %esi, 4(%eax)
-; X86-BMI-NEXT: movl %edx, 8(%eax)
-; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %ecx, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %esi, 8(%eax)
+; X86-BMI-NEXT: movl %edi, 12(%eax)
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
; X86-BMI-NEXT: popl %ebx
@@ -1510,65 +1496,56 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
; X86-BMI-NEXT: pushl %esi
; X86-BMI-NEXT: subl $8, %esp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-BMI-NEXT: je .LBB12_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: notl %edi
-; X86-BMI-NEXT: andl %esi, %edi
-; X86-BMI-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %edx, %esi
+; X86-BMI-NEXT: andnl %ebp, %esi, %esi
+; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: andnl %ebx, %esi, %esi
; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %ecx, %esi
+; X86-BMI-NEXT: andnl %edi, %esi, %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %esi
+; X86-BMI-NEXT: movl %ecx, %ebx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %ebp, %ecx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: andnl %ebx, %edx, %edx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT: notl %ebp
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: notl %edi
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: movl %eax, %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: notl %eax
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ebp, %ebp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: movl %ebx, (%edx)
-; X86-BMI-NEXT: movl %eax, 4(%edx)
-; X86-BMI-NEXT: movl %edi, 8(%edx)
-; X86-BMI-NEXT: movl %ebp, 12(%edx)
-; X86-BMI-NEXT: movl %ecx, 16(%edx)
-; X86-BMI-NEXT: movl %esi, 20(%edx)
-; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-BMI-NEXT: movl %eax, 24(%edx)
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-BMI-NEXT: movl %eax, 28(%edx)
-; X86-BMI-NEXT: movl %edx, %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ebx, %ebx
+; X86-BMI-NEXT: movl %ebx, (%eax)
+; X86-BMI-NEXT: movl %ebp, 4(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %edi, 20(%eax)
+; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
; X86-BMI-NEXT: jmp .LBB12_3
; X86-BMI-NEXT: .LBB12_2: # %identity
-; X86-BMI-NEXT: movl %ebx, (%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: movl %ebx, 4(%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: movl %ebx, 8(%eax)
-; X86-BMI-NEXT: movl %edi, 12(%eax)
-; X86-BMI-NEXT: movl %ebp, 16(%eax)
-; X86-BMI-NEXT: movl %ecx, 20(%eax)
-; X86-BMI-NEXT: movl %edx, 24(%eax)
-; X86-BMI-NEXT: movl %esi, 28(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, (%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %ecx, 8(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %edi, 20(%eax)
+; X86-BMI-NEXT: movl %ebx, 24(%eax)
+; X86-BMI-NEXT: movl %ebp, 28(%eax)
; X86-BMI-NEXT: .LBB12_3: # %identity
; X86-BMI-NEXT: addl $8, %esp
; X86-BMI-NEXT: popl %esi
@@ -1796,75 +1773,60 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
; X86-BMI-NEXT: pushl %ebx
; X86-BMI-NEXT: pushl %edi
; X86-BMI-NEXT: pushl %esi
-; X86-BMI-NEXT: subl $12, %esp
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: subl $8, %esp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB13_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl %esi, %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %edx
-; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: movl %edi, %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: andl %esi, %edi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %edx
-; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %eax
-; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: andnl %ebp, %esi, %esi
+; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %ebp
+; X86-BMI-NEXT: andnl %ebx, %esi, %esi
+; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %ecx
+; X86-BMI-NEXT: andnl %edi, %esi, %edi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %esi
+; X86-BMI-NEXT: movl %ecx, %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ecx, %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: andl %esi, %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, %eax
-; X86-BMI-NEXT: movl %eax, (%ebx)
-; X86-BMI-NEXT: movl %edx, 4(%ebx)
-; X86-BMI-NEXT: movl %ecx, 8(%ebx)
-; X86-BMI-NEXT: movl %ebp, 12(%ebx)
-; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-BMI-NEXT: movl %eax, 16(%ebx)
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-BMI-NEXT: movl %eax, 20(%ebx)
-; X86-BMI-NEXT: movl %edi, 24(%ebx)
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-BMI-NEXT: movl %eax, 28(%ebx)
+; X86-BMI-NEXT: andnl %ebx, %edx, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ebp, %ebp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %ebx, %ebx
+; X86-BMI-NEXT: movl %ebx, (%eax)
+; X86-BMI-NEXT: movl %ebp, 4(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %edi, 20(%eax)
+; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
; X86-BMI-NEXT: jmp .LBB13_3
; X86-BMI-NEXT: .LBB13_2: # %identity
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl %edx, (%ebx)
+; X86-BMI-NEXT: movl %edx, (%eax)
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl %edx, 4(%ebx)
-; X86-BMI-NEXT: movl %ecx, 8(%ebx)
-; X86-BMI-NEXT: movl %ebp, 12(%ebx)
-; X86-BMI-NEXT: movl %eax, 16(%ebx)
-; X86-BMI-NEXT: movl %edi, 20(%ebx)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %ecx, 8(%eax)
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, 24(%ebx)
-; X86-BMI-NEXT: movl %esi, 28(%ebx)
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %edi, 20(%eax)
+; X86-BMI-NEXT: movl %ebx, 24(%eax)
+; X86-BMI-NEXT: movl %ebp, 28(%eax)
; X86-BMI-NEXT: .LBB13_3: # %identity
-; X86-BMI-NEXT: movl %ebx, %eax
-; X86-BMI-NEXT: addl $12, %esp
+; X86-BMI-NEXT: addl $8, %esp
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
; X86-BMI-NEXT: popl %ebx
@@ -2040,8 +2002,8 @@ define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) n
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %dh
; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %bh
-; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
; X86-BMI-NEXT: je .LBB14_2
; X86-BMI-NEXT: # %bb.1: # %mask
@@ -2060,8 +2022,8 @@ define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) n
; X86-BMI-NEXT: andb %dl, %bh
; X86-BMI-NEXT: andb %dl, %cl
; X86-BMI-NEXT: andb {{[0-9]+}}(%esp), %dl
-; X86-BMI-NEXT: movb %dl, (%eax)
-; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %dl, 1(%eax)
+; X86-BMI-NEXT: movb %cl, (%eax)
; X86-BMI-NEXT: movb %bh, 2(%eax)
; X86-BMI-NEXT: movb %bl, 3(%eax)
; X86-BMI-NEXT: movb %dh, 4(%eax)
@@ -2072,8 +2034,8 @@ define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) n
; X86-BMI-NEXT: movb %cl, 7(%eax)
; X86-BMI-NEXT: jmp .LBB14_3
; X86-BMI-NEXT: .LBB14_2: # %identity
-; X86-BMI-NEXT: movb %ch, (%eax)
-; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %cl, (%eax)
+; X86-BMI-NEXT: movb %ch, 1(%eax)
; X86-BMI-NEXT: movb %bh, 2(%eax)
; X86-BMI-NEXT: movb %bl, 3(%eax)
; X86-BMI-NEXT: movb %dh, 4(%eax)
@@ -2285,10 +2247,10 @@ define <8 x i8> @and_sink_not_splat_v8i8_swapped(<8 x i8> %x, i8 %m, i1 zeroext
; X86-BMI-NEXT: andb %ch, %dh
; X86-BMI-NEXT: andb %ch, %bl
; X86-BMI-NEXT: andb %ch, %bh
-; X86-BMI-NEXT: andb %ch, %cl
; X86-BMI-NEXT: andb %ch, %dl
-; X86-BMI-NEXT: movb %dl, (%eax)
+; X86-BMI-NEXT: andb %ch, %cl
; X86-BMI-NEXT: movb %cl, 1(%eax)
+; X86-BMI-NEXT: movb %dl, (%eax)
; X86-BMI-NEXT: movb %bh, 2(%eax)
; X86-BMI-NEXT: movb %bl, 3(%eax)
; X86-BMI-NEXT: movb %dh, 4(%eax)
@@ -2466,16 +2428,12 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
; X86-BMI-NEXT: je .LBB16_2
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %ecx
-; X86-BMI-NEXT: andl %ebx, %edx
-; X86-BMI-NEXT: andl %ebx, %esi
-; X86-BMI-NEXT: andl %edi, %ebx
-; X86-BMI-NEXT: movl %ebx, (%eax)
-; X86-BMI-NEXT: jmp .LBB16_3
+; X86-BMI-NEXT: andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT: andnl %edx, %ebx, %edx
+; X86-BMI-NEXT: andnl %esi, %ebx, %esi
+; X86-BMI-NEXT: andnl %edi, %ebx, %edi
; X86-BMI-NEXT: .LBB16_2: # %identity
; X86-BMI-NEXT: movl %edi, (%eax)
-; X86-BMI-NEXT: .LBB16_3: # %identity
; X86-BMI-NEXT: movl %esi, 4(%eax)
; X86-BMI-NEXT: movl %edx, 8(%eax)
; X86-BMI-NEXT: movl %ecx, 12(%eax)
@@ -2628,11 +2586,10 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
; X86-BMI-NEXT: je .LBB17_2
; X86-BMI-NEXT: # %bb.1: # %mask
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: notl %ebx
-; X86-BMI-NEXT: andl %ebx, %ecx
-; X86-BMI-NEXT: andl %ebx, %edx
-; X86-BMI-NEXT: andl %ebx, %esi
-; X86-BMI-NEXT: andl %ebx, %edi
+; X86-BMI-NEXT: andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT: andnl %edx, %ebx, %edx
+; X86-BMI-NEXT: andnl %esi, %ebx, %esi
+; X86-BMI-NEXT: andnl %edi, %ebx, %edi
; X86-BMI-NEXT: .LBB17_2: # %identity
; X86-BMI-NEXT: movl %edi, (%eax)
; X86-BMI-NEXT: movl %esi, 4(%eax)
@@ -2828,53 +2785,49 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
; X86-BMI-NEXT: pushl %esi
; X86-BMI-NEXT: subl $8, %esp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI-NEXT: je .LBB18_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: andnl %ecx, %esi, %ecx
+; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: andnl %ebx, %esi, %ecx
; X86-BMI-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: notl %ecx
-; X86-BMI-NEXT: andl %ecx, %edx
-; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: andl %ecx, %esi
-; X86-BMI-NEXT: andl %ecx, %ebx
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: notl %edx
-; X86-BMI-NEXT: andl %edx, (%esp) # 4-byte Folded Spill
-; X86-BMI-NEXT: andl %edx, %edi
-; X86-BMI-NEXT: andl %edx, %ebp
-; X86-BMI-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT: movl %edx, (%eax)
-; X86-BMI-NEXT: movl %ecx, 4(%eax)
-; X86-BMI-NEXT: movl %ebp, 8(%eax)
-; X86-BMI-NEXT: movl %ebx, 12(%eax)
+; X86-BMI-NEXT: andnl %ebp, %esi, %ebp
+; X86-BMI-NEXT: andnl %edx, %esi, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %ebx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %edi
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %ecx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %esi
+; X86-BMI-NEXT: movl %esi, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %ecx, 8(%eax)
+; X86-BMI-NEXT: movl %ebp, 12(%eax)
; X86-BMI-NEXT: movl %edi, 16(%eax)
-; X86-BMI-NEXT: movl %esi, 20(%eax)
; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-BMI-NEXT: movl %ecx, 24(%eax)
+; X86-BMI-NEXT: movl %ecx, 20(%eax)
+; X86-BMI-NEXT: movl %ebx, 24(%eax)
; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-BMI-NEXT: movl %ecx, 28(%eax)
; X86-BMI-NEXT: jmp .LBB18_3
; X86-BMI-NEXT: .LBB18_2: # %identity
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, (%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, 4(%eax)
-; X86-BMI-NEXT: movl %ebp, 8(%eax)
-; X86-BMI-NEXT: movl %ebx, 12(%eax)
-; X86-BMI-NEXT: movl %edi, 16(%eax)
-; X86-BMI-NEXT: movl %esi, 20(%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, 24(%eax)
-; X86-BMI-NEXT: movl %edx, 28(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl %edi, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ebp, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %ebx, 20(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 24(%eax)
; X86-BMI-NEXT: .LBB18_3: # %identity
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
; X86-BMI-NEXT: addl $8, %esp
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
@@ -3072,59 +3025,52 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
; X86-BMI-NEXT: pushl %ebx
; X86-BMI-NEXT: pushl %edi
; X86-BMI-NEXT: pushl %esi
-; X86-BMI-NEXT: subl $12, %esp
+; X86-BMI-NEXT: subl $8, %esp
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-BMI-NEXT: cmpb $0, {{[0-9]+}}(%esp)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI-NEXT: je .LBB19_2
; X86-BMI-NEXT: # %bb.1: # %mask
-; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: andl %esi, (%esp) # 4-byte Folded Spill
-; X86-BMI-NEXT: andl %esi, %edi
-; X86-BMI-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: andl %esi, %ebx
-; X86-BMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT: andl %esi, %ecx
+; X86-BMI-NEXT: andnl %ecx, %esi, %ecx
+; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: andnl %ebx, %esi, %ecx
+; X86-BMI-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: andnl %ebp, %esi, %ebp
+; X86-BMI-NEXT: andnl %edx, %esi, %edx
; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT: notl %esi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT: andl %esi, %edi
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT: andl %esi, %ebx
-; X86-BMI-NEXT: andl %esi, %ebp
-; X86-BMI-NEXT: andl %esi, %edx
-; X86-BMI-NEXT: movl %edx, (%eax)
-; X86-BMI-NEXT: movl %ecx, 4(%eax)
-; X86-BMI-NEXT: movl %ebp, 8(%eax)
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-BMI-NEXT: movl %ecx, 12(%eax)
-; X86-BMI-NEXT: movl %ebx, 16(%eax)
-; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-BMI-NEXT: movl %ecx, 20(%eax)
-; X86-BMI-NEXT: movl %edi, 24(%eax)
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %ebx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %edi
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %ecx
+; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %esi, %esi
+; X86-BMI-NEXT: movl %esi, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl %ecx, 8(%eax)
+; X86-BMI-NEXT: movl %ebp, 12(%eax)
+; X86-BMI-NEXT: movl %edi, 16(%eax)
; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-BMI-NEXT: movl %ecx, 28(%eax)
+; X86-BMI-NEXT: movl %ecx, 20(%eax)
+; X86-BMI-NEXT: movl %ebx, 24(%eax)
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-BMI-NEXT: jmp .LBB19_3
; X86-BMI-NEXT: .LBB19_2: # %identity
-; X86-BMI-NEXT: movl %edx, (%eax)
-; X86-BMI-NEXT: movl %ecx, 4(%eax)
-; X86-BMI-NEXT: movl %ebp, 8(%eax)
-; X86-BMI-NEXT: movl %ebx, 12(%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, 16(%eax)
-; X86-BMI-NEXT: movl %edi, 20(%eax)
-; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT: movl %ecx, 24(%eax)
-; X86-BMI-NEXT: movl %esi, 28(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl %edi, (%eax)
+; X86-BMI-NEXT: movl %edx, 4(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %ebp, 12(%eax)
+; X86-BMI-NEXT: movl %esi, 16(%eax)
+; X86-BMI-NEXT: movl %ebx, 20(%eax)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %edx, 24(%eax)
; X86-BMI-NEXT: .LBB19_3: # %identity
-; X86-BMI-NEXT: addl $12, %esp
+; X86-BMI-NEXT: movl %ecx, 28(%eax)
+; X86-BMI-NEXT: addl $8, %esp
; X86-BMI-NEXT: popl %esi
; X86-BMI-NEXT: popl %edi
; X86-BMI-NEXT: popl %ebx
>From c663a380e24c8f12c88af40a4e7d1d9070c3d82f Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik at samsung.com>
Date: Wed, 17 Dec 2025 15:39:31 +0100
Subject: [PATCH 4/4] [X86] Exclude `i8`
---
llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 7f3206aeeed6a..ced4e96721268 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -7202,7 +7202,7 @@ bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
(ST->hasBMI() || (I->getType()->isVectorTy() && ST->hasSSE2()))) {
for (auto &Op : I->operands()) {
// (and X, (not Y)) -> (andn X, Y)
- if (match(Op.get(), m_Not(m_Value()))) {
+ if (match(Op.get(), m_Not(m_Value())) && !I->getType()->isIntegerTy(8)) {
Ops.push_back(&Op);
return true;
}
More information about the llvm-commits
mailing list