[llvm] r332903 - [X86][AArch64][NFC] Add tests for vector masked merge unfolding

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Mon May 21 14:40:51 PDT 2018


Author: lebedevri
Date: Mon May 21 14:40:51 2018
New Revision: 332903

URL: http://llvm.org/viewvc/llvm-project?rev=332903&view=rev
Log:
[X86][AArch64][NFC] Add tests for vector masked merge unfolding

Summary:
This is [[ https://bugs.llvm.org/show_bug.cgi?id=37104 | PR37104 ]].

[[ https://bugs.llvm.org/show_bug.cgi?id=6773 | PR6773 ]] will introduce an IR canonicalization that is likely bad for the end assembly.
Previously, `andps`+`andnps` / `bsl` would be generated. (see `@out`)
Now, they would no longer be generated (see `@in`).

Differential Revision: https://reviews.llvm.org/D46008

Added:
    llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
    llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
    llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
    llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
Modified:
    llvm/trunk/test/CodeGen/X86/machine-cp.ll

Added: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll?rev=332903&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll Mon May 21 14:40:51 2018
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+
+define <4 x i32> @out_constant_varx_mone(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_mone:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    orn v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_mone(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_mone:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_mone_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_mone_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_42:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_42:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_42_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    bsl v2.16b, v1.16b, v0.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_42_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_mone_vary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_mone_vary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_mone_vary_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    orn v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_mone_vary_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.16b, v1.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_42_vary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #42
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_42_vary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_42_vary_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #42
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_42_vary_invmask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}

Added: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll?rev=332903&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll Mon May 21 14:40:51 2018
@@ -0,0 +1,466 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; https://bugs.llvm.org/show_bug.cgi?id=37104
+
+; All the advanced stuff (negative tests, commutativity) is handled in the
+; scalar version of the test only.
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <1 x i8> %x, %mask
+  %notmask = xor <1 x i8> %mask, <i8 -1>
+  %my = and <1 x i8> %y, %notmask
+  %r = or <1 x i8> %mx, %my
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d3, #0x0000ff000000ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %mx = and <2 x i8> %x, %mask
+  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
+  %my = and <2 x i8> %y, %notmask
+  %r = or <2 x i8> %mx, %my
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <1 x i16> %x, %mask
+  %notmask = xor <1 x i16> %mask, <i16 -1>
+  %my = and <1 x i16> %y, %notmask
+  %r = or <1 x i16> %mx, %my
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d3, #0xff00ff00ff00ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v4i8_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d3, #0xff00ff00ff00ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d3, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %mx = and <2 x i16> %x, %mask
+  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
+  %my = and <2 x i16> %y, %notmask
+  %r = or <2 x i16> %mx, %my
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <1 x i32> %x, %mask
+  %notmask = xor <1 x i32> %mask, <i32 -1>
+  %my = and <1 x i32> %y, %notmask
+  %r = or <1 x i32> %mx, %my
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <8 x i8> %x, %mask
+  %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <8 x i8> %y, %notmask
+  %r = or <8 x i8> %mx, %my
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v4i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <2 x i32> %x, %mask
+  %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
+  %my = and <2 x i32> %y, %notmask
+  %r = or <2 x i32> %mx, %my
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: out_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <1 x i64> %x, %mask
+  %notmask = xor <1 x i64> %mask, <i64 -1>
+  %my = and <1 x i64> %y, %notmask
+  %r = or <1 x i64> %mx, %my
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <16 x i8> %x, %mask
+  %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <16 x i8> %y, %notmask
+  %r = or <16 x i8> %mx, %my
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <8 x i16> %x, %mask
+  %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <8 x i16> %y, %notmask
+  %r = or <8 x i16> %mx, %my
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v4i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-LABEL: out_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+  %mx = and <2 x i64> %x, %mask
+  %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
+  %my = and <2 x i64> %y, %notmask
+  %r = or <2 x i64> %mx, %my
+  ret <2 x i64> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Should be the same as the previous one.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <1 x i8> %x, %y
+  %n1 = and <1 x i8> %n0, %mask
+  %r = xor <1 x i8> %n1, %y
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <2 x i8> %x, %y
+  %n1 = and <2 x i8> %n0, %mask
+  %r = xor <2 x i8> %n1, %y
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <1 x i16> %x, %y
+  %n1 = and <1 x i16> %n0, %mask
+  %r = xor <1 x i16> %n1, %y
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i8> %x, %y
+  %n1 = and <4 x i8> %n0, %mask
+  %r = xor <4 x i8> %n1, %y
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <2 x i16> %x, %y
+  %n1 = and <2 x i16> %n0, %mask
+  %r = xor <2 x i16> %n1, %y
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <1 x i32> %x, %y
+  %n1 = and <1 x i32> %n0, %mask
+  %r = xor <1 x i32> %n1, %y
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <8 x i8> %x, %y
+  %n1 = and <8 x i8> %n0, %mask
+  %r = xor <8 x i8> %n1, %y
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i16> %x, %y
+  %n1 = and <4 x i16> %n0, %mask
+  %r = xor <4 x i16> %n1, %y
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <2 x i32> %x, %y
+  %n1 = and <2 x i32> %n0, %mask
+  %r = xor <2 x i32> %n1, %y
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ret
+  %n0 = xor <1 x i64> %x, %y
+  %n1 = and <1 x i64> %n0, %mask
+  %r = xor <1 x i64> %n1, %y
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <16 x i8> %x, %y
+  %n1 = and <16 x i8> %n0, %mask
+  %r = xor <16 x i8> %n1, %y
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <8 x i16> %x, %y
+  %n1 = and <8 x i16> %n0, %mask
+  %r = xor <8 x i16> %n1, %y
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <4 x i32> %x, %y
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %n0 = xor <2 x i64> %x, %y
+  %n1 = and <2 x i64> %n0, %mask
+  %r = xor <2 x i64> %n1, %y
+  ret <2 x i64> %r
+}

Modified: llvm/trunk/test/CodeGen/X86/machine-cp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-cp.ll?rev=332903&r1=332902&r2=332903&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-cp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-cp.ll Mon May 21 14:40:51 2018
@@ -10,7 +10,7 @@ define i32 @t1(i32 %a, i32 %b) nounwind
 ; CHECK-NEXT:    testl %esi, %esi
 ; CHECK-NEXT:    je LBB0_1
 ; CHECK-NEXT:  ## %bb.2: ## %while.body.preheader
-; CHECK-NEXT:  movl %esi, %edx
+; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_3: ## %while.body
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1

Added: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll?rev=332903&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll (added)
+++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll Mon May 21 14:40:51 2018
@@ -0,0 +1,618 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+
+define <4 x i32> @out_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_mone:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_mone:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pand (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_mone:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_mone:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
+; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    xorps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_mone:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pandn (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_mone:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpandn (%rdx), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_mone_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vandnps (%rdi), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_mone_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm2
+; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm2
+; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
+; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm2
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm2
+; CHECK-XOP-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_42:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_42:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_42:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_42:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm0 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_42:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_42:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_42_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_42_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_42_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rdi), %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm2
+; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm2
+; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
+; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm2
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm1, %xmm2
+; CHECK-XOP-NEXT:    vandnps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_mone_vary:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_mone_vary:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_mone_vary:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vandnps (%rsi), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_mone_vary:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm1
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_mone_vary:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andnps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_mone_vary:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vandnps (%rdx), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pand (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_mone_vary_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vpand (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm1
+; CHECK-SSE1-NEXT:    xorps {{.*}}(%rip), %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm2
+; CHECK-SSE1-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm2
+; CHECK-SSE1-NEXT:    movaps %xmm2, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; CHECK-SSE2-NEXT:    pxor (%rdx), %xmm2
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpxor (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vpandn %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_42_vary:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_42_vary:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_42_vary:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT:    vpcmov %xmm0, (%rsi), %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_42_vary:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_42_vary:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42]
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_42_vary:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_42_vary_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm1
+; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_constant_42_vary_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_constant_42_vary_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm1
+; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm2 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm2
+; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm1
+; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps {{.*#+}} xmm2 = [42,42,42,42]
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    andnps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT:    vmovaps (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-XOP-NEXT:    vandnps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}

Added: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll?rev=332903&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (added)
+++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll Mon May 21 14:40:51 2018
@@ -0,0 +1,4693 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASELINE
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+
+; https://bugs.llvm.org/show_bug.cgi?id=37104
+
+; All the advanced stuff (negative tests, commutativity) is handled in the
+; scalar version of the test only.
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notb %dl
+; CHECK-NEXT:    andb %sil, %dl
+; CHECK-NEXT:    orb %dil, %dl
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    retq
+  %mx = and <1 x i8> %x, %mask
+  %notmask = xor <1 x i8> %mask, <i8 -1>
+  %my = and <1 x i8> %y, %notmask
+  %r = or <1 x i8> %mx, %my
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    notb %r8b
+; CHECK-BASELINE-NEXT:    notb %r9b
+; CHECK-BASELINE-NEXT:    andb %cl, %r9b
+; CHECK-BASELINE-NEXT:    andb %dl, %r8b
+; CHECK-BASELINE-NEXT:    orb %dil, %r8b
+; CHECK-BASELINE-NEXT:    orb %sil, %r9b
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v2i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    notb %r8b
+; CHECK-SSE1-NEXT:    notb %r9b
+; CHECK-SSE1-NEXT:    andb %cl, %r9b
+; CHECK-SSE1-NEXT:    andb %dl, %r8b
+; CHECK-SSE1-NEXT:    orb %dil, %r8b
+; CHECK-SSE1-NEXT:    orb %sil, %r9b
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v2i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v2i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %mx = and <2 x i8> %x, %mask
+  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
+  %my = and <2 x i8> %y, %notmask
+  %r = or <2 x i8> %mx, %my
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %edx
+; CHECK-NEXT:    andl %esi, %edx
+; CHECK-NEXT:    orl %edi, %edx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    retq
+  %mx = and <1 x i16> %x, %mask
+  %notmask = xor <1 x i16> %mask, <i16 -1>
+  %my = and <1 x i16> %y, %notmask
+  %r = or <1 x i16> %mx, %my
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    andb %bl, %r8b
+; CHECK-BASELINE-NEXT:    andb %al, %cl
+; CHECK-BASELINE-NEXT:    andb %r11b, %dl
+; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb %r9b, %r10b
+; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    orb %r8b, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %cl, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    orb %dl, %r11b
+; CHECK-BASELINE-NEXT:    movb %bl, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    andb %bl, %r8b
+; CHECK-SSE1-NEXT:    andb %al, %cl
+; CHECK-SSE1-NEXT:    andb %r11b, %dl
+; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb %r9b, %r10b
+; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    orb %r8b, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %cl, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    orb %dl, %r11b
+; CHECK-SSE1-NEXT:    movb %bl, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; Masked-merge pattern under test: r = (x & mask) | (y & ~mask), variable mask.
+  ; With SSE2/XOP this stays as vector and/xor/and/or; without SSE2 it is
+  ; scalarized into per-byte GPR ops (see CHECK-BASELINE/CHECK-SSE1 above).
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i8_undef:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    andb %al, %r8b
+; CHECK-BASELINE-NEXT:    andb %r11b, %dl
+; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb %r9b, %r10b
+; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %r8b, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    orb %dl, %r11b
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i8_undef:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    andb %al, %r8b
+; CHECK-SSE1-NEXT:    andb %r11b, %dl
+; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb %r9b, %r10b
+; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %r8b, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    orb %dl, %r11b
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i8_undef:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i8_undef:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; Same masked-merge pattern as @out_v4i8, but lane 2 of the NOT-mask constant
+  ; is undef — checks the pattern is still recognized with a partially-undef xor.
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
+; CHECK-BASELINE-NEXT:    orl %esi, %r9d
+; CHECK-BASELINE-NEXT:    andl %edx, %r8d
+; CHECK-BASELINE-NEXT:    orl %edi, %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v2i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notl %r8d
+; CHECK-SSE1-NEXT:    notl %r9d
+; CHECK-SSE1-NEXT:    andl %ecx, %r9d
+; CHECK-SSE1-NEXT:    orl %esi, %r9d
+; CHECK-SSE1-NEXT:    andl %edx, %r8d
+; CHECK-SSE1-NEXT:    orl %edi, %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v2i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v2i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; r = (x & mask) | (y & ~mask) on <2 x i16>; without SSE2 this lowers to two
+  ; scalar 32-bit and/not/and/or sequences (one per lane).
+  %mx = and <2 x i16> %x, %mask
+  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
+  %my = and <2 x i16> %y, %notmask
+  %r = or <2 x i16> %mx, %my
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %edx
+; CHECK-NEXT:    andl %esi, %edx
+; CHECK-NEXT:    orl %edi, %edx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    retq
+  ; Degenerate single-element vector: masked merge lowers to plain scalar
+  ; and/not/and/or regardless of SSE level (hence the common CHECK prefix).
+  %mx = and <1 x i32> %x, %mask
+  %notmask = xor <1 x i32> %mask, <i32 -1>
+  %my = and <1 x i32> %y, %notmask
+  %r = or <1 x i32> %mx, %my
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v8i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    andb %al, %r9b
+; CHECK-BASELINE-NEXT:    andb %bl, %r8b
+; CHECK-BASELINE-NEXT:    andb %r14b, %cl
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    andb %r11b, %dl
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    andb %r12b, %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    andb %r15b, %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    andb %bpl, %dl
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    notb %r14b
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    notb %bpl
+; CHECK-BASELINE-NEXT:    notb %r15b
+; CHECK-BASELINE-NEXT:    notb %r12b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    orb %r13b, %r12b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    orb %cl, %r15b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    orb %dl, %bpl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %r9b, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    orb %r8b, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    movb %r12b, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r15b, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bpl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r14b, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v8i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    andb %al, %r9b
+; CHECK-SSE1-NEXT:    andb %bl, %r8b
+; CHECK-SSE1-NEXT:    andb %r14b, %cl
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    andb %r11b, %dl
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    andb %r12b, %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    andb %r15b, %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    andb %bpl, %dl
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    notb %r14b
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    notb %bpl
+; CHECK-SSE1-NEXT:    notb %r15b
+; CHECK-SSE1-NEXT:    notb %r12b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    orb %r13b, %r12b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    orb %cl, %r15b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    orb %dl, %bpl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %r9b, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    orb %r8b, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    movb %r12b, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %r15b, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %bpl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 4(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %r14b, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v8i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v8i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; 64-bit-wide case: r = (x & mask) | (y & ~mask) on 8 x i8. Without SSE2 the
+  ; scalarized lowering needs all callee-saved GPRs plus two 1-byte spills.
+  %mx = and <8 x i8> %x, %mask
+  %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <8 x i8> %y, %notmask
+  %r = or <8 x i8> %mx, %my
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    andl %ebx, %esi
+; CHECK-BASELINE-NEXT:    andl %eax, %r8d
+; CHECK-BASELINE-NEXT:    andl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    andl %r10d, %edx
+; CHECK-BASELINE-NEXT:    notl %r10d
+; CHECK-BASELINE-NEXT:    notl %r11d
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %ebx
+; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
+; CHECK-BASELINE-NEXT:    orl %esi, %ebx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
+; CHECK-BASELINE-NEXT:    orl %r8d, %eax
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT:    orl %ecx, %r11d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT:    orl %edx, %r10d
+; CHECK-BASELINE-NEXT:    movw %bx, (%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r11w, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    andl %ebx, %esi
+; CHECK-SSE1-NEXT:    andl %eax, %r8d
+; CHECK-SSE1-NEXT:    andl %r11d, %ecx
+; CHECK-SSE1-NEXT:    andl %r10d, %edx
+; CHECK-SSE1-NEXT:    notl %r10d
+; CHECK-SSE1-NEXT:    notl %r11d
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    notl %ebx
+; CHECK-SSE1-NEXT:    andl %r9d, %ebx
+; CHECK-SSE1-NEXT:    orl %esi, %ebx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
+; CHECK-SSE1-NEXT:    orl %r8d, %eax
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT:    orl %ecx, %r11d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT:    orl %edx, %r10d
+; CHECK-SSE1-NEXT:    movw %bx, (%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %r11w, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; r = (x & mask) | (y & ~mask) on <4 x i16>; scalarized to per-lane word ops
+  ; when vectors are not legal (CHECK-BASELINE/CHECK-SSE1).
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i16_undef:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    andl %eax, %esi
+; CHECK-BASELINE-NEXT:    andl %r11d, %r8d
+; CHECK-BASELINE-NEXT:    andl %r10d, %edx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    notl %r10d
+; CHECK-BASELINE-NEXT:    notl %r11d
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    andl %r9d, %eax
+; CHECK-BASELINE-NEXT:    orl %esi, %eax
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT:    orl %r8d, %r11d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT:    orl %edx, %r10d
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
+; CHECK-BASELINE-NEXT:    movw %r11w, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i16_undef:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    andl %eax, %esi
+; CHECK-SSE1-NEXT:    andl %r11d, %r8d
+; CHECK-SSE1-NEXT:    andl %r10d, %edx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    notl %r10d
+; CHECK-SSE1-NEXT:    notl %r11d
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    andl %r9d, %eax
+; CHECK-SSE1-NEXT:    orl %esi, %eax
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT:    orl %r8d, %r11d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT:    orl %edx, %r10d
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
+; CHECK-SSE1-NEXT:    movw %r11w, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i16_undef:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i16_undef:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; Same as @out_v4i16, but lane 2 of the NOT-mask constant is undef —
+  ; checks the masked-merge match with a partially-undef xor constant.
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i32:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
+; CHECK-BASELINE-NEXT:    orl %esi, %r9d
+; CHECK-BASELINE-NEXT:    andl %edx, %r8d
+; CHECK-BASELINE-NEXT:    orl %edi, %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v2i32:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notl %r8d
+; CHECK-SSE1-NEXT:    notl %r9d
+; CHECK-SSE1-NEXT:    andl %ecx, %r9d
+; CHECK-SSE1-NEXT:    orl %esi, %r9d
+; CHECK-SSE1-NEXT:    andl %edx, %r8d
+; CHECK-SSE1-NEXT:    orl %edi, %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v2i32:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v2i32:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  ; r = (x & mask) | (y & ~mask) on <2 x i32>; two scalar and/not/and/or
+  ; sequences without SSE2, single andps/xorps/andps/orps chain with it.
+  %mx = and <2 x i32> %x, %mask
+  %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
+  %my = and <2 x i32> %y, %notmask
+  %r = or <2 x i32> %mx, %my
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: out_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andq %rdx, %rdi
+; CHECK-NEXT:    notq %rdx
+; CHECK-NEXT:    andq %rsi, %rdx
+; CHECK-NEXT:    orq %rdi, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    retq
+  ; Degenerate single-element vector: masked merge lowers to scalar 64-bit
+  ; and/not/and/or on every configuration (hence the common CHECK prefix).
+  %mx = and <1 x i64> %x, %mask
+  %notmask = xor <1 x i64> %mask, <i64 -1>
+  %my = and <1 x i64> %y, %notmask
+  %r = or <1 x i64> %mx, %my
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v16i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %al, %sil
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %sil, %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %cl, %sil
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    orb %sil, %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %dl, %sil
+; CHECK-BASELINE-NEXT:    notb %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    orb %sil, %dl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %bl, %sil
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    orb %sil, %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r13b, %sil
+; CHECK-BASELINE-NEXT:    notb %r13b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    orb %sil, %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r12b, %sil
+; CHECK-BASELINE-NEXT:    notb %r12b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    orb %sil, %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r15b, %sil
+; CHECK-BASELINE-NEXT:    notb %r15b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    orb %sil, %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r14b, %sil
+; CHECK-BASELINE-NEXT:    notb %r14b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    orb %sil, %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %bpl, %sil
+; CHECK-BASELINE-NEXT:    notb %bpl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    orb %sil, %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r11b, %sil
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    orb %sil, %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    movb %al, 15(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 13(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r13b, 11(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r12b, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r15b, 9(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r14b, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bpl, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    andb %al, %r9b
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %r9b, %al
+; CHECK-BASELINE-NEXT:    movb %r10b, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    andb %cl, %r8b
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    orb %r8b, %cl
+; CHECK-BASELINE-NEXT:    movb %al, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %al, %dl
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %dl, %al
+; CHECK-BASELINE-NEXT:    movb %cl, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %dl
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    orb %dl, %cl
+; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %al, %dl
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    orb %dl, %al
+; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v16i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %al, %sil
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %sil, %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %cl, %sil
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    orb %sil, %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %dl, %sil
+; CHECK-SSE1-NEXT:    notb %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    orb %sil, %dl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %bl, %sil
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    orb %sil, %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r13b, %sil
+; CHECK-SSE1-NEXT:    notb %r13b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    orb %sil, %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r12b, %sil
+; CHECK-SSE1-NEXT:    notb %r12b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    orb %sil, %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r15b, %sil
+; CHECK-SSE1-NEXT:    notb %r15b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    orb %sil, %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r14b, %sil
+; CHECK-SSE1-NEXT:    notb %r14b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    orb %sil, %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %bpl, %sil
+; CHECK-SSE1-NEXT:    notb %bpl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    orb %sil, %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r11b, %sil
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    orb %sil, %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    movb %al, 15(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 14(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 13(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 12(%rdi)
+; CHECK-SSE1-NEXT:    movb %r13b, 11(%rdi)
+; CHECK-SSE1-NEXT:    movb %r12b, 10(%rdi)
+; CHECK-SSE1-NEXT:    movb %r15b, 9(%rdi)
+; CHECK-SSE1-NEXT:    movb %r14b, 8(%rdi)
+; CHECK-SSE1-NEXT:    movb %bpl, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    andb %al, %r9b
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %r9b, %al
+; CHECK-SSE1-NEXT:    movb %r10b, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    andb %cl, %r8b
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    orb %r8b, %cl
+; CHECK-SSE1-NEXT:    movb %al, 4(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %al, %dl
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %dl, %al
+; CHECK-SSE1-NEXT:    movb %cl, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %dl
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    orb %dl, %cl
+; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %al, %dl
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    orb %dl, %al
+; CHECK-SSE1-NEXT:    movb %cl, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v16i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v16i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %mx = and <16 x i8> %x, %mask
+  %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <16 x i8> %y, %notmask
+  %r = or <16 x i8> %mx, %my
+  ret <16 x i8> %r
+}
+
+; Masked merge on <8 x i16>: r = (x & mask) | (y & ~mask), lane-wise.
+; Without SSE2 the vector is scalarized into eight 16-bit GPR merges (args
+; arrive split across registers/stack; result is stored through %rdi);
+; SSE2 folds the whole pattern into andps/andnps/orps, and XOP into vpcmov.
+; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py -- do not
+; edit them by hand; regenerate instead.
+define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v8i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    andw %r14w, %bx
+; CHECK-BASELINE-NEXT:    notl %r14d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
+; CHECK-BASELINE-NEXT:    orl %ebx, %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    andw %r11w, %bx
+; CHECK-BASELINE-NEXT:    notl %r11d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT:    orl %ebx, %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    andw %r10w, %bx
+; CHECK-BASELINE-NEXT:    notl %r10d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT:    orl %ebx, %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    andl %ebx, %r9d
+; CHECK-BASELINE-NEXT:    notl %ebx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    orl %r9d, %ebx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    andl %eax, %r8d
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
+; CHECK-BASELINE-NEXT:    orl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    andl %ebp, %ecx
+; CHECK-BASELINE-NEXT:    notl %ebp
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bp
+; CHECK-BASELINE-NEXT:    orl %ecx, %ebp
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    andl %ecx, %edx
+; CHECK-BASELINE-NEXT:    notl %ecx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    orl %edx, %ecx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; CHECK-BASELINE-NEXT:    andl %edx, %esi
+; CHECK-BASELINE-NEXT:    notl %edx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT:    orl %esi, %edx
+; CHECK-BASELINE-NEXT:    movw %r14w, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bx, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bp, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %cx, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movw %dx, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v8i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    andw %r14w, %bx
+; CHECK-SSE1-NEXT:    notl %r14d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
+; CHECK-SSE1-NEXT:    orl %ebx, %r14d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    andw %r11w, %bx
+; CHECK-SSE1-NEXT:    notl %r11d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT:    orl %ebx, %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    andw %r10w, %bx
+; CHECK-SSE1-NEXT:    notl %r10d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT:    orl %ebx, %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    andl %ebx, %r9d
+; CHECK-SSE1-NEXT:    notl %ebx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    orl %r9d, %ebx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    andl %eax, %r8d
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
+; CHECK-SSE1-NEXT:    orl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    andl %ebp, %ecx
+; CHECK-SSE1-NEXT:    notl %ebp
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bp
+; CHECK-SSE1-NEXT:    orl %ecx, %ebp
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    andl %ecx, %edx
+; CHECK-SSE1-NEXT:    notl %ecx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    orl %edx, %ecx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; CHECK-SSE1-NEXT:    andl %edx, %esi
+; CHECK-SSE1-NEXT:    notl %edx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT:    orl %esi, %edx
+; CHECK-SSE1-NEXT:    movw %r14w, 14(%rdi)
+; CHECK-SSE1-NEXT:    movw %r11w, 12(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 10(%rdi)
+; CHECK-SSE1-NEXT:    movw %bx, 8(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %bp, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %cx, 2(%rdi)
+; CHECK-SSE1-NEXT:    movw %dx, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v8i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v8i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %mx = and <8 x i16> %x, %mask
+  %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <8 x i16> %y, %notmask
+  %r = or <8 x i16> %mx, %my
+  ret <8 x i16> %r
+}
+
+; Masked merge on <4 x i32> with all three operands loaded from memory:
+; r = (x & mask) | (y & ~mask). Taking pointer arguments lets even the
+; SSE1-only target use the 128-bit andps/andnps/orps form (4 x f32-sized
+; lanes); the no-SSE baseline scalarizes into four 32-bit GPR merges.
+; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py -- do not
+; edit them by hand; regenerate instead.
+define <4 x i32> @out_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i32:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl (%rcx), %r8d
+; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r9d
+; CHECK-BASELINE-NEXT:    movl 8(%rcx), %eax
+; CHECK-BASELINE-NEXT:    movl 12(%rcx), %ecx
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    andl %ecx, %r10d
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    andl %eax, %r11d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
+; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %esi
+; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %ecx
+; CHECK-BASELINE-NEXT:    andl 12(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    orl %r10d, %ecx
+; CHECK-BASELINE-NEXT:    andl 8(%rdx), %eax
+; CHECK-BASELINE-NEXT:    orl %r11d, %eax
+; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    orl %ebx, %r9d
+; CHECK-BASELINE-NEXT:    andl (%rdx), %r8d
+; CHECK-BASELINE-NEXT:    orl %esi, %r8d
+; CHECK-BASELINE-NEXT:    movl %ecx, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movl %eax, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r8d, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i32:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i32:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i32:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; Same masked merge as @out_v4i32, but lane 2 of the inverted mask is undef.
+; The scalar baseline exploits that freedom (lane 2 is just x&mask, with no
+; ~mask/or step); the vector targets appear to emit the same code as the
+; fully-defined case.
+; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py -- do not
+; edit them by hand; regenerate instead.
+define <4 x i32> @out_v4i32_undef(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i32_undef:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r8d
+; CHECK-BASELINE-NEXT:    movl (%rcx), %r9d
+; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r10d
+; CHECK-BASELINE-NEXT:    movl 12(%rcx), %eax
+; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r8d
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ecx
+; CHECK-BASELINE-NEXT:    andl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    andl %r10d, %r11d
+; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    notl %r10d
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    andl 12(%rdx), %eax
+; CHECK-BASELINE-NEXT:    orl %ecx, %eax
+; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    orl %r11d, %r10d
+; CHECK-BASELINE-NEXT:    andl (%rdx), %r9d
+; CHECK-BASELINE-NEXT:    orl %esi, %r9d
+; CHECK-BASELINE-NEXT:    movl %r8d, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movl %eax, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r10d, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r9d, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i32_undef:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
+; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i32_undef:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i32_undef:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; Masked merge on <2 x i64>: r = (x & mask) | (y & ~mask). Non-SSE2 targets
+; scalarize into two 64-bit GPR merges (operands arrive in %rdi..%r9, result
+; returned in %rax:%rdx); SSE2 keeps it as andps/andnps/orps and XOP as a
+; single vpcmov.
+; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py -- do not
+; edit them by hand; regenerate instead.
+define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i64:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    andq %r9, %rsi
+; CHECK-BASELINE-NEXT:    andq %r8, %rdi
+; CHECK-BASELINE-NEXT:    notq %r8
+; CHECK-BASELINE-NEXT:    notq %r9
+; CHECK-BASELINE-NEXT:    andq %rcx, %r9
+; CHECK-BASELINE-NEXT:    orq %rsi, %r9
+; CHECK-BASELINE-NEXT:    andq %rdx, %r8
+; CHECK-BASELINE-NEXT:    orq %rdi, %r8
+; CHECK-BASELINE-NEXT:    movq %r8, %rax
+; CHECK-BASELINE-NEXT:    movq %r9, %rdx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v2i64:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    andq %r9, %rsi
+; CHECK-SSE1-NEXT:    andq %r8, %rdi
+; CHECK-SSE1-NEXT:    notq %r8
+; CHECK-SSE1-NEXT:    notq %r9
+; CHECK-SSE1-NEXT:    andq %rcx, %r9
+; CHECK-SSE1-NEXT:    orq %rsi, %r9
+; CHECK-SSE1-NEXT:    andq %rdx, %r8
+; CHECK-SSE1-NEXT:    orq %rdi, %r8
+; CHECK-SSE1-NEXT:    movq %r8, %rax
+; CHECK-SSE1-NEXT:    movq %r9, %rdx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v2i64:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v2i64:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %mx = and <2 x i64> %x, %mask
+  %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
+  %my = and <2 x i64> %y, %notmask
+  %r = or <2 x i64> %mx, %my
+  ret <2 x i64> %r
+}
+
+; ============================================================================ ;
+; 256-bit vector width
+; ============================================================================ ;
+
+define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: out_v32i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rcx, %r15
+; CHECK-BASELINE-NEXT:    movq %rsi, %r14
+; CHECK-BASELINE-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-BASELINE-NEXT:    movb 15(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 16(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 17(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 18(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 19(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 20(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 21(%rcx), %r12b
+; CHECK-BASELINE-NEXT:    movb 22(%rcx), %r10b
+; CHECK-BASELINE-NEXT:    movb 23(%rcx), %r11b
+; CHECK-BASELINE-NEXT:    movb 24(%rcx), %bpl
+; CHECK-BASELINE-NEXT:    movb 25(%rcx), %r13b
+; CHECK-BASELINE-NEXT:    movb 26(%rcx), %r9b
+; CHECK-BASELINE-NEXT:    movb 27(%rcx), %r8b
+; CHECK-BASELINE-NEXT:    movb 28(%rcx), %dil
+; CHECK-BASELINE-NEXT:    movb 29(%rcx), %sil
+; CHECK-BASELINE-NEXT:    movb 30(%rcx), %bl
+; CHECK-BASELINE-NEXT:    movb 31(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb 31(%r14), %cl
+; CHECK-BASELINE-NEXT:    andb %al, %cl
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb 31(%rdx), %al
+; CHECK-BASELINE-NEXT:    orb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 30(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %bl, %al
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    andb 30(%rdx), %bl
+; CHECK-BASELINE-NEXT:    orb %al, %bl
+; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 29(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %sil, %al
+; CHECK-BASELINE-NEXT:    notb %sil
+; CHECK-BASELINE-NEXT:    andb 29(%rdx), %sil
+; CHECK-BASELINE-NEXT:    orb %al, %sil
+; CHECK-BASELINE-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 28(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %dil, %al
+; CHECK-BASELINE-NEXT:    notb %dil
+; CHECK-BASELINE-NEXT:    andb 28(%rdx), %dil
+; CHECK-BASELINE-NEXT:    orb %al, %dil
+; CHECK-BASELINE-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 27(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    notb %r8b
+; CHECK-BASELINE-NEXT:    andb 27(%rdx), %r8b
+; CHECK-BASELINE-NEXT:    orb %al, %r8b
+; CHECK-BASELINE-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 26(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r9b, %al
+; CHECK-BASELINE-NEXT:    notb %r9b
+; CHECK-BASELINE-NEXT:    andb 26(%rdx), %r9b
+; CHECK-BASELINE-NEXT:    orb %al, %r9b
+; CHECK-BASELINE-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 25(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r13b, %al
+; CHECK-BASELINE-NEXT:    notb %r13b
+; CHECK-BASELINE-NEXT:    andb 25(%rdx), %r13b
+; CHECK-BASELINE-NEXT:    orb %al, %r13b
+; CHECK-BASELINE-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 24(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %bpl, %al
+; CHECK-BASELINE-NEXT:    notb %bpl
+; CHECK-BASELINE-NEXT:    andb 24(%rdx), %bpl
+; CHECK-BASELINE-NEXT:    orb %al, %bpl
+; CHECK-BASELINE-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 23(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r11b, %al
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    andb 23(%rdx), %r11b
+; CHECK-BASELINE-NEXT:    orb %al, %r11b
+; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 22(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r10b, %al
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb 22(%rdx), %r10b
+; CHECK-BASELINE-NEXT:    orb %al, %r10b
+; CHECK-BASELINE-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 21(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r12b, %al
+; CHECK-BASELINE-NEXT:    notb %r12b
+; CHECK-BASELINE-NEXT:    andb 21(%rdx), %r12b
+; CHECK-BASELINE-NEXT:    orb %al, %r12b
+; CHECK-BASELINE-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 20(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 20(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 19(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 19(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 18(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 18(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 17(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 17(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 16(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    movq %rdx, %rbx
+; CHECK-BASELINE-NEXT:    andb 16(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 15(%r14), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 15(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 14(%r15), %cl
+; CHECK-BASELINE-NEXT:    movb 14(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 14(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 13(%r15), %cl
+; CHECK-BASELINE-NEXT:    movb 13(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 13(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 12(%r15), %cl
+; CHECK-BASELINE-NEXT:    movb 12(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %cl, %al
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb 12(%rdx), %cl
+; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 11(%r15), %r13b
+; CHECK-BASELINE-NEXT:    movb 11(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r13b, %al
+; CHECK-BASELINE-NEXT:    notb %r13b
+; CHECK-BASELINE-NEXT:    andb 11(%rdx), %r13b
+; CHECK-BASELINE-NEXT:    orb %al, %r13b
+; CHECK-BASELINE-NEXT:    movb 10(%r15), %r12b
+; CHECK-BASELINE-NEXT:    movb 10(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r12b, %al
+; CHECK-BASELINE-NEXT:    notb %r12b
+; CHECK-BASELINE-NEXT:    andb 10(%rdx), %r12b
+; CHECK-BASELINE-NEXT:    orb %al, %r12b
+; CHECK-BASELINE-NEXT:    movb 9(%r15), %bpl
+; CHECK-BASELINE-NEXT:    movb 9(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %bpl, %al
+; CHECK-BASELINE-NEXT:    notb %bpl
+; CHECK-BASELINE-NEXT:    andb 9(%rdx), %bpl
+; CHECK-BASELINE-NEXT:    orb %al, %bpl
+; CHECK-BASELINE-NEXT:    movb 8(%r15), %r11b
+; CHECK-BASELINE-NEXT:    movb 8(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r11b, %al
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    andb 8(%rdx), %r11b
+; CHECK-BASELINE-NEXT:    orb %al, %r11b
+; CHECK-BASELINE-NEXT:    movb 7(%r15), %r10b
+; CHECK-BASELINE-NEXT:    movb 7(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r10b, %al
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb 7(%rdx), %r10b
+; CHECK-BASELINE-NEXT:    orb %al, %r10b
+; CHECK-BASELINE-NEXT:    movb 6(%r15), %r9b
+; CHECK-BASELINE-NEXT:    movb 6(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r9b, %al
+; CHECK-BASELINE-NEXT:    notb %r9b
+; CHECK-BASELINE-NEXT:    andb 6(%rdx), %r9b
+; CHECK-BASELINE-NEXT:    orb %al, %r9b
+; CHECK-BASELINE-NEXT:    movb 5(%r15), %r8b
+; CHECK-BASELINE-NEXT:    movb 5(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    notb %r8b
+; CHECK-BASELINE-NEXT:    andb 5(%rdx), %r8b
+; CHECK-BASELINE-NEXT:    orb %al, %r8b
+; CHECK-BASELINE-NEXT:    movb 4(%r15), %dil
+; CHECK-BASELINE-NEXT:    movb 4(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %dil, %al
+; CHECK-BASELINE-NEXT:    notb %dil
+; CHECK-BASELINE-NEXT:    andb 4(%rdx), %dil
+; CHECK-BASELINE-NEXT:    orb %al, %dil
+; CHECK-BASELINE-NEXT:    movb 3(%r15), %sil
+; CHECK-BASELINE-NEXT:    movb 3(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %sil, %al
+; CHECK-BASELINE-NEXT:    notb %sil
+; CHECK-BASELINE-NEXT:    andb 3(%rdx), %sil
+; CHECK-BASELINE-NEXT:    orb %al, %sil
+; CHECK-BASELINE-NEXT:    movb 2(%r15), %dl
+; CHECK-BASELINE-NEXT:    movb 2(%r14), %al
+; CHECK-BASELINE-NEXT:    andb %dl, %al
+; CHECK-BASELINE-NEXT:    notb %dl
+; CHECK-BASELINE-NEXT:    andb 2(%rbx), %dl
+; CHECK-BASELINE-NEXT:    orb %al, %dl
+; CHECK-BASELINE-NEXT:    movb 1(%r15), %al
+; CHECK-BASELINE-NEXT:    movb 1(%r14), %cl
+; CHECK-BASELINE-NEXT:    andb %al, %cl
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    andb 1(%rbx), %al
+; CHECK-BASELINE-NEXT:    orb %cl, %al
+; CHECK-BASELINE-NEXT:    movb (%r15), %r15b
+; CHECK-BASELINE-NEXT:    movb (%r14), %r14b
+; CHECK-BASELINE-NEXT:    andb %r15b, %r14b
+; CHECK-BASELINE-NEXT:    notb %r15b
+; CHECK-BASELINE-NEXT:    andb (%rbx), %r15b
+; CHECK-BASELINE-NEXT:    orb %r14b, %r15b
+; CHECK-BASELINE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 31(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 30(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 29(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 28(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 27(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 26(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 25(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 24(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 23(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 22(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 21(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 20(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 19(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 18(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 17(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 16(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 15(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 14(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 13(%rcx)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 12(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r13b, 11(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r12b, 10(%rcx)
+; CHECK-BASELINE-NEXT:    movb %bpl, 9(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r11b, 8(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r10b, 7(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r9b, 6(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r8b, 5(%rcx)
+; CHECK-BASELINE-NEXT:    movb %dil, 4(%rcx)
+; CHECK-BASELINE-NEXT:    movb %sil, 3(%rcx)
+; CHECK-BASELINE-NEXT:    movb %dl, 2(%rcx)
+; CHECK-BASELINE-NEXT:    movb %al, 1(%rcx)
+; CHECK-BASELINE-NEXT:    movb %r15b, (%rcx)
+; CHECK-BASELINE-NEXT:    movq %rcx, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v32i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rcx, %r15
+; CHECK-SSE1-NEXT:    movq %rsi, %r14
+; CHECK-SSE1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE1-NEXT:    movb 15(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 16(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 17(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 18(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 19(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 20(%rcx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 21(%rcx), %r12b
+; CHECK-SSE1-NEXT:    movb 22(%rcx), %r10b
+; CHECK-SSE1-NEXT:    movb 23(%rcx), %r11b
+; CHECK-SSE1-NEXT:    movb 24(%rcx), %bpl
+; CHECK-SSE1-NEXT:    movb 25(%rcx), %r13b
+; CHECK-SSE1-NEXT:    movb 26(%rcx), %r9b
+; CHECK-SSE1-NEXT:    movb 27(%rcx), %r8b
+; CHECK-SSE1-NEXT:    movb 28(%rcx), %dil
+; CHECK-SSE1-NEXT:    movb 29(%rcx), %sil
+; CHECK-SSE1-NEXT:    movb 30(%rcx), %bl
+; CHECK-SSE1-NEXT:    movb 31(%rcx), %al
+; CHECK-SSE1-NEXT:    movb 31(%r14), %cl
+; CHECK-SSE1-NEXT:    andb %al, %cl
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb 31(%rdx), %al
+; CHECK-SSE1-NEXT:    orb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 30(%r14), %al
+; CHECK-SSE1-NEXT:    andb %bl, %al
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    andb 30(%rdx), %bl
+; CHECK-SSE1-NEXT:    orb %al, %bl
+; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 29(%r14), %al
+; CHECK-SSE1-NEXT:    andb %sil, %al
+; CHECK-SSE1-NEXT:    notb %sil
+; CHECK-SSE1-NEXT:    andb 29(%rdx), %sil
+; CHECK-SSE1-NEXT:    orb %al, %sil
+; CHECK-SSE1-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 28(%r14), %al
+; CHECK-SSE1-NEXT:    andb %dil, %al
+; CHECK-SSE1-NEXT:    notb %dil
+; CHECK-SSE1-NEXT:    andb 28(%rdx), %dil
+; CHECK-SSE1-NEXT:    orb %al, %dil
+; CHECK-SSE1-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 27(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    notb %r8b
+; CHECK-SSE1-NEXT:    andb 27(%rdx), %r8b
+; CHECK-SSE1-NEXT:    orb %al, %r8b
+; CHECK-SSE1-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 26(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r9b, %al
+; CHECK-SSE1-NEXT:    notb %r9b
+; CHECK-SSE1-NEXT:    andb 26(%rdx), %r9b
+; CHECK-SSE1-NEXT:    orb %al, %r9b
+; CHECK-SSE1-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 25(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r13b, %al
+; CHECK-SSE1-NEXT:    notb %r13b
+; CHECK-SSE1-NEXT:    andb 25(%rdx), %r13b
+; CHECK-SSE1-NEXT:    orb %al, %r13b
+; CHECK-SSE1-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 24(%r14), %al
+; CHECK-SSE1-NEXT:    andb %bpl, %al
+; CHECK-SSE1-NEXT:    notb %bpl
+; CHECK-SSE1-NEXT:    andb 24(%rdx), %bpl
+; CHECK-SSE1-NEXT:    orb %al, %bpl
+; CHECK-SSE1-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 23(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r11b, %al
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    andb 23(%rdx), %r11b
+; CHECK-SSE1-NEXT:    orb %al, %r11b
+; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 22(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r10b, %al
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb 22(%rdx), %r10b
+; CHECK-SSE1-NEXT:    orb %al, %r10b
+; CHECK-SSE1-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 21(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r12b, %al
+; CHECK-SSE1-NEXT:    notb %r12b
+; CHECK-SSE1-NEXT:    andb 21(%rdx), %r12b
+; CHECK-SSE1-NEXT:    orb %al, %r12b
+; CHECK-SSE1-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 20(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 20(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 19(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 19(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 18(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 18(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 17(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 17(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 16(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    movq %rdx, %rbx
+; CHECK-SSE1-NEXT:    andb 16(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 15(%r14), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 15(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 14(%r15), %cl
+; CHECK-SSE1-NEXT:    movb 14(%r14), %al
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 14(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 13(%r15), %cl
+; CHECK-SSE1-NEXT:    movb 13(%r14), %al
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 13(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 12(%r15), %cl
+; CHECK-SSE1-NEXT:    movb 12(%r14), %al
+; CHECK-SSE1-NEXT:    andb %cl, %al
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb 12(%rdx), %cl
+; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 11(%r15), %r13b
+; CHECK-SSE1-NEXT:    movb 11(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r13b, %al
+; CHECK-SSE1-NEXT:    notb %r13b
+; CHECK-SSE1-NEXT:    andb 11(%rdx), %r13b
+; CHECK-SSE1-NEXT:    orb %al, %r13b
+; CHECK-SSE1-NEXT:    movb 10(%r15), %r12b
+; CHECK-SSE1-NEXT:    movb 10(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r12b, %al
+; CHECK-SSE1-NEXT:    notb %r12b
+; CHECK-SSE1-NEXT:    andb 10(%rdx), %r12b
+; CHECK-SSE1-NEXT:    orb %al, %r12b
+; CHECK-SSE1-NEXT:    movb 9(%r15), %bpl
+; CHECK-SSE1-NEXT:    movb 9(%r14), %al
+; CHECK-SSE1-NEXT:    andb %bpl, %al
+; CHECK-SSE1-NEXT:    notb %bpl
+; CHECK-SSE1-NEXT:    andb 9(%rdx), %bpl
+; CHECK-SSE1-NEXT:    orb %al, %bpl
+; CHECK-SSE1-NEXT:    movb 8(%r15), %r11b
+; CHECK-SSE1-NEXT:    movb 8(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r11b, %al
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    andb 8(%rdx), %r11b
+; CHECK-SSE1-NEXT:    orb %al, %r11b
+; CHECK-SSE1-NEXT:    movb 7(%r15), %r10b
+; CHECK-SSE1-NEXT:    movb 7(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r10b, %al
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb 7(%rdx), %r10b
+; CHECK-SSE1-NEXT:    orb %al, %r10b
+; CHECK-SSE1-NEXT:    movb 6(%r15), %r9b
+; CHECK-SSE1-NEXT:    movb 6(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r9b, %al
+; CHECK-SSE1-NEXT:    notb %r9b
+; CHECK-SSE1-NEXT:    andb 6(%rdx), %r9b
+; CHECK-SSE1-NEXT:    orb %al, %r9b
+; CHECK-SSE1-NEXT:    movb 5(%r15), %r8b
+; CHECK-SSE1-NEXT:    movb 5(%r14), %al
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    notb %r8b
+; CHECK-SSE1-NEXT:    andb 5(%rdx), %r8b
+; CHECK-SSE1-NEXT:    orb %al, %r8b
+; CHECK-SSE1-NEXT:    movb 4(%r15), %dil
+; CHECK-SSE1-NEXT:    movb 4(%r14), %al
+; CHECK-SSE1-NEXT:    andb %dil, %al
+; CHECK-SSE1-NEXT:    notb %dil
+; CHECK-SSE1-NEXT:    andb 4(%rdx), %dil
+; CHECK-SSE1-NEXT:    orb %al, %dil
+; CHECK-SSE1-NEXT:    movb 3(%r15), %sil
+; CHECK-SSE1-NEXT:    movb 3(%r14), %al
+; CHECK-SSE1-NEXT:    andb %sil, %al
+; CHECK-SSE1-NEXT:    notb %sil
+; CHECK-SSE1-NEXT:    andb 3(%rdx), %sil
+; CHECK-SSE1-NEXT:    orb %al, %sil
+; CHECK-SSE1-NEXT:    movb 2(%r15), %dl
+; CHECK-SSE1-NEXT:    movb 2(%r14), %al
+; CHECK-SSE1-NEXT:    andb %dl, %al
+; CHECK-SSE1-NEXT:    notb %dl
+; CHECK-SSE1-NEXT:    andb 2(%rbx), %dl
+; CHECK-SSE1-NEXT:    orb %al, %dl
+; CHECK-SSE1-NEXT:    movb 1(%r15), %al
+; CHECK-SSE1-NEXT:    movb 1(%r14), %cl
+; CHECK-SSE1-NEXT:    andb %al, %cl
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    andb 1(%rbx), %al
+; CHECK-SSE1-NEXT:    orb %cl, %al
+; CHECK-SSE1-NEXT:    movb (%r15), %r15b
+; CHECK-SSE1-NEXT:    movb (%r14), %r14b
+; CHECK-SSE1-NEXT:    andb %r15b, %r14b
+; CHECK-SSE1-NEXT:    notb %r15b
+; CHECK-SSE1-NEXT:    andb (%rbx), %r15b
+; CHECK-SSE1-NEXT:    orb %r14b, %r15b
+; CHECK-SSE1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 31(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 30(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 29(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 28(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 27(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 26(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 25(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 24(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 23(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 22(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 21(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 20(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 19(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 18(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 17(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 16(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 15(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 14(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 13(%rcx)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 12(%rcx)
+; CHECK-SSE1-NEXT:    movb %r13b, 11(%rcx)
+; CHECK-SSE1-NEXT:    movb %r12b, 10(%rcx)
+; CHECK-SSE1-NEXT:    movb %bpl, 9(%rcx)
+; CHECK-SSE1-NEXT:    movb %r11b, 8(%rcx)
+; CHECK-SSE1-NEXT:    movb %r10b, 7(%rcx)
+; CHECK-SSE1-NEXT:    movb %r9b, 6(%rcx)
+; CHECK-SSE1-NEXT:    movb %r8b, 5(%rcx)
+; CHECK-SSE1-NEXT:    movb %dil, 4(%rcx)
+; CHECK-SSE1-NEXT:    movb %sil, 3(%rcx)
+; CHECK-SSE1-NEXT:    movb %dl, 2(%rcx)
+; CHECK-SSE1-NEXT:    movb %al, 1(%rcx)
+; CHECK-SSE1-NEXT:    movb %r15b, (%rcx)
+; CHECK-SSE1-NEXT:    movq %rcx, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v32i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v32i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8> *%px, align 32
+  %y = load <32 x i8>, <32 x i8> *%py, align 32
+  %mask = load <32 x i8>, <32 x i8> *%pmask, align 32
+  %mx = and <32 x i8> %x, %mask
+  %notmask = xor <32 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <32 x i8> %y, %notmask
+  %r = or <32 x i8> %mx, %my
+  ret <32 x i8> %r
+}
+
+; out_v16i16: vector masked merge, r = (x & mask) | (y & ~mask), with all three
+; <16 x i16> operands loaded from memory. The "out" form keeps the NOT folded
+; into the mask operand. Expected codegen per prefix (autogenerated checks):
+;   CHECK-BASELINE / CHECK-SSE1: fully scalarized per-element andw/notl/andw/orl
+;   CHECK-SSE2: two 128-bit halves via andps + andnps + orps
+;   CHECK-XOP:  a single vpcmov bitwise-select
define <16 x i16> @out_v16i16(<16 x i16> *%px, <16 x i16> *%py, <16 x i16> *%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v16i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rcx, %r9
; CHECK-BASELINE-NEXT:    movq %rdx, %r10
; CHECK-BASELINE-NEXT:    movq %rsi, %r8
; CHECK-BASELINE-NEXT:    movq %rdi, %r11
; CHECK-BASELINE-NEXT:    movl 12(%rcx), %eax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 14(%rcx), %edx
; CHECK-BASELINE-NEXT:    movl 16(%rcx), %esi
; CHECK-BASELINE-NEXT:    movzwl 18(%rcx), %edi
; CHECK-BASELINE-NEXT:    movl 20(%rcx), %ecx
; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %r15d
; CHECK-BASELINE-NEXT:    movl 24(%r9), %r12d
; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %r14d
; CHECK-BASELINE-NEXT:    movl 28(%r9), %ebx
; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %ebp
; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %r13d
; CHECK-BASELINE-NEXT:    andw %bp, %r13w
; CHECK-BASELINE-NEXT:    notl %ebp
; CHECK-BASELINE-NEXT:    andw 30(%r10), %bp
; CHECK-BASELINE-NEXT:    orl %r13d, %ebp
; CHECK-BASELINE-NEXT:    movzwl 28(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %bx, %ax
; CHECK-BASELINE-NEXT:    notl %ebx
; CHECK-BASELINE-NEXT:    andw 28(%r10), %bx
; CHECK-BASELINE-NEXT:    orl %eax, %ebx
; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %r14w, %ax
; CHECK-BASELINE-NEXT:    notl %r14d
; CHECK-BASELINE-NEXT:    andw 26(%r10), %r14w
; CHECK-BASELINE-NEXT:    orl %eax, %r14d
; CHECK-BASELINE-NEXT:    movzwl 24(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %r12w, %ax
; CHECK-BASELINE-NEXT:    notl %r12d
; CHECK-BASELINE-NEXT:    andw 24(%r10), %r12w
; CHECK-BASELINE-NEXT:    orl %eax, %r12d
; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %r15w, %ax
; CHECK-BASELINE-NEXT:    notl %r15d
; CHECK-BASELINE-NEXT:    andw 22(%r10), %r15w
; CHECK-BASELINE-NEXT:    orl %eax, %r15d
; CHECK-BASELINE-NEXT:    movzwl 20(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %cx, %ax
; CHECK-BASELINE-NEXT:    notl %ecx
; CHECK-BASELINE-NEXT:    andw 20(%r10), %cx
; CHECK-BASELINE-NEXT:    orl %eax, %ecx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 18(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %di, %ax
; CHECK-BASELINE-NEXT:    notl %edi
; CHECK-BASELINE-NEXT:    andw 18(%r10), %di
; CHECK-BASELINE-NEXT:    orl %eax, %edi
; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 16(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %si, %ax
; CHECK-BASELINE-NEXT:    notl %esi
; CHECK-BASELINE-NEXT:    andw 16(%r10), %si
; CHECK-BASELINE-NEXT:    orl %eax, %esi
; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 14(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %dx, %ax
; CHECK-BASELINE-NEXT:    notl %edx
; CHECK-BASELINE-NEXT:    andw 14(%r10), %dx
; CHECK-BASELINE-NEXT:    orl %eax, %edx
; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 12(%r8), %eax
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    andw %cx, %ax
; CHECK-BASELINE-NEXT:    notl %ecx
; CHECK-BASELINE-NEXT:    andw 12(%r10), %cx
; CHECK-BASELINE-NEXT:    orl %eax, %ecx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %r13d
; CHECK-BASELINE-NEXT:    movzwl 10(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %r13w, %ax
; CHECK-BASELINE-NEXT:    notl %r13d
; CHECK-BASELINE-NEXT:    andw 10(%r10), %r13w
; CHECK-BASELINE-NEXT:    orl %eax, %r13d
; CHECK-BASELINE-NEXT:    movl 8(%r9), %edi
; CHECK-BASELINE-NEXT:    movzwl 8(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %di, %ax
; CHECK-BASELINE-NEXT:    notl %edi
; CHECK-BASELINE-NEXT:    andw 8(%r10), %di
; CHECK-BASELINE-NEXT:    orl %eax, %edi
; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %esi
; CHECK-BASELINE-NEXT:    movzwl 6(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %si, %ax
; CHECK-BASELINE-NEXT:    notl %esi
; CHECK-BASELINE-NEXT:    andw 6(%r10), %si
; CHECK-BASELINE-NEXT:    orl %eax, %esi
; CHECK-BASELINE-NEXT:    movl 4(%r9), %edx
; CHECK-BASELINE-NEXT:    movzwl 4(%r8), %eax
; CHECK-BASELINE-NEXT:    andw %dx, %ax
; CHECK-BASELINE-NEXT:    notl %edx
; CHECK-BASELINE-NEXT:    andw 4(%r10), %dx
; CHECK-BASELINE-NEXT:    orl %eax, %edx
; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %eax
; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %ecx
; CHECK-BASELINE-NEXT:    andw %ax, %cx
; CHECK-BASELINE-NEXT:    notl %eax
; CHECK-BASELINE-NEXT:    andw 2(%r10), %ax
; CHECK-BASELINE-NEXT:    orl %ecx, %eax
; CHECK-BASELINE-NEXT:    movl (%r9), %r9d
; CHECK-BASELINE-NEXT:    movzwl (%r8), %ecx
; CHECK-BASELINE-NEXT:    andw %r9w, %cx
; CHECK-BASELINE-NEXT:    notl %r9d
; CHECK-BASELINE-NEXT:    andw (%r10), %r9w
; CHECK-BASELINE-NEXT:    orl %ecx, %r9d
; CHECK-BASELINE-NEXT:    movw %bp, 30(%r11)
; CHECK-BASELINE-NEXT:    movw %bx, 28(%r11)
; CHECK-BASELINE-NEXT:    movw %r14w, 26(%r11)
; CHECK-BASELINE-NEXT:    movw %r12w, 24(%r11)
; CHECK-BASELINE-NEXT:    movw %r15w, 22(%r11)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %cx, 20(%r11)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %cx, 18(%r11)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %cx, 16(%r11)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %cx, 14(%r11)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %cx, 12(%r11)
; CHECK-BASELINE-NEXT:    movw %r13w, 10(%r11)
; CHECK-BASELINE-NEXT:    movw %di, 8(%r11)
; CHECK-BASELINE-NEXT:    movw %si, 6(%r11)
; CHECK-BASELINE-NEXT:    movw %dx, 4(%r11)
; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
; CHECK-BASELINE-NEXT:    movw %r9w, (%r11)
; CHECK-BASELINE-NEXT:    movq %r11, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v16i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rcx, %r9
; CHECK-SSE1-NEXT:    movq %rdx, %r10
; CHECK-SSE1-NEXT:    movq %rsi, %r8
; CHECK-SSE1-NEXT:    movq %rdi, %r11
; CHECK-SSE1-NEXT:    movl 12(%rcx), %eax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 14(%rcx), %edx
; CHECK-SSE1-NEXT:    movl 16(%rcx), %esi
; CHECK-SSE1-NEXT:    movzwl 18(%rcx), %edi
; CHECK-SSE1-NEXT:    movl 20(%rcx), %ecx
; CHECK-SSE1-NEXT:    movzwl 22(%r9), %r15d
; CHECK-SSE1-NEXT:    movl 24(%r9), %r12d
; CHECK-SSE1-NEXT:    movzwl 26(%r9), %r14d
; CHECK-SSE1-NEXT:    movl 28(%r9), %ebx
; CHECK-SSE1-NEXT:    movzwl 30(%r9), %ebp
; CHECK-SSE1-NEXT:    movzwl 30(%r8), %r13d
; CHECK-SSE1-NEXT:    andw %bp, %r13w
; CHECK-SSE1-NEXT:    notl %ebp
; CHECK-SSE1-NEXT:    andw 30(%r10), %bp
; CHECK-SSE1-NEXT:    orl %r13d, %ebp
; CHECK-SSE1-NEXT:    movzwl 28(%r8), %eax
; CHECK-SSE1-NEXT:    andw %bx, %ax
; CHECK-SSE1-NEXT:    notl %ebx
; CHECK-SSE1-NEXT:    andw 28(%r10), %bx
; CHECK-SSE1-NEXT:    orl %eax, %ebx
; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
; CHECK-SSE1-NEXT:    andw %r14w, %ax
; CHECK-SSE1-NEXT:    notl %r14d
; CHECK-SSE1-NEXT:    andw 26(%r10), %r14w
; CHECK-SSE1-NEXT:    orl %eax, %r14d
; CHECK-SSE1-NEXT:    movzwl 24(%r8), %eax
; CHECK-SSE1-NEXT:    andw %r12w, %ax
; CHECK-SSE1-NEXT:    notl %r12d
; CHECK-SSE1-NEXT:    andw 24(%r10), %r12w
; CHECK-SSE1-NEXT:    orl %eax, %r12d
; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
; CHECK-SSE1-NEXT:    andw %r15w, %ax
; CHECK-SSE1-NEXT:    notl %r15d
; CHECK-SSE1-NEXT:    andw 22(%r10), %r15w
; CHECK-SSE1-NEXT:    orl %eax, %r15d
; CHECK-SSE1-NEXT:    movzwl 20(%r8), %eax
; CHECK-SSE1-NEXT:    andw %cx, %ax
; CHECK-SSE1-NEXT:    notl %ecx
; CHECK-SSE1-NEXT:    andw 20(%r10), %cx
; CHECK-SSE1-NEXT:    orl %eax, %ecx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 18(%r8), %eax
; CHECK-SSE1-NEXT:    andw %di, %ax
; CHECK-SSE1-NEXT:    notl %edi
; CHECK-SSE1-NEXT:    andw 18(%r10), %di
; CHECK-SSE1-NEXT:    orl %eax, %edi
; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 16(%r8), %eax
; CHECK-SSE1-NEXT:    andw %si, %ax
; CHECK-SSE1-NEXT:    notl %esi
; CHECK-SSE1-NEXT:    andw 16(%r10), %si
; CHECK-SSE1-NEXT:    orl %eax, %esi
; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 14(%r8), %eax
; CHECK-SSE1-NEXT:    andw %dx, %ax
; CHECK-SSE1-NEXT:    notl %edx
; CHECK-SSE1-NEXT:    andw 14(%r10), %dx
; CHECK-SSE1-NEXT:    orl %eax, %edx
; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 12(%r8), %eax
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    andw %cx, %ax
; CHECK-SSE1-NEXT:    notl %ecx
; CHECK-SSE1-NEXT:    andw 12(%r10), %cx
; CHECK-SSE1-NEXT:    orl %eax, %ecx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 10(%r9), %r13d
; CHECK-SSE1-NEXT:    movzwl 10(%r8), %eax
; CHECK-SSE1-NEXT:    andw %r13w, %ax
; CHECK-SSE1-NEXT:    notl %r13d
; CHECK-SSE1-NEXT:    andw 10(%r10), %r13w
; CHECK-SSE1-NEXT:    orl %eax, %r13d
; CHECK-SSE1-NEXT:    movl 8(%r9), %edi
; CHECK-SSE1-NEXT:    movzwl 8(%r8), %eax
; CHECK-SSE1-NEXT:    andw %di, %ax
; CHECK-SSE1-NEXT:    notl %edi
; CHECK-SSE1-NEXT:    andw 8(%r10), %di
; CHECK-SSE1-NEXT:    orl %eax, %edi
; CHECK-SSE1-NEXT:    movzwl 6(%r9), %esi
; CHECK-SSE1-NEXT:    movzwl 6(%r8), %eax
; CHECK-SSE1-NEXT:    andw %si, %ax
; CHECK-SSE1-NEXT:    notl %esi
; CHECK-SSE1-NEXT:    andw 6(%r10), %si
; CHECK-SSE1-NEXT:    orl %eax, %esi
; CHECK-SSE1-NEXT:    movl 4(%r9), %edx
; CHECK-SSE1-NEXT:    movzwl 4(%r8), %eax
; CHECK-SSE1-NEXT:    andw %dx, %ax
; CHECK-SSE1-NEXT:    notl %edx
; CHECK-SSE1-NEXT:    andw 4(%r10), %dx
; CHECK-SSE1-NEXT:    orl %eax, %edx
; CHECK-SSE1-NEXT:    movzwl 2(%r9), %eax
; CHECK-SSE1-NEXT:    movzwl 2(%r8), %ecx
; CHECK-SSE1-NEXT:    andw %ax, %cx
; CHECK-SSE1-NEXT:    notl %eax
; CHECK-SSE1-NEXT:    andw 2(%r10), %ax
; CHECK-SSE1-NEXT:    orl %ecx, %eax
; CHECK-SSE1-NEXT:    movl (%r9), %r9d
; CHECK-SSE1-NEXT:    movzwl (%r8), %ecx
; CHECK-SSE1-NEXT:    andw %r9w, %cx
; CHECK-SSE1-NEXT:    notl %r9d
; CHECK-SSE1-NEXT:    andw (%r10), %r9w
; CHECK-SSE1-NEXT:    orl %ecx, %r9d
; CHECK-SSE1-NEXT:    movw %bp, 30(%r11)
; CHECK-SSE1-NEXT:    movw %bx, 28(%r11)
; CHECK-SSE1-NEXT:    movw %r14w, 26(%r11)
; CHECK-SSE1-NEXT:    movw %r12w, 24(%r11)
; CHECK-SSE1-NEXT:    movw %r15w, 22(%r11)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %cx, 20(%r11)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %cx, 18(%r11)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %cx, 16(%r11)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %cx, 14(%r11)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %cx, 12(%r11)
; CHECK-SSE1-NEXT:    movw %r13w, 10(%r11)
; CHECK-SSE1-NEXT:    movw %di, 8(%r11)
; CHECK-SSE1-NEXT:    movw %si, 6(%r11)
; CHECK-SSE1-NEXT:    movw %dx, 4(%r11)
; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
; CHECK-SSE1-NEXT:    movw %r9w, (%r11)
; CHECK-SSE1-NEXT:    movq %r11, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v16i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v16i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
; IR under test: a bitwise select — each result lane takes %x where the
; corresponding %mask bit is 1 and %y where it is 0. The NOT stays applied to
; %mask (the "out" pattern this file contrasts with the "in" pattern).
  %x = load <16 x i16>, <16 x i16> *%px, align 32
  %y = load <16 x i16>, <16 x i16> *%py, align 32
  %mask = load <16 x i16>, <16 x i16> *%pmask, align 32
  %mx = and <16 x i16> %x, %mask
  %notmask = xor <16 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %my = and <16 x i16> %y, %notmask
  %r = or <16 x i16> %mx, %my
  ret <16 x i16> %r
}
+
+define <8 x i32> @out_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: out_v8i32:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r8d
+; CHECK-BASELINE-NEXT:    movl 8(%rcx), %r9d
+; CHECK-BASELINE-NEXT:    movl 12(%rcx), %r10d
+; CHECK-BASELINE-NEXT:    movl 16(%rcx), %r11d
+; CHECK-BASELINE-NEXT:    movl 20(%rcx), %r15d
+; CHECK-BASELINE-NEXT:    movl 24(%rcx), %ebx
+; CHECK-BASELINE-NEXT:    movl 28(%rcx), %ebp
+; CHECK-BASELINE-NEXT:    movl 28(%rsi), %r14d
+; CHECK-BASELINE-NEXT:    andl %ebp, %r14d
+; CHECK-BASELINE-NEXT:    notl %ebp
+; CHECK-BASELINE-NEXT:    andl 28(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    orl %r14d, %ebp
+; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %ebx, %eax
+; CHECK-BASELINE-NEXT:    notl %ebx
+; CHECK-BASELINE-NEXT:    andl 24(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    orl %eax, %ebx
+; CHECK-BASELINE-NEXT:    movl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %r15d, %eax
+; CHECK-BASELINE-NEXT:    notl %r15d
+; CHECK-BASELINE-NEXT:    andl 20(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    orl %eax, %r15d
+; CHECK-BASELINE-NEXT:    movl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %r11d, %eax
+; CHECK-BASELINE-NEXT:    notl %r11d
+; CHECK-BASELINE-NEXT:    andl 16(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    orl %eax, %r11d
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %r10d, %eax
+; CHECK-BASELINE-NEXT:    notl %r10d
+; CHECK-BASELINE-NEXT:    andl 12(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    orl %eax, %r10d
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %r9d, %eax
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    andl 8(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    orl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    orl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movl (%rcx), %eax
+; CHECK-BASELINE-NEXT:    movl (%rsi), %ecx
+; CHECK-BASELINE-NEXT:    andl %eax, %ecx
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    andl (%rdx), %eax
+; CHECK-BASELINE-NEXT:    orl %ecx, %eax
+; CHECK-BASELINE-NEXT:    movl %ebp, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movl %ebx, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r15d, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r11d, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r10d, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r9d, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r8d, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl %eax, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v8i32:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl 4(%rcx), %r8d
+; CHECK-SSE1-NEXT:    movl 8(%rcx), %r9d
+; CHECK-SSE1-NEXT:    movl 12(%rcx), %r10d
+; CHECK-SSE1-NEXT:    movl 16(%rcx), %r11d
+; CHECK-SSE1-NEXT:    movl 20(%rcx), %r15d
+; CHECK-SSE1-NEXT:    movl 24(%rcx), %ebx
+; CHECK-SSE1-NEXT:    movl 28(%rcx), %ebp
+; CHECK-SSE1-NEXT:    movl 28(%rsi), %r14d
+; CHECK-SSE1-NEXT:    andl %ebp, %r14d
+; CHECK-SSE1-NEXT:    notl %ebp
+; CHECK-SSE1-NEXT:    andl 28(%rdx), %ebp
+; CHECK-SSE1-NEXT:    orl %r14d, %ebp
+; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %ebx, %eax
+; CHECK-SSE1-NEXT:    notl %ebx
+; CHECK-SSE1-NEXT:    andl 24(%rdx), %ebx
+; CHECK-SSE1-NEXT:    orl %eax, %ebx
+; CHECK-SSE1-NEXT:    movl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %r15d, %eax
+; CHECK-SSE1-NEXT:    notl %r15d
+; CHECK-SSE1-NEXT:    andl 20(%rdx), %r15d
+; CHECK-SSE1-NEXT:    orl %eax, %r15d
+; CHECK-SSE1-NEXT:    movl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %r11d, %eax
+; CHECK-SSE1-NEXT:    notl %r11d
+; CHECK-SSE1-NEXT:    andl 16(%rdx), %r11d
+; CHECK-SSE1-NEXT:    orl %eax, %r11d
+; CHECK-SSE1-NEXT:    movl 12(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %r10d, %eax
+; CHECK-SSE1-NEXT:    notl %r10d
+; CHECK-SSE1-NEXT:    andl 12(%rdx), %r10d
+; CHECK-SSE1-NEXT:    orl %eax, %r10d
+; CHECK-SSE1-NEXT:    movl 8(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %r9d, %eax
+; CHECK-SSE1-NEXT:    notl %r9d
+; CHECK-SSE1-NEXT:    andl 8(%rdx), %r9d
+; CHECK-SSE1-NEXT:    orl %eax, %r9d
+; CHECK-SSE1-NEXT:    movl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    notl %r8d
+; CHECK-SSE1-NEXT:    andl 4(%rdx), %r8d
+; CHECK-SSE1-NEXT:    orl %eax, %r8d
+; CHECK-SSE1-NEXT:    movl (%rcx), %eax
+; CHECK-SSE1-NEXT:    movl (%rsi), %ecx
+; CHECK-SSE1-NEXT:    andl %eax, %ecx
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    andl (%rdx), %eax
+; CHECK-SSE1-NEXT:    orl %ecx, %eax
+; CHECK-SSE1-NEXT:    movl %ebp, 28(%rdi)
+; CHECK-SSE1-NEXT:    movl %ebx, 24(%rdi)
+; CHECK-SSE1-NEXT:    movl %r15d, 20(%rdi)
+; CHECK-SSE1-NEXT:    movl %r11d, 16(%rdi)
+; CHECK-SSE1-NEXT:    movl %r10d, 12(%rdi)
+; CHECK-SSE1-NEXT:    movl %r9d, 8(%rdi)
+; CHECK-SSE1-NEXT:    movl %r8d, 4(%rdi)
+; CHECK-SSE1-NEXT:    movl %eax, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v8i32:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v8i32:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <8 x i32>, <8 x i32> *%px, align 32
+  %y = load <8 x i32>, <8 x i32> *%py, align 32
+  %mask = load <8 x i32>, <8 x i32> *%pmask, align 32
+  %mx = and <8 x i32> %x, %mask
+  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <8 x i32> %y, %notmask
+  %r = or <8 x i32> %mx, %my
+  ret <8 x i32> %r
+}
+
+define <4 x i64> @out_v4i64(<4 x i64> *%px, <4 x i64> *%py, <4 x i64> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i64:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq (%rcx), %r8
+; CHECK-BASELINE-NEXT:    movq 8(%rcx), %r9
+; CHECK-BASELINE-NEXT:    movq 16(%rcx), %rax
+; CHECK-BASELINE-NEXT:    movq 24(%rcx), %rcx
+; CHECK-BASELINE-NEXT:    movq 24(%rsi), %r10
+; CHECK-BASELINE-NEXT:    andq %rcx, %r10
+; CHECK-BASELINE-NEXT:    movq 16(%rsi), %r11
+; CHECK-BASELINE-NEXT:    andq %rax, %r11
+; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rbx
+; CHECK-BASELINE-NEXT:    andq %r9, %rbx
+; CHECK-BASELINE-NEXT:    movq (%rsi), %rsi
+; CHECK-BASELINE-NEXT:    andq %r8, %rsi
+; CHECK-BASELINE-NEXT:    notq %r8
+; CHECK-BASELINE-NEXT:    notq %r9
+; CHECK-BASELINE-NEXT:    notq %rax
+; CHECK-BASELINE-NEXT:    notq %rcx
+; CHECK-BASELINE-NEXT:    andq 24(%rdx), %rcx
+; CHECK-BASELINE-NEXT:    orq %r10, %rcx
+; CHECK-BASELINE-NEXT:    andq 16(%rdx), %rax
+; CHECK-BASELINE-NEXT:    orq %r11, %rax
+; CHECK-BASELINE-NEXT:    andq 8(%rdx), %r9
+; CHECK-BASELINE-NEXT:    orq %rbx, %r9
+; CHECK-BASELINE-NEXT:    andq (%rdx), %r8
+; CHECK-BASELINE-NEXT:    orq %rsi, %r8
+; CHECK-BASELINE-NEXT:    movq %rcx, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rax, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movq %r9, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movq %r8, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: out_v4i64:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq (%rcx), %r8
+; CHECK-SSE1-NEXT:    movq 8(%rcx), %r9
+; CHECK-SSE1-NEXT:    movq 16(%rcx), %rax
+; CHECK-SSE1-NEXT:    movq 24(%rcx), %rcx
+; CHECK-SSE1-NEXT:    movq 24(%rsi), %r10
+; CHECK-SSE1-NEXT:    andq %rcx, %r10
+; CHECK-SSE1-NEXT:    movq 16(%rsi), %r11
+; CHECK-SSE1-NEXT:    andq %rax, %r11
+; CHECK-SSE1-NEXT:    movq 8(%rsi), %rbx
+; CHECK-SSE1-NEXT:    andq %r9, %rbx
+; CHECK-SSE1-NEXT:    movq (%rsi), %rsi
+; CHECK-SSE1-NEXT:    andq %r8, %rsi
+; CHECK-SSE1-NEXT:    notq %r8
+; CHECK-SSE1-NEXT:    notq %r9
+; CHECK-SSE1-NEXT:    notq %rax
+; CHECK-SSE1-NEXT:    notq %rcx
+; CHECK-SSE1-NEXT:    andq 24(%rdx), %rcx
+; CHECK-SSE1-NEXT:    orq %r10, %rcx
+; CHECK-SSE1-NEXT:    andq 16(%rdx), %rax
+; CHECK-SSE1-NEXT:    orq %r11, %rax
+; CHECK-SSE1-NEXT:    andq 8(%rdx), %r9
+; CHECK-SSE1-NEXT:    orq %rbx, %r9
+; CHECK-SSE1-NEXT:    andq (%rdx), %r8
+; CHECK-SSE1-NEXT:    orq %rsi, %r8
+; CHECK-SSE1-NEXT:    movq %rcx, 24(%rdi)
+; CHECK-SSE1-NEXT:    movq %rax, 16(%rdi)
+; CHECK-SSE1-NEXT:    movq %r9, 8(%rdi)
+; CHECK-SSE1-NEXT:    movq %r8, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: out_v4i64:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
+; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
+; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
+; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: out_v4i64:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <4 x i64>, <4 x i64> *%px, align 32
+  %y = load <4 x i64>, <4 x i64> *%py, align 32
+  %mask = load <4 x i64>, <4 x i64> *%pmask, align 32
+  %mx = and <4 x i64> %x, %mask
+  %notmask = xor <4 x i64> %mask, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %my = and <4 x i64> %y, %notmask
+  %r = or <4 x i64> %mx, %my
+  ret <4 x i64> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Should be the same as the previous one.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v1i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %n0 = xor <1 x i8> %x, %y
+  %n1 = and <1 x i8> %n0, %mask
+  %r = xor <1 x i8> %n1, %y
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v2i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v2i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v2i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v2i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <2 x i8> %x, %y
+  %n1 = and <2 x i8> %n0, %mask
+  %r = xor <2 x i8> %n1, %y
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %n0 = xor <1 x i16> %x, %y
+  %n1 = and <1 x i16> %n0, %mask
+  %r = xor <1 x i16> %n1, %y
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v4i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    xorb %r11b, %cl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    xorb %r11b, %cl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v4i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    xorb %r11b, %cl
+; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    xorb %r11b, %cl
+; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v4i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v4i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <4 x i8> %x, %y
+  %n1 = and <4 x i8> %n0, %mask
+  %r = xor <4 x i8> %n1, %y
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v2i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v2i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v2i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v2i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <2 x i16> %x, %y
+  %n1 = and <2 x i16> %n0, %mask
+  %r = xor <2 x i16> %n1, %y
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %n0 = xor <1 x i32> %x, %y
+  %n1 = and <1 x i32> %n0, %mask
+  %r = xor <1 x i32> %n1, %y
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v8i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl %ecx, %r10d
+; CHECK-BASELINE-NEXT:    movl %edx, %r11d
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %sil
+; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %sil
+; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v8i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl %ecx, %r10d
+; CHECK-SSE1-NEXT:    movl %edx, %r11d
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %bpl, %sil
+; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
+; CHECK-SSE1-NEXT:    xorb %r12b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r15b, %r8b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %bpl, %sil
+; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
+; CHECK-SSE1-NEXT:    xorb %r12b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r15b, %r8b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v8i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v8i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <8 x i8> %x, %y
+  %n1 = and <8 x i8> %n0, %mask
+  %r = xor <8 x i8> %n1, %y
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v4i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movw %si, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v4i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rdi)
+; CHECK-SSE1-NEXT:    movw %si, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v4i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v4i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <4 x i16> %x, %y
+  %n1 = and <4 x i16> %n0, %mask
+  %r = xor <4 x i16> %n1, %y
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v2i32:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v2i32:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r9d, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v2i32:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v2i32:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <2 x i32> %x, %y
+  %n1 = and <2 x i32> %n0, %mask
+  %r = xor <2 x i32> %n1, %y
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorq %rsi, %rdi
+; CHECK-NEXT:    andq %rdx, %rdi
+; CHECK-NEXT:    xorq %rsi, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
+  %n0 = xor <1 x i64> %x, %y
+  %n1 = and <1 x i64> %n0, %mask
+  %r = xor <1 x i64> %n1, %y
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v16i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %bl, %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %bl, %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bpl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    xorb %sil, %r14b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    xorb %sil, %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %sil, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %sil, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r14b, 13(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r15b, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r12b, 11(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bpl, 9(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %al, %r8b
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v16i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %bl, %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %bl, %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %bpl, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %bpl, %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bpl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    xorb %sil, %r14b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    xorb %sil, %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %sil, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %sil, %cl
+; CHECK-SSE1-NEXT:    movb %cl, 15(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
+; CHECK-SSE1-NEXT:    movb %r14b, 13(%rdi)
+; CHECK-SSE1-NEXT:    movb %r15b, 12(%rdi)
+; CHECK-SSE1-NEXT:    movb %r12b, 11(%rdi)
+; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
+; CHECK-SSE1-NEXT:    movb %bpl, 9(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 8(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %al, %r8b
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb %cl, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v16i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v16i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT:    retq
+  %n0 = xor <16 x i8> %x, %y
+  %n1 = and <16 x i8> %n0, %mask
+  %r = xor <16 x i8> %n1, %y
+  ret <16 x i8> %r
+}
+
; Masked-merge in "unfolded-unfriendly" form: r = ((x ^ y) & mask) ^ y,
; which is value-equivalent to (x & mask) | (y & ~mask).  The assertions
; below are autogenerated by update_llc_test_checks.py and record the
; current lowering per RUN configuration; do not edit them by hand.
; The vector configurations lower this to a three-instruction
; xorps/andps/xorps sequence, while the scalar baselines expand the
; merge lane-by-lane through GPRs.
define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v8i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    xorw %bx, %bp
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bp
; CHECK-BASELINE-NEXT:    xorl %ebx, %ebp
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorw %ax, %bx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
; CHECK-BASELINE-NEXT:    xorl %r14d, %eax
; CHECK-BASELINE-NEXT:    movw %ax, 14(%rdi)
; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
; CHECK-BASELINE-NEXT:    movw %bp, 10(%rdi)
; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rdi)
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rdi)
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rdi)
; CHECK-BASELINE-NEXT:    movw %si, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v8i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    xorl %r10d, %r9d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorl %eax, %ecx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %esi
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %ebx, %esi
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %edx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    xorl %ebx, %edx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    xorl %eax, %ecx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-SSE1-NEXT:    xorl %r10d, %r9d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    xorw %bx, %bp
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bp
; CHECK-SSE1-NEXT:    xorl %ebx, %ebp
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorw %ax, %bx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT:    xorl %eax, %ebx
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorw %r14w, %ax
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
; CHECK-SSE1-NEXT:    xorl %r14d, %eax
; CHECK-SSE1-NEXT:    movw %ax, 14(%rdi)
; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
; CHECK-SSE1-NEXT:    movw %bp, 10(%rdi)
; CHECK-SSE1-NEXT:    movw %r9w, 8(%rdi)
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rdi)
; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rdi)
; CHECK-SSE1-NEXT:    movw %si, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v8i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v8i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  ; r = ((x ^ y) & mask) ^ y -- the canonical xor/and/xor merge shape.
  %n0 = xor <8 x i16> %x, %y
  %n1 = and <8 x i16> %n0, %mask
  %r = xor <8 x i16> %n1, %y
  ret <8 x i16> %r
}
+
; Same masked-merge pattern, r = ((x ^ y) & mask) ^ y, but with all three
; operands loaded from memory through pointers rather than passed by value.
; Assertions are autogenerated by update_llc_test_checks.py; do not edit
; them by hand.  With memory operands even the SSE1 configuration can use
; the vector xorps/andps/xorps form, folding the mask load into andps,
; while the scalar baseline expands to four 32-bit GPR merges.
define <4 x i32> @in_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v4i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r9d
; CHECK-BASELINE-NEXT:    movl (%rdx), %r11d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %ebx
; CHECK-BASELINE-NEXT:    xorl %r9d, %ebx
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %ebx
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %eax
; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
; CHECK-BASELINE-NEXT:    xorl %r9d, %ebx
; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
; CHECK-BASELINE-NEXT:    movl %esi, 12(%rdi)
; CHECK-BASELINE-NEXT:    movl %ebx, 8(%rdi)
; CHECK-BASELINE-NEXT:    movl %eax, 4(%rdi)
; CHECK-BASELINE-NEXT:    movl %edx, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v4i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andps (%rcx), %xmm1
; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v4i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v4i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovaps (%rsi), %xmm0
; CHECK-XOP-NEXT:    vxorps (%rdi), %xmm0, %xmm1
; CHECK-XOP-NEXT:    vandps (%rdx), %xmm1, %xmm1
; CHECK-XOP-NEXT:    vxorps %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT:    retq
  ; Load the operands, then compute r = ((x ^ y) & mask) ^ y.
  %x = load <4 x i32>, <4 x i32> *%px, align 16
  %y = load <4 x i32>, <4 x i32> *%py, align 16
  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
  %n0 = xor <4 x i32> %x, %y
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}
+
; Masked merge r = ((x ^ y) & mask) ^ y for <2 x i64>.  Assertions are
; autogenerated by update_llc_test_checks.py; do not edit them by hand.
; Without vector support the two 64-bit lanes fit in GPR pairs, so the
; scalar configurations merge each lane with xorq/andq/xorq; the vector
; configurations use the single xorps/andps/xorps sequence.
define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v2i64:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    xorq %rdx, %rdi
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    andq %r9, %rsi
; CHECK-BASELINE-NEXT:    andq %r8, %rdi
; CHECK-BASELINE-NEXT:    xorq %rdx, %rdi
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v2i64:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    xorq %rdx, %rdi
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    andq %r9, %rsi
; CHECK-SSE1-NEXT:    andq %r8, %rdi
; CHECK-SSE1-NEXT:    xorq %rdx, %rdi
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movq %rsi, %rdx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v2i64:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v2i64:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
; CHECK-XOP-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  ; r = ((x ^ y) & mask) ^ y -- the canonical xor/and/xor merge shape.
  %n0 = xor <2 x i64> %x, %y
  %n1 = and <2 x i64> %n0, %mask
  %r = xor <2 x i64> %n1, %y
  ret <2 x i64> %r
}
+
+; ============================================================================ ;
+; 256-bit vector width
+; ============================================================================ ;
+
+define <32 x i8> @in_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: in_v32i8:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdx, %r13
+; CHECK-BASELINE-NEXT:    movq %rsi, %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-BASELINE-NEXT:    movb 16(%rdx), %r12b
+; CHECK-BASELINE-NEXT:    movb 15(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 14(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 13(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 12(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 11(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 10(%rdx), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 9(%rdx), %r10b
+; CHECK-BASELINE-NEXT:    movb 8(%rdx), %r11b
+; CHECK-BASELINE-NEXT:    movb 7(%rdx), %r9b
+; CHECK-BASELINE-NEXT:    movb 6(%rdx), %r8b
+; CHECK-BASELINE-NEXT:    movb 5(%rdx), %bpl
+; CHECK-BASELINE-NEXT:    movb 4(%rdx), %dil
+; CHECK-BASELINE-NEXT:    movb 3(%rdx), %sil
+; CHECK-BASELINE-NEXT:    movb 2(%rdx), %r14b
+; CHECK-BASELINE-NEXT:    movb (%rdx), %al
+; CHECK-BASELINE-NEXT:    movb 1(%rdx), %r15b
+; CHECK-BASELINE-NEXT:    movb (%rbx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb (%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 1(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
+; CHECK-BASELINE-NEXT:    andb 1(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 2(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    andb 2(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 3(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %sil, %al
+; CHECK-BASELINE-NEXT:    andb 3(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %sil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 4(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 4(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 5(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %bpl, %al
+; CHECK-BASELINE-NEXT:    andb 5(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %bpl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 6(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r8b, %al
+; CHECK-BASELINE-NEXT:    andb 6(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r8b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 7(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r9b, %al
+; CHECK-BASELINE-NEXT:    andb 7(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r9b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 8(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    andb 8(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 9(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
+; CHECK-BASELINE-NEXT:    andb 9(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 10(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 10(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 11(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 11(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 12(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 12(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 13(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 13(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 14(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 14(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 15(%rbx), %dl
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 15(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 16(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
+; CHECK-BASELINE-NEXT:    andb 16(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 17(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 17(%rbx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 17(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 18(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 18(%rbx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 18(%rcx), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 19(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 19(%rbx), %r12b
+; CHECK-BASELINE-NEXT:    xorb %al, %r12b
+; CHECK-BASELINE-NEXT:    andb 19(%rcx), %r12b
+; CHECK-BASELINE-NEXT:    movq %rcx, %rdx
+; CHECK-BASELINE-NEXT:    xorb %al, %r12b
+; CHECK-BASELINE-NEXT:    movb 20(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 20(%rbx), %r14b
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    andb 20(%rcx), %r14b
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    movb 21(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 21(%rbx), %r15b
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    andb 21(%rcx), %r15b
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    movb 22(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 22(%rbx), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    andb 22(%rcx), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    movb 23(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 23(%rbx), %r11b
+; CHECK-BASELINE-NEXT:    xorb %al, %r11b
+; CHECK-BASELINE-NEXT:    andb 23(%rcx), %r11b
+; CHECK-BASELINE-NEXT:    xorb %al, %r11b
+; CHECK-BASELINE-NEXT:    movb 24(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 24(%rbx), %r10b
+; CHECK-BASELINE-NEXT:    xorb %al, %r10b
+; CHECK-BASELINE-NEXT:    andb 24(%rcx), %r10b
+; CHECK-BASELINE-NEXT:    xorb %al, %r10b
+; CHECK-BASELINE-NEXT:    movb 25(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 25(%rbx), %r9b
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    andb 25(%rcx), %r9b
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    movb 26(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 26(%rbx), %r8b
+; CHECK-BASELINE-NEXT:    xorb %al, %r8b
+; CHECK-BASELINE-NEXT:    andb 26(%rcx), %r8b
+; CHECK-BASELINE-NEXT:    xorb %al, %r8b
+; CHECK-BASELINE-NEXT:    movb 27(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 27(%rbx), %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    andb 27(%rcx), %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    movb 28(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 28(%rbx), %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    andb 28(%rcx), %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    movb 29(%r13), %al
+; CHECK-BASELINE-NEXT:    movb 29(%rbx), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 29(%rdx), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 30(%r13), %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 30(%rbx), %al
+; CHECK-BASELINE-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    andb 30(%rdx), %al
+; CHECK-BASELINE-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movb 31(%r13), %r13b
+; CHECK-BASELINE-NEXT:    movb 31(%rbx), %bl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bl
+; CHECK-BASELINE-NEXT:    andb 31(%rdx), %bl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bl
+; CHECK-BASELINE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; CHECK-BASELINE-NEXT:    movb %bl, 31(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 30(%r13)
+; CHECK-BASELINE-NEXT:    movb %cl, 29(%r13)
+; CHECK-BASELINE-NEXT:    movb %sil, 28(%r13)
+; CHECK-BASELINE-NEXT:    movb %dil, 27(%r13)
+; CHECK-BASELINE-NEXT:    movb %r8b, 26(%r13)
+; CHECK-BASELINE-NEXT:    movb %r9b, 25(%r13)
+; CHECK-BASELINE-NEXT:    movb %r10b, 24(%r13)
+; CHECK-BASELINE-NEXT:    movb %r11b, 23(%r13)
+; CHECK-BASELINE-NEXT:    movb %bpl, 22(%r13)
+; CHECK-BASELINE-NEXT:    movb %r15b, 21(%r13)
+; CHECK-BASELINE-NEXT:    movb %r14b, 20(%r13)
+; CHECK-BASELINE-NEXT:    movb %r12b, 19(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 18(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 17(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 16(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 15(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 14(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 13(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 12(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 11(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 10(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 9(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 8(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 7(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 6(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 5(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 4(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 3(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 2(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 1(%r13)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, (%r13)
+; CHECK-BASELINE-NEXT:    movq %r13, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v32i8:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdx, %r13
+; CHECK-SSE1-NEXT:    movq %rsi, %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE1-NEXT:    movb 16(%rdx), %r12b
+; CHECK-SSE1-NEXT:    movb 15(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 14(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 13(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 12(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 11(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 10(%rdx), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 9(%rdx), %r10b
+; CHECK-SSE1-NEXT:    movb 8(%rdx), %r11b
+; CHECK-SSE1-NEXT:    movb 7(%rdx), %r9b
+; CHECK-SSE1-NEXT:    movb 6(%rdx), %r8b
+; CHECK-SSE1-NEXT:    movb 5(%rdx), %bpl
+; CHECK-SSE1-NEXT:    movb 4(%rdx), %dil
+; CHECK-SSE1-NEXT:    movb 3(%rdx), %sil
+; CHECK-SSE1-NEXT:    movb 2(%rdx), %r14b
+; CHECK-SSE1-NEXT:    movb (%rdx), %al
+; CHECK-SSE1-NEXT:    movb 1(%rdx), %r15b
+; CHECK-SSE1-NEXT:    movb (%rbx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb (%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 1(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
+; CHECK-SSE1-NEXT:    andb 1(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 2(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    andb 2(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 3(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %sil, %al
+; CHECK-SSE1-NEXT:    andb 3(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %sil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 4(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 4(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 5(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %bpl, %al
+; CHECK-SSE1-NEXT:    andb 5(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %bpl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 6(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r8b, %al
+; CHECK-SSE1-NEXT:    andb 6(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r8b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 7(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r9b, %al
+; CHECK-SSE1-NEXT:    andb 7(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r9b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 8(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    andb 8(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 9(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
+; CHECK-SSE1-NEXT:    andb 9(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 10(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 10(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 11(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 11(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 12(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 12(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 13(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 13(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 14(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 14(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 15(%rbx), %dl
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 15(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 16(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
+; CHECK-SSE1-NEXT:    andb 16(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 17(%r13), %al
+; CHECK-SSE1-NEXT:    movb 17(%rbx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 17(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 18(%r13), %al
+; CHECK-SSE1-NEXT:    movb 18(%rbx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 18(%rcx), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 19(%r13), %al
+; CHECK-SSE1-NEXT:    movb 19(%rbx), %r12b
+; CHECK-SSE1-NEXT:    xorb %al, %r12b
+; CHECK-SSE1-NEXT:    andb 19(%rcx), %r12b
+; CHECK-SSE1-NEXT:    movq %rcx, %rdx
+; CHECK-SSE1-NEXT:    xorb %al, %r12b
+; CHECK-SSE1-NEXT:    movb 20(%r13), %al
+; CHECK-SSE1-NEXT:    movb 20(%rbx), %r14b
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    andb 20(%rcx), %r14b
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    movb 21(%r13), %al
+; CHECK-SSE1-NEXT:    movb 21(%rbx), %r15b
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    andb 21(%rcx), %r15b
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    movb 22(%r13), %al
+; CHECK-SSE1-NEXT:    movb 22(%rbx), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    andb 22(%rcx), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    movb 23(%r13), %al
+; CHECK-SSE1-NEXT:    movb 23(%rbx), %r11b
+; CHECK-SSE1-NEXT:    xorb %al, %r11b
+; CHECK-SSE1-NEXT:    andb 23(%rcx), %r11b
+; CHECK-SSE1-NEXT:    xorb %al, %r11b
+; CHECK-SSE1-NEXT:    movb 24(%r13), %al
+; CHECK-SSE1-NEXT:    movb 24(%rbx), %r10b
+; CHECK-SSE1-NEXT:    xorb %al, %r10b
+; CHECK-SSE1-NEXT:    andb 24(%rcx), %r10b
+; CHECK-SSE1-NEXT:    xorb %al, %r10b
+; CHECK-SSE1-NEXT:    movb 25(%r13), %al
+; CHECK-SSE1-NEXT:    movb 25(%rbx), %r9b
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    andb 25(%rcx), %r9b
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    movb 26(%r13), %al
+; CHECK-SSE1-NEXT:    movb 26(%rbx), %r8b
+; CHECK-SSE1-NEXT:    xorb %al, %r8b
+; CHECK-SSE1-NEXT:    andb 26(%rcx), %r8b
+; CHECK-SSE1-NEXT:    xorb %al, %r8b
+; CHECK-SSE1-NEXT:    movb 27(%r13), %al
+; CHECK-SSE1-NEXT:    movb 27(%rbx), %dil
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    andb 27(%rcx), %dil
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    movb 28(%r13), %al
+; CHECK-SSE1-NEXT:    movb 28(%rbx), %sil
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    andb 28(%rcx), %sil
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    movb 29(%r13), %al
+; CHECK-SSE1-NEXT:    movb 29(%rbx), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 29(%rdx), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb 30(%r13), %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 30(%rbx), %al
+; CHECK-SSE1-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    andb 30(%rdx), %al
+; CHECK-SSE1-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    movb 31(%r13), %r13b
+; CHECK-SSE1-NEXT:    movb 31(%rbx), %bl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bl
+; CHECK-SSE1-NEXT:    andb 31(%rdx), %bl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bl
+; CHECK-SSE1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; CHECK-SSE1-NEXT:    movb %bl, 31(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 30(%r13)
+; CHECK-SSE1-NEXT:    movb %cl, 29(%r13)
+; CHECK-SSE1-NEXT:    movb %sil, 28(%r13)
+; CHECK-SSE1-NEXT:    movb %dil, 27(%r13)
+; CHECK-SSE1-NEXT:    movb %r8b, 26(%r13)
+; CHECK-SSE1-NEXT:    movb %r9b, 25(%r13)
+; CHECK-SSE1-NEXT:    movb %r10b, 24(%r13)
+; CHECK-SSE1-NEXT:    movb %r11b, 23(%r13)
+; CHECK-SSE1-NEXT:    movb %bpl, 22(%r13)
+; CHECK-SSE1-NEXT:    movb %r15b, 21(%r13)
+; CHECK-SSE1-NEXT:    movb %r14b, 20(%r13)
+; CHECK-SSE1-NEXT:    movb %r12b, 19(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 18(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 17(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 16(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 15(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 14(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 13(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 12(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 11(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 10(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 9(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 8(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 7(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 6(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 5(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 4(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 3(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 2(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 1(%r13)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, (%r13)
+; CHECK-SSE1-NEXT:    movq %r13, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v32i8:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v32i8:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
+; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
+; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8> *%px, align 32
+  %y = load <32 x i8>, <32 x i8> *%py, align 32
+  %mask = load <32 x i8>, <32 x i8> *%pmask, align 32
+  %n0 = xor <32 x i8> %x, %y
+  %n1 = and <32 x i8> %n0, %mask
+  %r = xor <32 x i8> %n1, %y
+  ret <32 x i8> %r
+}
+
+define <16 x i16> @in_v16i16(<16 x i16> *%px, <16 x i16> *%py, <16 x i16> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: in_v16i16:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rcx, %r8
+; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 28(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl (%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %ax, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %cx, %ax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bp, %ax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %bx, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, %eax
+; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %ecx
+; CHECK-BASELINE-NEXT:    xorw %r9w, %cx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %r10w, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, %ecx
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %r12w, %dx
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorw %r15w, %r12w
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorw %r14w, %r15w
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r14d
+; CHECK-BASELINE-NEXT:    xorw %r11w, %r14w
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorw %r13w, %bp
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r9d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %r13d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r13w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    andw 30(%r8), %r13w
+; CHECK-BASELINE-NEXT:    andw 28(%r8), %r9w
+; CHECK-BASELINE-NEXT:    andw 26(%r8), %r10w
+; CHECK-BASELINE-NEXT:    andw 24(%r8), %r11w
+; CHECK-BASELINE-NEXT:    andw 22(%r8), %bx
+; CHECK-BASELINE-NEXT:    andw 20(%r8), %bp
+; CHECK-BASELINE-NEXT:    andw 18(%r8), %r14w
+; CHECK-BASELINE-NEXT:    andw 16(%r8), %r15w
+; CHECK-BASELINE-NEXT:    andw 14(%r8), %r12w
+; CHECK-BASELINE-NEXT:    andw 12(%r8), %dx
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    andw 10(%r8), %cx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 8(%r8), %dx
+; CHECK-BASELINE-NEXT:    andw 6(%r8), %ax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 4(%r8), %cx
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 2(%r8), %ax
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw (%r8), %si
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %edx, %r8d
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movw %r13w, 30(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r9w, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 26(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r11w, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bx, 22(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bp, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r14w, 18(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r15w, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r12w, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movw %dx, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r8w, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movw %cx, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %si, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v16i16:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rcx, %r8
+; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 28(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl (%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %ax, %dx
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %cx, %ax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bp, %ax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %bx, %dx
+; CHECK-SSE1-NEXT:    movl %edx, %eax
+; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %ecx
+; CHECK-SSE1-NEXT:    xorw %r9w, %cx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %r10w, %dx
+; CHECK-SSE1-NEXT:    movl %edx, %ecx
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %r12w, %dx
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorw %r15w, %r12w
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorw %r14w, %r15w
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r14d
+; CHECK-SSE1-NEXT:    xorw %r11w, %r14w
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorw %r13w, %bp
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r9d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %r13d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r13w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    andw 30(%r8), %r13w
+; CHECK-SSE1-NEXT:    andw 28(%r8), %r9w
+; CHECK-SSE1-NEXT:    andw 26(%r8), %r10w
+; CHECK-SSE1-NEXT:    andw 24(%r8), %r11w
+; CHECK-SSE1-NEXT:    andw 22(%r8), %bx
+; CHECK-SSE1-NEXT:    andw 20(%r8), %bp
+; CHECK-SSE1-NEXT:    andw 18(%r8), %r14w
+; CHECK-SSE1-NEXT:    andw 16(%r8), %r15w
+; CHECK-SSE1-NEXT:    andw 14(%r8), %r12w
+; CHECK-SSE1-NEXT:    andw 12(%r8), %dx
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    andw 10(%r8), %cx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 8(%r8), %dx
+; CHECK-SSE1-NEXT:    andw 6(%r8), %ax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 4(%r8), %cx
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 2(%r8), %ax
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw (%r8), %si
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %edx, %r8d
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movw %r13w, 30(%rdi)
+; CHECK-SSE1-NEXT:    movw %r9w, 28(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 26(%rdi)
+; CHECK-SSE1-NEXT:    movw %r11w, 24(%rdi)
+; CHECK-SSE1-NEXT:    movw %bx, 22(%rdi)
+; CHECK-SSE1-NEXT:    movw %bp, 20(%rdi)
+; CHECK-SSE1-NEXT:    movw %r14w, 18(%rdi)
+; CHECK-SSE1-NEXT:    movw %r15w, 16(%rdi)
+; CHECK-SSE1-NEXT:    movw %r12w, 14(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 12(%rdi)
+; CHECK-SSE1-NEXT:    movw %dx, 10(%rdi)
+; CHECK-SSE1-NEXT:    movw %r8w, 8(%rdi)
+; CHECK-SSE1-NEXT:    movw %cx, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %si, 4(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, 2(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v16i16:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v16i16:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
+; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
+; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <16 x i16>, <16 x i16> *%px, align 32
+  %y = load <16 x i16>, <16 x i16> *%py, align 32
+  %mask = load <16 x i16>, <16 x i16> *%pmask, align 32
+  %n0 = xor <16 x i16> %x, %y
+  %n1 = and <16 x i16> %n0, %mask
+  %r = xor <16 x i16> %n1, %y
+  ret <16 x i16> %r
+}
+
+define <8 x i32> @in_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: in_v8i32:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
+; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r13
+; CHECK-BASELINE-NEXT:    pushq %r12
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movl 28(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movl 24(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl (%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movl (%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r11d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
+; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r8d
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorl %ebp, %ebx
+; CHECK-BASELINE-NEXT:    movl 16(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
+; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
+; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorl %r14d, %eax
+; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorl %r15d, %esi
+; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
+; CHECK-BASELINE-NEXT:    andl 24(%rcx), %eax
+; CHECK-BASELINE-NEXT:    andl 20(%rcx), %edx
+; CHECK-BASELINE-NEXT:    andl 16(%rcx), %ebp
+; CHECK-BASELINE-NEXT:    andl 12(%rcx), %ebx
+; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r8d
+; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
+; CHECK-BASELINE-NEXT:    andl (%rcx), %r11d
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r11d
+; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl %r14d, %eax
+; CHECK-BASELINE-NEXT:    xorl %r15d, %esi
+; CHECK-BASELINE-NEXT:    movl %esi, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movl %eax, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movl %edx, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movl %ebp, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movl %ebx, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r8d, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl %r11d, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
+; CHECK-BASELINE-NEXT:    popq %r13
+; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
+; CHECK-BASELINE-NEXT:    popq %rbp
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v8i32:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
+; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r13
+; CHECK-SSE1-NEXT:    pushq %r12
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movl 28(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movl 24(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl (%rdx), %r12d
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movl (%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorl %r12d, %r11d
+; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
+; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
+; CHECK-SSE1-NEXT:    movl 8(%rsi), %r8d
+; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
+; CHECK-SSE1-NEXT:    movl 12(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorl %ebp, %ebx
+; CHECK-SSE1-NEXT:    movl 16(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorl %eax, %ebp
+; CHECK-SSE1-NEXT:    movl 20(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorl %r10d, %edx
+; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorl %r14d, %eax
+; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorl %r15d, %esi
+; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
+; CHECK-SSE1-NEXT:    andl 24(%rcx), %eax
+; CHECK-SSE1-NEXT:    andl 20(%rcx), %edx
+; CHECK-SSE1-NEXT:    andl 16(%rcx), %ebp
+; CHECK-SSE1-NEXT:    andl 12(%rcx), %ebx
+; CHECK-SSE1-NEXT:    andl 8(%rcx), %r8d
+; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
+; CHECK-SSE1-NEXT:    andl (%rcx), %r11d
+; CHECK-SSE1-NEXT:    xorl %r12d, %r11d
+; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl %r14d, %eax
+; CHECK-SSE1-NEXT:    xorl %r15d, %esi
+; CHECK-SSE1-NEXT:    movl %esi, 28(%rdi)
+; CHECK-SSE1-NEXT:    movl %eax, 24(%rdi)
+; CHECK-SSE1-NEXT:    movl %edx, 20(%rdi)
+; CHECK-SSE1-NEXT:    movl %ebp, 16(%rdi)
+; CHECK-SSE1-NEXT:    movl %ebx, 12(%rdi)
+; CHECK-SSE1-NEXT:    movl %r8d, 8(%rdi)
+; CHECK-SSE1-NEXT:    movl %r9d, 4(%rdi)
+; CHECK-SSE1-NEXT:    movl %r11d, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
+; CHECK-SSE1-NEXT:    popq %r13
+; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
+; CHECK-SSE1-NEXT:    popq %rbp
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v8i32:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v8i32:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
+; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
+; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    retq
+  %x = load <8 x i32>, <8 x i32> *%px, align 32
+  %y = load <8 x i32>, <8 x i32> *%py, align 32
+  %mask = load <8 x i32>, <8 x i32> *%pmask, align 32
+  %n0 = xor <8 x i32> %x, %y
+  %n1 = and <8 x i32> %n0, %mask
+  %r = xor <8 x i32> %n1, %y
+  ret <8 x i32> %r
+}
+
+; in_v4i64: "in"-form vector masked merge, r = ((x ^ y) & mask) ^ y,
+; i.e. a bitwise select of x/y under mask, with all operands loaded from memory.
+; NOTE(review): the CHECK lines below are autogenerated by
+; utils/update_llc_test_checks.py (see header NOTE) — regenerate rather than
+; hand-edit them if codegen changes.
+define <4 x i64> @in_v4i64(<4 x i64> *%px, <4 x i64> *%py, <4 x i64> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: in_v4i64:
+; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq 24(%rdx), %r8
+; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r9
+; CHECK-BASELINE-NEXT:    movq (%rdx), %r11
+; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
+; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
+; CHECK-BASELINE-NEXT:    xorq %r11, %rdx
+; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rax
+; CHECK-BASELINE-NEXT:    xorq %r10, %rax
+; CHECK-BASELINE-NEXT:    movq 16(%rsi), %rbx
+; CHECK-BASELINE-NEXT:    xorq %r9, %rbx
+; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
+; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
+; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
+; CHECK-BASELINE-NEXT:    andq 16(%rcx), %rbx
+; CHECK-BASELINE-NEXT:    andq 8(%rcx), %rax
+; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
+; CHECK-BASELINE-NEXT:    xorq %r11, %rdx
+; CHECK-BASELINE-NEXT:    xorq %r10, %rax
+; CHECK-BASELINE-NEXT:    xorq %r9, %rbx
+; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
+; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rbx, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rax, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdx, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    retq
+;
+; CHECK-SSE1-LABEL: in_v4i64:
+; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq 24(%rdx), %r8
+; CHECK-SSE1-NEXT:    movq 16(%rdx), %r9
+; CHECK-SSE1-NEXT:    movq (%rdx), %r11
+; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
+; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
+; CHECK-SSE1-NEXT:    xorq %r11, %rdx
+; CHECK-SSE1-NEXT:    movq 8(%rsi), %rax
+; CHECK-SSE1-NEXT:    xorq %r10, %rax
+; CHECK-SSE1-NEXT:    movq 16(%rsi), %rbx
+; CHECK-SSE1-NEXT:    xorq %r9, %rbx
+; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
+; CHECK-SSE1-NEXT:    xorq %r8, %rsi
+; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
+; CHECK-SSE1-NEXT:    andq 16(%rcx), %rbx
+; CHECK-SSE1-NEXT:    andq 8(%rcx), %rax
+; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
+; CHECK-SSE1-NEXT:    xorq %r11, %rdx
+; CHECK-SSE1-NEXT:    xorq %r10, %rax
+; CHECK-SSE1-NEXT:    xorq %r9, %rbx
+; CHECK-SSE1-NEXT:    xorq %r8, %rsi
+; CHECK-SSE1-NEXT:    movq %rsi, 24(%rdi)
+; CHECK-SSE1-NEXT:    movq %rbx, 16(%rdi)
+; CHECK-SSE1-NEXT:    movq %rax, 8(%rdi)
+; CHECK-SSE1-NEXT:    movq %rdx, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    retq
+;
+; CHECK-SSE2-LABEL: in_v4i64:
+; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movaps (%rsi), %xmm2
+; CHECK-SSE2-NEXT:    movaps 16(%rsi), %xmm3
+; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    andps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT:    andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-XOP-LABEL: in_v4i64:
+; CHECK-XOP:       # %bb.0:
+; CHECK-XOP-NEXT:    vmovaps (%rsi), %ymm0
+; CHECK-XOP-NEXT:    vxorps (%rdi), %ymm0, %ymm1
+; CHECK-XOP-NEXT:    vandps (%rdx), %ymm1, %ymm1
+; CHECK-XOP-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT:    retq
+; The IR below spells the masked merge as xor/and/xor; the test documents the
+; codegen this pattern currently gets (no andnps/bsl-style select is formed).
+  %x = load <4 x i64>, <4 x i64> *%px, align 32
+  %y = load <4 x i64>, <4 x i64> *%py, align 32
+  %mask = load <4 x i64>, <4 x i64> *%pmask, align 32
+  %n0 = xor <4 x i64> %x, %y
+  %n1 = and <4 x i64> %n0, %mask
+  %r = xor <4 x i64> %n1, %y
+  ret <4 x i64> %r
+}




More information about the llvm-commits mailing list