[llvm-branch-commits] [llvm] dc80e80 - X86: Fold masked-merge when and-not is not available

Matthias Braun via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Nov 29 16:02:16 PST 2021


Author: Matthias Braun
Date: 2021-11-29T16:02:06-08:00
New Revision: dc80e809825a23ea46c245eb5e8b1ca897b6101b

URL: https://github.com/llvm/llvm-project/commit/dc80e809825a23ea46c245eb5e8b1ca897b6101b
DIFF: https://github.com/llvm/llvm-project/commit/dc80e809825a23ea46c245eb5e8b1ca897b6101b.diff

LOG: X86: Fold masked-merge when and-not is not available

Differential Revision: https://reviews.llvm.org/D112754

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/fold-masked-merge.ll
    llvm/test/CodeGen/X86/or-lea.ll
    llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
    llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 17d14053d804e..c42584aa763f9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46577,6 +46577,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return Ret;
 }
 
+static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
+                                   SDValue And1_L, SDValue And1_R, SDLoc DL,
+                                   SelectionDAG &DAG) {
+  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
+    return SDValue(); // And0_L must be a single-use bitwise-not.
+  SDValue NotOp = And0_L->getOperand(0);
+  if (NotOp == And1_R)
+    std::swap(And1_R, And1_L); // Canonicalize: keep NotOp in And1_L.
+  if (NotOp != And1_L)
+    return SDValue(); // The second AND does not use the un-negated mask.
+
+  // (~(NotOp) & And0_R) | (NotOp & And1_R)
+  // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R, using a frozen And0_R twice.
+  EVT VT = And1_L->getValueType(0);
+  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y` pattern; returns an empty SDValue if
+/// \p Node does not match. This is typically a better representation for
+/// targets without a fused "and-not" operation. Intended to be called from a
+/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue(); // Operand 0 is not a single-use AND.
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue(); // Operand 1 is not a single-use AND.
+
+  SDLoc DL(Node);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue(); // None of the four operand arrangements matched.
+}
+
 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
@@ -46670,6 +46723,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
       return Res;
   }
 
+  // We should fold "masked merge" patterns when `andn` is not available.
+  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG))
+      return R;
+
   return SDValue();
 }
 

diff  --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index e26ec40afb3c6..1203874205492 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -8,11 +8,10 @@
 define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
 ; NOBMI-LABEL: masked_merge0:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %edi, %eax
-; NOBMI-NEXT:    andl %edi, %esi
-; NOBMI-NEXT:    notl %eax
-; NOBMI-NEXT:    andl %edx, %eax
-; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge0:
@@ -54,14 +53,22 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 }
 
 define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; CHECK-LABEL: masked_merge2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    notb %al
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    andb %dil, %sil
-; CHECK-NEXT:    orb %sil, %al
-; CHECK-NEXT:    retq
+; NOBMI-LABEL: masked_merge2:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorb %sil, %al
+; NOBMI-NEXT:    andb %dil, %al
+; NOBMI-NEXT:    xorb %sil, %al
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge2:
+; BMI:       # %bb.0:
+; BMI-NEXT:    movl %edi, %eax
+; BMI-NEXT:    notb %al
+; BMI-NEXT:    andb %sil, %al
+; BMI-NEXT:    andb %dil, %sil
+; BMI-NEXT:    orb %sil, %al
+; BMI-NEXT:    retq
   %not = xor i8 %a0, -1
   %and0 = and i8 %not, %a1
   %and1 = and i8 %a1, %a0
@@ -72,13 +79,12 @@ define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
 define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
 ; NOBMI-LABEL: masked_merge3:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movq %rdi, %rax
-; NOBMI-NEXT:    notq %rsi
+; NOBMI-NEXT:    movq %rsi, %rax
 ; NOBMI-NEXT:    notq %rdx
-; NOBMI-NEXT:    andq %rdi, %rsi
+; NOBMI-NEXT:    xorq %rdx, %rax
 ; NOBMI-NEXT:    notq %rax
-; NOBMI-NEXT:    andq %rdx, %rax
-; NOBMI-NEXT:    orq %rsi, %rax
+; NOBMI-NEXT:    andq %rdi, %rax
+; NOBMI-NEXT:    xorq %rdx, %rax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge3:

diff  --git a/llvm/test/CodeGen/X86/or-lea.ll b/llvm/test/CodeGen/X86/or-lea.ll
index 67f313047c531..f497fc6bd51ac 100644
--- a/llvm/test/CodeGen/X86/or-lea.ll
+++ b/llvm/test/CodeGen/X86/or-lea.ll
@@ -137,12 +137,11 @@ define i64 @or_shift1_and1_64(i64 %x, i64 %y) {
 define i32 @or_and_and_rhs_neg_i32(i32 %x, i32 %y, i32 %z) {
 ; NOBMI-LABEL: or_and_and_rhs_neg_i32:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    # kill: def $esi killed $esi def $rsi
+; NOBMI-NEXT:    # kill: def $edx killed $edx def $rdx
+; NOBMI-NEXT:    xorl %edi, %edx
 ; NOBMI-NEXT:    andl %esi, %edx
-; NOBMI-NEXT:    notl %esi
-; NOBMI-NEXT:    andl %edi, %esi
-; NOBMI-NEXT:    orl %edx, %esi
-; NOBMI-NEXT:    leal 1(%rsi), %eax
+; NOBMI-NEXT:    xorl %edi, %edx
+; NOBMI-NEXT:    leal 1(%rdx), %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_and_rhs_neg_i32:
@@ -164,12 +163,11 @@ entry:
 define i32 @or_and_and_lhs_neg_i32(i32 %x, i32 %y, i32 %z) {
 ; NOBMI-LABEL: or_and_and_lhs_neg_i32:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    # kill: def $esi killed $esi def $rsi
+; NOBMI-NEXT:    # kill: def $edx killed $edx def $rdx
+; NOBMI-NEXT:    xorl %edi, %edx
 ; NOBMI-NEXT:    andl %esi, %edx
-; NOBMI-NEXT:    notl %esi
-; NOBMI-NEXT:    andl %edi, %esi
-; NOBMI-NEXT:    orl %edx, %esi
-; NOBMI-NEXT:    leal 1(%rsi), %eax
+; NOBMI-NEXT:    xorl %edi, %edx
+; NOBMI-NEXT:    leal 1(%rdx), %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_and_lhs_neg_i32:
@@ -191,12 +189,11 @@ entry:
 define i32 @or_and_rhs_neg_and_i32(i32 %x, i32 %y, i32 %z) {
 ; NOBMI-LABEL: or_and_rhs_neg_and_i32:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    # kill: def $esi killed $esi def $rsi
+; NOBMI-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOBMI-NEXT:    xorl %edx, %edi
 ; NOBMI-NEXT:    andl %esi, %edi
-; NOBMI-NEXT:    notl %esi
-; NOBMI-NEXT:    andl %edx, %esi
-; NOBMI-NEXT:    orl %edi, %esi
-; NOBMI-NEXT:    leal 1(%rsi), %eax
+; NOBMI-NEXT:    xorl %edx, %edi
+; NOBMI-NEXT:    leal 1(%rdi), %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_rhs_neg_and_i32:
@@ -218,12 +215,11 @@ entry:
 define i32 @or_and_lhs_neg_and_i32(i32 %x, i32 %y, i32 %z) {
 ; NOBMI-LABEL: or_and_lhs_neg_and_i32:
 ; NOBMI:       # %bb.0: # %entry
-; NOBMI-NEXT:    # kill: def $esi killed $esi def $rsi
+; NOBMI-NEXT:    # kill: def $edi killed $edi def $rdi
+; NOBMI-NEXT:    xorl %edx, %edi
 ; NOBMI-NEXT:    andl %esi, %edi
-; NOBMI-NEXT:    notl %esi
-; NOBMI-NEXT:    andl %edx, %esi
-; NOBMI-NEXT:    orl %edi, %esi
-; NOBMI-NEXT:    leal 1(%rsi), %eax
+; NOBMI-NEXT:    xorl %edx, %edi
+; NOBMI-NEXT:    leal 1(%rdi), %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_lhs_neg_and_i32:
@@ -245,11 +241,10 @@ entry:
 define i64 @or_and_and_rhs_neg_i64(i64 %x, i64 %y, i64 %z) {
 ; NOBMI-LABEL: or_and_and_rhs_neg_i64:
 ; NOBMI:       # %bb.0: # %entry
+; NOBMI-NEXT:    xorq %rdi, %rdx
 ; NOBMI-NEXT:    andq %rsi, %rdx
-; NOBMI-NEXT:    notq %rsi
-; NOBMI-NEXT:    andq %rdi, %rsi
-; NOBMI-NEXT:    orq %rdx, %rsi
-; NOBMI-NEXT:    leaq 1(%rsi), %rax
+; NOBMI-NEXT:    xorq %rdi, %rdx
+; NOBMI-NEXT:    leaq 1(%rdx), %rax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_and_rhs_neg_i64:
@@ -271,11 +266,10 @@ entry:
 define i64 @or_and_and_lhs_neg_i64(i64 %x, i64 %y, i64 %z) {
 ; NOBMI-LABEL: or_and_and_lhs_neg_i64:
 ; NOBMI:       # %bb.0: # %entry
+; NOBMI-NEXT:    xorq %rdi, %rdx
 ; NOBMI-NEXT:    andq %rsi, %rdx
-; NOBMI-NEXT:    notq %rsi
-; NOBMI-NEXT:    andq %rdi, %rsi
-; NOBMI-NEXT:    orq %rdx, %rsi
-; NOBMI-NEXT:    leaq 1(%rsi), %rax
+; NOBMI-NEXT:    xorq %rdi, %rdx
+; NOBMI-NEXT:    leaq 1(%rdx), %rax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_and_lhs_neg_i64:
@@ -297,11 +291,10 @@ entry:
 define i64 @or_and_rhs_neg_and_i64(i64 %x, i64 %y, i64 %z) {
 ; NOBMI-LABEL: or_and_rhs_neg_and_i64:
 ; NOBMI:       # %bb.0: # %entry
+; NOBMI-NEXT:    xorq %rdx, %rdi
 ; NOBMI-NEXT:    andq %rsi, %rdi
-; NOBMI-NEXT:    notq %rsi
-; NOBMI-NEXT:    andq %rdx, %rsi
-; NOBMI-NEXT:    orq %rdi, %rsi
-; NOBMI-NEXT:    leaq 1(%rsi), %rax
+; NOBMI-NEXT:    xorq %rdx, %rdi
+; NOBMI-NEXT:    leaq 1(%rdi), %rax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_rhs_neg_and_i64:
@@ -323,11 +316,10 @@ entry:
 define i64 @or_and_lhs_neg_and_i64(i64 %x, i64 %y, i64 %z) {
 ; NOBMI-LABEL: or_and_lhs_neg_and_i64:
 ; NOBMI:       # %bb.0: # %entry
+; NOBMI-NEXT:    xorq %rdx, %rdi
 ; NOBMI-NEXT:    andq %rsi, %rdi
-; NOBMI-NEXT:    notq %rsi
-; NOBMI-NEXT:    andq %rdx, %rsi
-; NOBMI-NEXT:    orq %rdi, %rsi
-; NOBMI-NEXT:    leaq 1(%rsi), %rax
+; NOBMI-NEXT:    xorq %rdx, %rdi
+; NOBMI-NEXT:    leaq 1(%rdi), %rax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: or_and_lhs_neg_and_i64:

diff  --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 81044c805902d..7d3047f5b8762 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -58,11 +58,10 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out32:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl %esi, %eax
-; CHECK-NOBMI-NEXT:    orl %edi, %eax
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out32:
@@ -81,11 +80,10 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; CHECK-NOBMI-LABEL: out64:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movq %rdx, %rax
-; CHECK-NOBMI-NEXT:    andq %rdx, %rdi
-; CHECK-NOBMI-NEXT:    notq %rax
-; CHECK-NOBMI-NEXT:    andq %rsi, %rax
-; CHECK-NOBMI-NEXT:    orq %rdi, %rax
+; CHECK-NOBMI-NEXT:    movq %rdi, %rax
+; CHECK-NOBMI-NEXT:    xorq %rsi, %rax
+; CHECK-NOBMI-NEXT:    andq %rdx, %rax
+; CHECK-NOBMI-NEXT:    xorq %rsi, %rax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out64:
@@ -628,11 +626,10 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_varx_42:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl $42, %eax
-; CHECK-NOBMI-NEXT:    orl %edi, %eax
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_varx_42:
@@ -673,11 +670,10 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl %edi, %eax
-; CHECK-NOBMI-NEXT:    andl $42, %edx
-; CHECK-NOBMI-NEXT:    orl %edx, %eax
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_varx_42_invmask:
@@ -801,11 +797,10 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_42_vary:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl $42, %edx
-; CHECK-NOBMI-NEXT:    andl %esi, %eax
-; CHECK-NOBMI-NEXT:    orl %edx, %eax
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_42_vary:
@@ -844,11 +839,10 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl $42, %eax
-; CHECK-NOBMI-NEXT:    orl %esi, %eax
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_42_vary_invmask:

diff  --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index e69901d3f602d..4985be5872853 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -107,64 +107,50 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
 define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    andb %bl, %r8b
-; CHECK-BASELINE-NEXT:    andb %r11b, %cl
-; CHECK-BASELINE-NEXT:    andb %r10b, %dl
-; CHECK-BASELINE-NEXT:    andb %dil, %sil
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %bl
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    andb %r9b, %dil
-; CHECK-BASELINE-NEXT:    orb %sil, %dil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    orb %r8b, %bl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb %cl, %r11b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    orb %dl, %r10b
-; CHECK-BASELINE-NEXT:    movb %bl, 3(%rax)
-; CHECK-BASELINE-NEXT:    movb %r11b, 2(%rax)
-; CHECK-BASELINE-NEXT:    movb %r10b, 1(%rax)
-; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
-; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
+; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    andb %bl, %r8b
-; CHECK-SSE1-NEXT:    andb %r11b, %cl
-; CHECK-SSE1-NEXT:    andb %r10b, %dl
-; CHECK-SSE1-NEXT:    andb %dil, %sil
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %bl
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    andb %r9b, %dil
-; CHECK-SSE1-NEXT:    orb %sil, %dil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    orb %r8b, %bl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb %cl, %r11b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    orb %dl, %r10b
-; CHECK-SSE1-NEXT:    movb %bl, 3(%rax)
-; CHECK-SSE1-NEXT:    movb %r11b, 2(%rax)
-; CHECK-SSE1-NEXT:    movb %r10b, 1(%rax)
-; CHECK-SSE1-NEXT:    movb %dil, (%rax)
-; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
+; CHECK-SSE1-NEXT:    xorb %r11b, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r11b, %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %r10b, %cl
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i8:
@@ -191,24 +177,20 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    andb %r11b, %r8b
-; CHECK-BASELINE-NEXT:    andb %r10b, %dl
-; CHECK-BASELINE-NEXT:    andb %dil, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    andb %r9b, %dil
-; CHECK-BASELINE-NEXT:    orb %sil, %dil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb %r8b, %r11b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    orb %dl, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
 ; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
-; CHECK-BASELINE-NEXT:    movb %r11b, 3(%rax)
-; CHECK-BASELINE-NEXT:    movb %r10b, 1(%rax)
-; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i8_undef:
@@ -216,24 +198,20 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    andb %r11b, %r8b
-; CHECK-SSE1-NEXT:    andb %r10b, %dl
-; CHECK-SSE1-NEXT:    andb %dil, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    andb %r9b, %dil
-; CHECK-SSE1-NEXT:    orb %sil, %dil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb %r8b, %r11b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    orb %dl, %r10b
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
 ; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
-; CHECK-SSE1-NEXT:    movb %r11b, 3(%rax)
-; CHECK-SSE1-NEXT:    movb %r10b, 1(%rax)
-; CHECK-SSE1-NEXT:    movb %dil, (%rax)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i8_undef:
@@ -306,11 +284,10 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -329,66 +306,54 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %rbp
 ; CHECK-BASELINE-NEXT:    pushq %r15
 ; CHECK-BASELINE-NEXT:    pushq %r14
-; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    andb %bl, %r9b
-; CHECK-BASELINE-NEXT:    andb %r15b, %r8b
-; CHECK-BASELINE-NEXT:    andb %bpl, %cl
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    andb %r11b, %dl
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    andb %dil, %sil
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    andb %r12b, %r13b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    andb %r14b, %cl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    andb %r10b, %dl
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %bpl
-; CHECK-BASELINE-NEXT:    notb %r15b
-; CHECK-BASELINE-NEXT:    notb %bl
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    notb %r14b
-; CHECK-BASELINE-NEXT:    notb %r12b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    orb %r13b, %r12b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    orb %cl, %r14b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    orb %dl, %r10b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    orb %r9b, %bl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-BASELINE-NEXT:    orb %r8b, %r15b
+; CHECK-BASELINE-NEXT:    xorb %bl, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %bl, %sil
+; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r15b, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %r15b, %cl
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %r11b, %bpl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %r11b, %bpl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-BASELINE-NEXT:    orb %sil, %dil
-; CHECK-BASELINE-NEXT:    movb %r12b, 7(%rax)
-; CHECK-BASELINE-NEXT:    movb %r14b, 6(%rax)
-; CHECK-BASELINE-NEXT:    movb %r10b, 5(%rax)
-; CHECK-BASELINE-NEXT:    movb %bl, 4(%rax)
-; CHECK-BASELINE-NEXT:    movb %r15b, 3(%rax)
-; CHECK-BASELINE-NEXT:    movb %bpl, 2(%rax)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rax)
-; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
+; CHECK-BASELINE-NEXT:    movb %bl, 7(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 6(%rax)
+; CHECK-BASELINE-NEXT:    movb %bpl, 5(%rax)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rax)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
-; CHECK-BASELINE-NEXT:    popq %r13
 ; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %r15
 ; CHECK-BASELINE-NEXT:    popq %rbp
@@ -399,66 +364,54 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %rbp
 ; CHECK-SSE1-NEXT:    pushq %r15
 ; CHECK-SSE1-NEXT:    pushq %r14
-; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    andb %bl, %r9b
-; CHECK-SSE1-NEXT:    andb %r15b, %r8b
-; CHECK-SSE1-NEXT:    andb %bpl, %cl
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    andb %r11b, %dl
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    andb %dil, %sil
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    andb %r12b, %r13b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    andb %r14b, %cl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    andb %r10b, %dl
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %bpl
-; CHECK-SSE1-NEXT:    notb %r15b
-; CHECK-SSE1-NEXT:    notb %bl
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    notb %r14b
-; CHECK-SSE1-NEXT:    notb %r12b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    orb %r13b, %r12b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    orb %cl, %r14b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    orb %dl, %r10b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    orb %r9b, %bl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-SSE1-NEXT:    orb %r8b, %r15b
+; CHECK-SSE1-NEXT:    xorb %bl, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %bl, %sil
+; CHECK-SSE1-NEXT:    xorb %r12b, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r12b, %dl
+; CHECK-SSE1-NEXT:    xorb %r15b, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %r15b, %cl
+; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %r11b, %bpl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %r11b, %bpl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-SSE1-NEXT:    orb %sil, %dil
-; CHECK-SSE1-NEXT:    movb %r12b, 7(%rax)
-; CHECK-SSE1-NEXT:    movb %r14b, 6(%rax)
-; CHECK-SSE1-NEXT:    movb %r10b, 5(%rax)
-; CHECK-SSE1-NEXT:    movb %bl, 4(%rax)
-; CHECK-SSE1-NEXT:    movb %r15b, 3(%rax)
-; CHECK-SSE1-NEXT:    movb %bpl, 2(%rax)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rax)
-; CHECK-SSE1-NEXT:    movb %dil, (%rax)
+; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
+; CHECK-SSE1-NEXT:    movb %bl, 7(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 6(%rax)
+; CHECK-SSE1-NEXT:    movb %bpl, 5(%rax)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rax)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
-; CHECK-SSE1-NEXT:    popq %r13
 ; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %r15
 ; CHECK-SSE1-NEXT:    popq %rbp
@@ -485,64 +438,50 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    andl %ebx, %esi
-; CHECK-BASELINE-NEXT:    andl %edi, %r8d
-; CHECK-BASELINE-NEXT:    andl %r11d, %ecx
-; CHECK-BASELINE-NEXT:    andl %r10d, %edx
-; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    notl %ebx
-; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
-; CHECK-BASELINE-NEXT:    orl %esi, %ebx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT:    orl %r8d, %edi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-BASELINE-NEXT:    orl %ecx, %r11d
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-BASELINE-NEXT:    orl %edx, %r10d
-; CHECK-BASELINE-NEXT:    movw %bx, (%rax)
-; CHECK-BASELINE-NEXT:    movw %di, 6(%rax)
-; CHECK-BASELINE-NEXT:    movw %r11w, 4(%rax)
-; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rax)
-; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    andl %ebx, %esi
-; CHECK-SSE1-NEXT:    andl %edi, %r8d
-; CHECK-SSE1-NEXT:    andl %r11d, %ecx
-; CHECK-SSE1-NEXT:    andl %r10d, %edx
-; CHECK-SSE1-NEXT:    notl %r10d
-; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    notl %edi
-; CHECK-SSE1-NEXT:    notl %ebx
-; CHECK-SSE1-NEXT:    andl %r9d, %ebx
-; CHECK-SSE1-NEXT:    orl %esi, %ebx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT:    orl %r8d, %edi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-SSE1-NEXT:    orl %ecx, %r11d
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-SSE1-NEXT:    orl %edx, %r10d
-; CHECK-SSE1-NEXT:    movw %bx, (%rax)
-; CHECK-SSE1-NEXT:    movw %di, 6(%rax)
-; CHECK-SSE1-NEXT:    movw %r11w, 4(%rax)
-; CHECK-SSE1-NEXT:    movw %r10w, 2(%rax)
-; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
+; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16:
@@ -567,51 +506,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    andl %edi, %esi
-; CHECK-BASELINE-NEXT:    andl %r11d, %r8d
-; CHECK-BASELINE-NEXT:    andl %r10d, %edx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    andl %r9d, %edi
-; CHECK-BASELINE-NEXT:    orl %esi, %edi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-BASELINE-NEXT:    orl %r8d, %r11d
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-BASELINE-NEXT:    orl %edx, %r10d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
-; CHECK-BASELINE-NEXT:    movw %di, (%rax)
-; CHECK-BASELINE-NEXT:    movw %r11w, 6(%rax)
-; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    andl %edi, %esi
-; CHECK-SSE1-NEXT:    andl %r11d, %r8d
-; CHECK-SSE1-NEXT:    andl %r10d, %edx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    notl %r10d
-; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    notl %edi
-; CHECK-SSE1-NEXT:    andl %r9d, %edi
-; CHECK-SSE1-NEXT:    orl %esi, %edi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-SSE1-NEXT:    orl %r8d, %r11d
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-SSE1-NEXT:    orl %edx, %r10d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
-; CHECK-SSE1-NEXT:    movw %di, (%rax)
-; CHECK-SSE1-NEXT:    movw %r11w, 6(%rax)
-; CHECK-SSE1-NEXT:    movw %r10w, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -635,30 +566,26 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i32:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
-; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    andl %edx, %eax
-; CHECK-BASELINE-NEXT:    orl %edi, %eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i32:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    notl %r9d
-; CHECK-SSE1-NEXT:    andl %ecx, %r9d
-; CHECK-SSE1-NEXT:    orl %esi, %r9d
-; CHECK-SSE1-NEXT:    andl %edx, %eax
-; CHECK-SSE1-NEXT:    orl %edi, %eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i32:
@@ -682,11 +609,10 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    andq %rdx, %rdi
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    andq %rsi, %rax
-; CHECK-NEXT:    orq %rdi, %rax
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
+; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -708,120 +634,105 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movl %edx, %r11d
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %cl, %sil
-; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %bl, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %bl, %sil
+; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %sil, %cl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %dl, %sil
-; CHECK-BASELINE-NEXT:    notb %dl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    orb %sil, %dl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %bl, %sil
-; CHECK-BASELINE-NEXT:    notb %bl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    orb %sil, %bl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r13b, %sil
-; CHECK-BASELINE-NEXT:    notb %r13b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    orb %sil, %r13b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r12b, %sil
-; CHECK-BASELINE-NEXT:    notb %r12b
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r9b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-BASELINE-NEXT:    xorb %r12b, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    orb %sil, %r12b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r15b, %sil
-; CHECK-BASELINE-NEXT:    notb %r15b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-BASELINE-NEXT:    orb %sil, %r15b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r14b, %sil
-; CHECK-BASELINE-NEXT:    notb %r14b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    orb %sil, %r14b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %bpl, %sil
-; CHECK-BASELINE-NEXT:    notb %bpl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %r14b, %bpl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    orb %sil, %bpl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r11b, %sil
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb %sil, %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %r10b, %sil
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %dil, %sil
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-BASELINE-NEXT:    orb %sil, %dil
-; CHECK-BASELINE-NEXT:    movb %cl, 15(%rax)
-; CHECK-BASELINE-NEXT:    movb %dl, 14(%rax)
-; CHECK-BASELINE-NEXT:    movb %bl, 13(%rax)
-; CHECK-BASELINE-NEXT:    movb %r13b, 12(%rax)
-; CHECK-BASELINE-NEXT:    movb %r12b, 11(%rax)
-; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rax)
-; CHECK-BASELINE-NEXT:    movb %r14b, 9(%rax)
-; CHECK-BASELINE-NEXT:    movb %bpl, 8(%rax)
-; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rax)
-; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rax)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %r9b, %cl
-; CHECK-BASELINE-NEXT:    movb %dil, 5(%rax)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    andb %dl, %r8b
-; CHECK-BASELINE-NEXT:    notb %dl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    orb %r8b, %dl
-; CHECK-BASELINE-NEXT:    movb %cl, 4(%rax)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %sil
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %sil, %cl
-; CHECK-BASELINE-NEXT:    movb %dl, 3(%rax)
+; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %dl, %sil
-; CHECK-BASELINE-NEXT:    notb %dl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    orb %sil, %dl
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %sil
-; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %sil, %cl
-; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
-; CHECK-BASELINE-NEXT:    movb %cl, (%rax)
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %al, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %al, %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %r8b, %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    xorb %r8b, %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
+; CHECK-BASELINE-NEXT:    movb %r10b, 15(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r14b, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 9(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movb %sil, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bpl, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r12b, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -838,120 +749,105 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movl %edx, %r11d
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %cl, %sil
-; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %bl, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %bl, %sil
+; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorb %dl, %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %dl, %r11b
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %sil, %cl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %dl, %sil
-; CHECK-SSE1-NEXT:    notb %dl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    orb %sil, %dl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %bl, %sil
-; CHECK-SSE1-NEXT:    notb %bl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    orb %sil, %bl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r13b, %sil
-; CHECK-SSE1-NEXT:    notb %r13b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    orb %sil, %r13b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r12b, %sil
-; CHECK-SSE1-NEXT:    notb %r12b
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorb %r12b, %r9b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-SSE1-NEXT:    xorb %r12b, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    orb %sil, %r12b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r15b, %sil
-; CHECK-SSE1-NEXT:    notb %r15b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-SSE1-NEXT:    orb %sil, %r15b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r14b, %sil
-; CHECK-SSE1-NEXT:    notb %r14b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    orb %sil, %r14b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %bpl, %sil
-; CHECK-SSE1-NEXT:    notb %bpl
+; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %r14b, %bpl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    orb %sil, %bpl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r11b, %sil
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb %sil, %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %r10b, %sil
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    xorb %r14b, %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %dil, %sil
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-SSE1-NEXT:    orb %sil, %dil
-; CHECK-SSE1-NEXT:    movb %cl, 15(%rax)
-; CHECK-SSE1-NEXT:    movb %dl, 14(%rax)
-; CHECK-SSE1-NEXT:    movb %bl, 13(%rax)
-; CHECK-SSE1-NEXT:    movb %r13b, 12(%rax)
-; CHECK-SSE1-NEXT:    movb %r12b, 11(%rax)
-; CHECK-SSE1-NEXT:    movb %r15b, 10(%rax)
-; CHECK-SSE1-NEXT:    movb %r14b, 9(%rax)
-; CHECK-SSE1-NEXT:    movb %bpl, 8(%rax)
-; CHECK-SSE1-NEXT:    movb %r11b, 7(%rax)
-; CHECK-SSE1-NEXT:    movb %r10b, 6(%rax)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %r9b, %cl
-; CHECK-SSE1-NEXT:    movb %dil, 5(%rax)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    andb %dl, %r8b
-; CHECK-SSE1-NEXT:    notb %dl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    orb %r8b, %dl
-; CHECK-SSE1-NEXT:    movb %cl, 4(%rax)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %sil
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %sil, %cl
-; CHECK-SSE1-NEXT:    movb %dl, 3(%rax)
+; CHECK-SSE1-NEXT:    xorb %r15b, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %r15b, %sil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %dl, %sil
-; CHECK-SSE1-NEXT:    notb %dl
+; CHECK-SSE1-NEXT:    xorb %r13b, %dl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    orb %sil, %dl
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    xorb %r13b, %dl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %sil
-; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %sil, %cl
-; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
-; CHECK-SSE1-NEXT:    movb %cl, (%rax)
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %al, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %al, %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %r8b, %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    xorb %r8b, %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
+; CHECK-SSE1-NEXT:    movb %r10b, 15(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 13(%rdi)
+; CHECK-SSE1-NEXT:    movb %r14b, 12(%rdi)
+; CHECK-SSE1-NEXT:    movb %r15b, 11(%rdi)
+; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 9(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 8(%rdi)
+; CHECK-SSE1-NEXT:    movb %sil, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %bpl, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %r12b, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -982,124 +878,116 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-LABEL: out_v8i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbp
+; CHECK-BASELINE-NEXT:    pushq %r15
 ; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    andw %r14w, %bx
-; CHECK-BASELINE-NEXT:    notl %r14d
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
-; CHECK-BASELINE-NEXT:    orl %ebx, %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    andw %r11w, %di
-; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-BASELINE-NEXT:    orl %edi, %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    andw %r10w, %di
-; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-BASELINE-NEXT:    orl %edi, %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    andl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    notl %ebx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-BASELINE-NEXT:    orl %r9d, %ebx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    andl %edi, %r8d
-; CHECK-BASELINE-NEXT:    notl %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
+; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
+; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r8d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r9d
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r9d
+; CHECK-BASELINE-NEXT:    movl %r14d, %edi
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %di
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-BASELINE-NEXT:    orl %r8d, %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    andl %ebp, %ecx
-; CHECK-BASELINE-NEXT:    notl %ebp
+; CHECK-BASELINE-NEXT:    xorl %r14d, %edi
+; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
+; CHECK-BASELINE-NEXT:    movl %r10d, %ebp
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bp
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bp
-; CHECK-BASELINE-NEXT:    orl %ecx, %ebp
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-BASELINE-NEXT:    andl %ecx, %edx
-; CHECK-BASELINE-NEXT:    notl %ecx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    orl %edx, %ecx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edx
-; CHECK-BASELINE-NEXT:    andl %edx, %esi
-; CHECK-BASELINE-NEXT:    notl %edx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
-; CHECK-BASELINE-NEXT:    orl %esi, %edx
-; CHECK-BASELINE-NEXT:    movw %r14w, 14(%rax)
-; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
-; CHECK-BASELINE-NEXT:    movw %r10w, 10(%rax)
-; CHECK-BASELINE-NEXT:    movw %bx, 8(%rax)
-; CHECK-BASELINE-NEXT:    movw %di, 6(%rax)
-; CHECK-BASELINE-NEXT:    movw %bp, 4(%rax)
-; CHECK-BASELINE-NEXT:    movw %cx, 2(%rax)
-; CHECK-BASELINE-NEXT:    movw %dx, (%rax)
+; CHECK-BASELINE-NEXT:    xorl %r10d, %ebp
+; CHECK-BASELINE-NEXT:    movw %bp, 14(%rax)
+; CHECK-BASELINE-NEXT:    movw %bx, 12(%rax)
+; CHECK-BASELINE-NEXT:    movw %di, 10(%rax)
+; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r14
+; CHECK-BASELINE-NEXT:    popq %r15
 ; CHECK-BASELINE-NEXT:    popq %rbp
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v8i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbp
+; CHECK-SSE1-NEXT:    pushq %r15
 ; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    andw %r14w, %bx
-; CHECK-SSE1-NEXT:    notl %r14d
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r14w
-; CHECK-SSE1-NEXT:    orl %ebx, %r14d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    andw %r11w, %di
-; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
-; CHECK-SSE1-NEXT:    orl %edi, %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    andw %r10w, %di
-; CHECK-SSE1-NEXT:    notl %r10d
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
-; CHECK-SSE1-NEXT:    orl %edi, %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    andl %ebx, %r9d
-; CHECK-SSE1-NEXT:    notl %ebx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-SSE1-NEXT:    orl %r9d, %ebx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    andl %edi, %r8d
-; CHECK-SSE1-NEXT:    notl %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    xorl %ebp, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %ebp, %esi
+; CHECK-SSE1-NEXT:    xorl %ebx, %edx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT:    xorl %ebx, %edx
+; CHECK-SSE1-NEXT:    xorl %edi, %ecx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    xorl %edi, %ecx
+; CHECK-SSE1-NEXT:    xorl %r12d, %r8d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT:    xorl %r12d, %r8d
+; CHECK-SSE1-NEXT:    xorl %r15d, %r9d
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
+; CHECK-SSE1-NEXT:    xorl %r15d, %r9d
+; CHECK-SSE1-NEXT:    movl %r14d, %edi
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %di
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
-; CHECK-SSE1-NEXT:    orl %r8d, %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    andl %ebp, %ecx
-; CHECK-SSE1-NEXT:    notl %ebp
+; CHECK-SSE1-NEXT:    xorl %r14d, %edi
+; CHECK-SSE1-NEXT:    movl %r11d, %ebx
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
+; CHECK-SSE1-NEXT:    movl %r10d, %ebp
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bp
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bp
-; CHECK-SSE1-NEXT:    orl %ecx, %ebp
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-SSE1-NEXT:    andl %ecx, %edx
-; CHECK-SSE1-NEXT:    notl %ecx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    orl %edx, %ecx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edx
-; CHECK-SSE1-NEXT:    andl %edx, %esi
-; CHECK-SSE1-NEXT:    notl %edx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
-; CHECK-SSE1-NEXT:    orl %esi, %edx
-; CHECK-SSE1-NEXT:    movw %r14w, 14(%rax)
-; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
-; CHECK-SSE1-NEXT:    movw %r10w, 10(%rax)
-; CHECK-SSE1-NEXT:    movw %bx, 8(%rax)
-; CHECK-SSE1-NEXT:    movw %di, 6(%rax)
-; CHECK-SSE1-NEXT:    movw %bp, 4(%rax)
-; CHECK-SSE1-NEXT:    movw %cx, 2(%rax)
-; CHECK-SSE1-NEXT:    movw %dx, (%rax)
+; CHECK-SSE1-NEXT:    xorl %r10d, %ebp
+; CHECK-SSE1-NEXT:    movw %bp, 14(%rax)
+; CHECK-SSE1-NEXT:    movw %bx, 12(%rax)
+; CHECK-SSE1-NEXT:    movw %di, 10(%rax)
+; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r14
+; CHECK-SSE1-NEXT:    popq %r15
 ; CHECK-SSE1-NEXT:    popq %rbp
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -1124,37 +1012,31 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 define <4 x i32> @out_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i32:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl (%rcx), %r8d
-; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r9d
-; CHECK-BASELINE-NEXT:    movl 8(%rcx), %edi
-; CHECK-BASELINE-NEXT:    movl 12(%rcx), %ecx
-; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r10d
-; CHECK-BASELINE-NEXT:    andl %ecx, %r10d
-; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    andl %edi, %r11d
-; CHECK-BASELINE-NEXT:    movl 4(%rsi), %ebx
-; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
-; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %esi
-; CHECK-BASELINE-NEXT:    notl %r8d
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    notl %ecx
-; CHECK-BASELINE-NEXT:    andl 12(%rdx), %ecx
-; CHECK-BASELINE-NEXT:    orl %r10d, %ecx
-; CHECK-BASELINE-NEXT:    andl 8(%rdx), %edi
-; CHECK-BASELINE-NEXT:    orl %r11d, %edi
-; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    orl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    andl (%rdx), %r8d
-; CHECK-BASELINE-NEXT:    orl %esi, %r8d
-; CHECK-BASELINE-NEXT:    movl %ecx, 12(%rax)
-; CHECK-BASELINE-NEXT:    movl %edi, 8(%rax)
-; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
-; CHECK-BASELINE-NEXT:    movl %r8d, (%rax)
-; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movl (%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movl (%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorl %edi, %r11d
+; CHECK-BASELINE-NEXT:    andl (%rcx), %r11d
+; CHECK-BASELINE-NEXT:    xorl %edi, %r11d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    andl 4(%rcx), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
+; CHECK-BASELINE-NEXT:    andl 8(%rcx), %edx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %edx, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %r11d, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i32:
@@ -1197,30 +1079,27 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pm
 ; CHECK-BASELINE-LABEL: out_v4i32_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r8d
-; CHECK-BASELINE-NEXT:    movl (%rcx), %r9d
-; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r10d
-; CHECK-BASELINE-NEXT:    movl 12(%rcx), %edi
-; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r8d
-; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ecx
-; CHECK-BASELINE-NEXT:    andl %edi, %ecx
-; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    andl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    andl 12(%rdx), %edi
-; CHECK-BASELINE-NEXT:    orl %ecx, %edi
-; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    orl %r11d, %r10d
-; CHECK-BASELINE-NEXT:    andl (%rdx), %r9d
-; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    movl %r8d, 8(%rax)
-; CHECK-BASELINE-NEXT:    movl %edi, 12(%rax)
-; CHECK-BASELINE-NEXT:    movl %r10d, 4(%rax)
-; CHECK-BASELINE-NEXT:    movl %r9d, (%rax)
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r9d
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movl (%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r9d
+; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    andl 4(%rcx), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    movl %r9d, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i32_undef:
@@ -1262,30 +1141,26 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pm
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i64:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movq %r8, %rax
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
+; CHECK-BASELINE-NEXT:    andq %r8, %rax
+; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
+; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
 ; CHECK-BASELINE-NEXT:    andq %r9, %rsi
-; CHECK-BASELINE-NEXT:    andq %r8, %rdi
-; CHECK-BASELINE-NEXT:    notq %rax
-; CHECK-BASELINE-NEXT:    notq %r9
-; CHECK-BASELINE-NEXT:    andq %rcx, %r9
-; CHECK-BASELINE-NEXT:    orq %rsi, %r9
-; CHECK-BASELINE-NEXT:    andq %rdx, %rax
-; CHECK-BASELINE-NEXT:    orq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movq %r9, %rdx
+; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
+; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i64:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movq %r8, %rax
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorq %rdx, %rax
+; CHECK-SSE1-NEXT:    andq %r8, %rax
+; CHECK-SSE1-NEXT:    xorq %rdx, %rax
+; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
 ; CHECK-SSE1-NEXT:    andq %r9, %rsi
-; CHECK-SSE1-NEXT:    andq %r8, %rdi
-; CHECK-SSE1-NEXT:    notq %rax
-; CHECK-SSE1-NEXT:    notq %r9
-; CHECK-SSE1-NEXT:    andq %rcx, %r9
-; CHECK-SSE1-NEXT:    orq %rsi, %r9
-; CHECK-SSE1-NEXT:    andq %rdx, %rax
-; CHECK-SSE1-NEXT:    orq %rdi, %rax
-; CHECK-SSE1-NEXT:    movq %r9, %rdx
+; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
+; CHECK-SSE1-NEXT:    movq %rsi, %rdx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i64:
@@ -1319,286 +1194,257 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r15
-; CHECK-BASELINE-NEXT:    movq %rsi, %r12
-; CHECK-BASELINE-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-BASELINE-NEXT:    movb 16(%rcx), %al
+; CHECK-BASELINE-NEXT:    movq %rcx, %r10
+; CHECK-BASELINE-NEXT:    movq %rdx, %r8
+; CHECK-BASELINE-NEXT:    movq %rsi, %r9
+; CHECK-BASELINE-NEXT:    movq %rdi, %r11
+; CHECK-BASELINE-NEXT:    movb 15(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 17(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb 14(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 18(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb 13(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 19(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb 12(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 20(%rcx), %al
+; CHECK-BASELINE-NEXT:    movb 11(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 21(%rcx), %r14b
-; CHECK-BASELINE-NEXT:    movb 22(%rcx), %r9b
-; CHECK-BASELINE-NEXT:    movb 23(%rcx), %r10b
-; CHECK-BASELINE-NEXT:    movb 24(%rcx), %r11b
-; CHECK-BASELINE-NEXT:    movb 25(%rcx), %bpl
-; CHECK-BASELINE-NEXT:    movb 26(%rcx), %r13b
-; CHECK-BASELINE-NEXT:    movb 27(%rcx), %r8b
-; CHECK-BASELINE-NEXT:    movb 28(%rcx), %dil
-; CHECK-BASELINE-NEXT:    movb 29(%rcx), %sil
-; CHECK-BASELINE-NEXT:    movb 30(%rcx), %bl
-; CHECK-BASELINE-NEXT:    movb 31(%rcx), %al
-; CHECK-BASELINE-NEXT:    movb 31(%r12), %cl
-; CHECK-BASELINE-NEXT:    andb %al, %cl
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb 31(%rdx), %al
-; CHECK-BASELINE-NEXT:    orb %cl, %al
+; CHECK-BASELINE-NEXT:    movb 10(%rdx), %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 30(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %bl, %al
-; CHECK-BASELINE-NEXT:    notb %bl
-; CHECK-BASELINE-NEXT:    andb 30(%rdx), %bl
-; CHECK-BASELINE-NEXT:    orb %al, %bl
+; CHECK-BASELINE-NEXT:    movb 9(%rdx), %bpl
+; CHECK-BASELINE-NEXT:    movb 8(%rdx), %r14b
+; CHECK-BASELINE-NEXT:    movb 7(%rdx), %r15b
+; CHECK-BASELINE-NEXT:    movb 6(%rdx), %r12b
+; CHECK-BASELINE-NEXT:    movb 5(%rdx), %sil
+; CHECK-BASELINE-NEXT:    movb 4(%rdx), %r13b
+; CHECK-BASELINE-NEXT:    movb 3(%rdx), %dl
+; CHECK-BASELINE-NEXT:    movb 2(%r8), %dil
+; CHECK-BASELINE-NEXT:    movb (%r8), %al
+; CHECK-BASELINE-NEXT:    movb 1(%r8), %cl
+; CHECK-BASELINE-NEXT:    movb (%r9), %bl
+; CHECK-BASELINE-NEXT:    xorb %al, %bl
+; CHECK-BASELINE-NEXT:    andb (%r10), %bl
+; CHECK-BASELINE-NEXT:    xorb %al, %bl
 ; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 29(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %sil, %al
-; CHECK-BASELINE-NEXT:    notb %sil
-; CHECK-BASELINE-NEXT:    andb 29(%rdx), %sil
-; CHECK-BASELINE-NEXT:    orb %al, %sil
-; CHECK-BASELINE-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 28(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %dil, %al
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    andb 28(%rdx), %dil
-; CHECK-BASELINE-NEXT:    orb %al, %dil
-; CHECK-BASELINE-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 27(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r8b, %al
-; CHECK-BASELINE-NEXT:    notb %r8b
-; CHECK-BASELINE-NEXT:    andb 27(%rdx), %r8b
-; CHECK-BASELINE-NEXT:    orb %al, %r8b
-; CHECK-BASELINE-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 26(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r13b, %al
-; CHECK-BASELINE-NEXT:    notb %r13b
-; CHECK-BASELINE-NEXT:    andb 26(%rdx), %r13b
-; CHECK-BASELINE-NEXT:    orb %al, %r13b
-; CHECK-BASELINE-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 25(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %bpl, %al
-; CHECK-BASELINE-NEXT:    notb %bpl
-; CHECK-BASELINE-NEXT:    andb 25(%rdx), %bpl
-; CHECK-BASELINE-NEXT:    orb %al, %bpl
-; CHECK-BASELINE-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 24(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r11b, %al
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    andb 24(%rdx), %r11b
-; CHECK-BASELINE-NEXT:    orb %al, %r11b
-; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 23(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r10b, %al
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    andb 23(%rdx), %r10b
-; CHECK-BASELINE-NEXT:    orb %al, %r10b
-; CHECK-BASELINE-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 22(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r9b, %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb 22(%rdx), %r9b
-; CHECK-BASELINE-NEXT:    orb %al, %r9b
-; CHECK-BASELINE-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 21(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r14b, %al
-; CHECK-BASELINE-NEXT:    notb %r14b
-; CHECK-BASELINE-NEXT:    andb 21(%rdx), %r14b
-; CHECK-BASELINE-NEXT:    orb %al, %r14b
-; CHECK-BASELINE-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 20(%r12), %al
+; CHECK-BASELINE-NEXT:    movb 1(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 1(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 2(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 2(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 3(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    andb 3(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 4(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %r13b, %al
+; CHECK-BASELINE-NEXT:    andb 4(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %r13b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 5(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %sil, %al
+; CHECK-BASELINE-NEXT:    andb 5(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %sil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 6(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
+; CHECK-BASELINE-NEXT:    andb 6(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 7(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
+; CHECK-BASELINE-NEXT:    andb 7(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 8(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    andb 8(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 9(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %bpl, %al
+; CHECK-BASELINE-NEXT:    andb 9(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %bpl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 10(%r9), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 20(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 19(%r12), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 10(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 11(%r9), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 19(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 18(%r12), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 11(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 12(%r9), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 18(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 17(%r12), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 12(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 13(%r9), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    movq %rdx, %rbx
-; CHECK-BASELINE-NEXT:    andb 17(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 16(%r12), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 13(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 14(%r9), %al
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 14(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 15(%r9), %al
 ; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 16(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    andb 15(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movb 16(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 16(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 16(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 15(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 15(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 15(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 17(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 17(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 17(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 14(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 14(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 14(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 18(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 18(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 18(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 13(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 13(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 13(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 19(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 19(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 19(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 12(%r15), %cl
-; CHECK-BASELINE-NEXT:    movb 12(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %cl, %al
-; CHECK-BASELINE-NEXT:    notb %cl
-; CHECK-BASELINE-NEXT:    andb 12(%rdx), %cl
-; CHECK-BASELINE-NEXT:    orb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 20(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 20(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 20(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movb 11(%r15), %r13b
-; CHECK-BASELINE-NEXT:    movb 11(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r13b, %al
-; CHECK-BASELINE-NEXT:    notb %r13b
-; CHECK-BASELINE-NEXT:    andb 11(%rdx), %r13b
-; CHECK-BASELINE-NEXT:    orb %al, %r13b
-; CHECK-BASELINE-NEXT:    movb 10(%r15), %r14b
-; CHECK-BASELINE-NEXT:    movb 10(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r14b, %al
-; CHECK-BASELINE-NEXT:    notb %r14b
-; CHECK-BASELINE-NEXT:    andb 10(%rdx), %r14b
-; CHECK-BASELINE-NEXT:    orb %al, %r14b
-; CHECK-BASELINE-NEXT:    movb 9(%r15), %bpl
-; CHECK-BASELINE-NEXT:    movb 9(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %bpl, %al
-; CHECK-BASELINE-NEXT:    notb %bpl
-; CHECK-BASELINE-NEXT:    andb 9(%rdx), %bpl
-; CHECK-BASELINE-NEXT:    orb %al, %bpl
-; CHECK-BASELINE-NEXT:    movb 8(%r15), %r11b
-; CHECK-BASELINE-NEXT:    movb 8(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r11b, %al
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    andb 8(%rdx), %r11b
-; CHECK-BASELINE-NEXT:    orb %al, %r11b
-; CHECK-BASELINE-NEXT:    movb 7(%r15), %r10b
-; CHECK-BASELINE-NEXT:    movb 7(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r10b, %al
-; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    andb 7(%rdx), %r10b
-; CHECK-BASELINE-NEXT:    orb %al, %r10b
-; CHECK-BASELINE-NEXT:    movb 6(%r15), %r9b
-; CHECK-BASELINE-NEXT:    movb 6(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r9b, %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb 6(%rdx), %r9b
-; CHECK-BASELINE-NEXT:    orb %al, %r9b
-; CHECK-BASELINE-NEXT:    movb 5(%r15), %r8b
-; CHECK-BASELINE-NEXT:    movb 5(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %r8b, %al
-; CHECK-BASELINE-NEXT:    notb %r8b
-; CHECK-BASELINE-NEXT:    andb 5(%rdx), %r8b
-; CHECK-BASELINE-NEXT:    orb %al, %r8b
-; CHECK-BASELINE-NEXT:    movb 4(%r15), %dil
-; CHECK-BASELINE-NEXT:    movb 4(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %dil, %al
-; CHECK-BASELINE-NEXT:    notb %dil
-; CHECK-BASELINE-NEXT:    andb 4(%rdx), %dil
-; CHECK-BASELINE-NEXT:    orb %al, %dil
-; CHECK-BASELINE-NEXT:    movb 3(%r15), %sil
-; CHECK-BASELINE-NEXT:    movb 3(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %sil, %al
-; CHECK-BASELINE-NEXT:    notb %sil
-; CHECK-BASELINE-NEXT:    andb 3(%rdx), %sil
-; CHECK-BASELINE-NEXT:    orb %al, %sil
-; CHECK-BASELINE-NEXT:    movb 2(%r15), %dl
-; CHECK-BASELINE-NEXT:    movb 2(%r12), %al
-; CHECK-BASELINE-NEXT:    andb %dl, %al
-; CHECK-BASELINE-NEXT:    notb %dl
-; CHECK-BASELINE-NEXT:    andb 2(%rbx), %dl
-; CHECK-BASELINE-NEXT:    orb %al, %dl
-; CHECK-BASELINE-NEXT:    movb 1(%r15), %al
-; CHECK-BASELINE-NEXT:    movb 1(%r12), %cl
-; CHECK-BASELINE-NEXT:    andb %al, %cl
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb 1(%rbx), %al
-; CHECK-BASELINE-NEXT:    orb %cl, %al
-; CHECK-BASELINE-NEXT:    movb (%r15), %r15b
-; CHECK-BASELINE-NEXT:    movb (%r12), %cl
-; CHECK-BASELINE-NEXT:    andb %r15b, %cl
-; CHECK-BASELINE-NEXT:    notb %r15b
-; CHECK-BASELINE-NEXT:    andb (%rbx), %r15b
-; CHECK-BASELINE-NEXT:    orb %cl, %r15b
-; CHECK-BASELINE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 31(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 30(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 29(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 28(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 27(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 26(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 25(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 24(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 23(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 22(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 21(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 20(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 19(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 18(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 17(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 16(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 15(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 14(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 13(%rcx)
-; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-BASELINE-NEXT:    movb %bl, 12(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r13b, 11(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r14b, 10(%rcx)
-; CHECK-BASELINE-NEXT:    movb %bpl, 9(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r11b, 8(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r10b, 7(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r9b, 6(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r8b, 5(%rcx)
-; CHECK-BASELINE-NEXT:    movb %dil, 4(%rcx)
-; CHECK-BASELINE-NEXT:    movb %sil, 3(%rcx)
-; CHECK-BASELINE-NEXT:    movb %dl, 2(%rcx)
-; CHECK-BASELINE-NEXT:    movb %al, 1(%rcx)
-; CHECK-BASELINE-NEXT:    movb %r15b, (%rcx)
-; CHECK-BASELINE-NEXT:    movq %rcx, %rax
+; CHECK-BASELINE-NEXT:    movb 21(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 21(%r9), %r13b
+; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    andb 21(%r10), %r13b
+; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    movb 22(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 22(%r9), %r12b
+; CHECK-BASELINE-NEXT:    xorb %al, %r12b
+; CHECK-BASELINE-NEXT:    andb 22(%r10), %r12b
+; CHECK-BASELINE-NEXT:    xorb %al, %r12b
+; CHECK-BASELINE-NEXT:    movb 23(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 23(%r9), %r15b
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    andb 23(%r10), %r15b
+; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    movb 24(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 24(%r9), %r14b
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    andb 24(%r10), %r14b
+; CHECK-BASELINE-NEXT:    xorb %al, %r14b
+; CHECK-BASELINE-NEXT:    movb 25(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 25(%r9), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    andb 25(%r10), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    movb 26(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 26(%r9), %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    andb 26(%r10), %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    movb 27(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 27(%r9), %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    andb 27(%r10), %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    movb 28(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 28(%r9), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    andb 28(%r10), %dl
+; CHECK-BASELINE-NEXT:    xorb %al, %dl
+; CHECK-BASELINE-NEXT:    movb 29(%r8), %al
+; CHECK-BASELINE-NEXT:    movb 29(%r9), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    andb 29(%r10), %cl
+; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    movb 30(%r8), %bl
+; CHECK-BASELINE-NEXT:    movb 30(%r9), %al
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    andb 30(%r10), %al
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    movb 31(%r8), %r8b
+; CHECK-BASELINE-NEXT:    movb 31(%r9), %bl
+; CHECK-BASELINE-NEXT:    xorb %r8b, %bl
+; CHECK-BASELINE-NEXT:    andb 31(%r10), %bl
+; CHECK-BASELINE-NEXT:    xorb %r8b, %bl
+; CHECK-BASELINE-NEXT:    movb %bl, 31(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 30(%r11)
+; CHECK-BASELINE-NEXT:    movb %cl, 29(%r11)
+; CHECK-BASELINE-NEXT:    movb %dl, 28(%r11)
+; CHECK-BASELINE-NEXT:    movb %sil, 27(%r11)
+; CHECK-BASELINE-NEXT:    movb %dil, 26(%r11)
+; CHECK-BASELINE-NEXT:    movb %bpl, 25(%r11)
+; CHECK-BASELINE-NEXT:    movb %r14b, 24(%r11)
+; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r11)
+; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r11)
+; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 20(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 19(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 18(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 17(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 16(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 15(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 14(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 13(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 12(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 11(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 10(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 9(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 8(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 7(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 6(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 5(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 4(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 3(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 2(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, 1(%r11)
+; CHECK-BASELINE-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-BASELINE-NEXT:    movb %al, (%r11)
+; CHECK-BASELINE-NEXT:    movq %r11, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1615,286 +1461,257 @@ define <32 x i8> @out_v32i8(<32 x i8> *%px, <32 x i8> *%py, <32 x i8> *%pmask) n
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r15
-; CHECK-SSE1-NEXT:    movq %rsi, %r12
-; CHECK-SSE1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-SSE1-NEXT:    movb 16(%rcx), %al
+; CHECK-SSE1-NEXT:    movq %rcx, %r10
+; CHECK-SSE1-NEXT:    movq %rdx, %r8
+; CHECK-SSE1-NEXT:    movq %rsi, %r9
+; CHECK-SSE1-NEXT:    movq %rdi, %r11
+; CHECK-SSE1-NEXT:    movb 15(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 17(%rcx), %al
+; CHECK-SSE1-NEXT:    movb 14(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 18(%rcx), %al
+; CHECK-SSE1-NEXT:    movb 13(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 19(%rcx), %al
+; CHECK-SSE1-NEXT:    movb 12(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 20(%rcx), %al
+; CHECK-SSE1-NEXT:    movb 11(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 21(%rcx), %r14b
-; CHECK-SSE1-NEXT:    movb 22(%rcx), %r9b
-; CHECK-SSE1-NEXT:    movb 23(%rcx), %r10b
-; CHECK-SSE1-NEXT:    movb 24(%rcx), %r11b
-; CHECK-SSE1-NEXT:    movb 25(%rcx), %bpl
-; CHECK-SSE1-NEXT:    movb 26(%rcx), %r13b
-; CHECK-SSE1-NEXT:    movb 27(%rcx), %r8b
-; CHECK-SSE1-NEXT:    movb 28(%rcx), %dil
-; CHECK-SSE1-NEXT:    movb 29(%rcx), %sil
-; CHECK-SSE1-NEXT:    movb 30(%rcx), %bl
-; CHECK-SSE1-NEXT:    movb 31(%rcx), %al
-; CHECK-SSE1-NEXT:    movb 31(%r12), %cl
-; CHECK-SSE1-NEXT:    andb %al, %cl
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb 31(%rdx), %al
-; CHECK-SSE1-NEXT:    orb %cl, %al
+; CHECK-SSE1-NEXT:    movb 10(%rdx), %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 30(%r12), %al
-; CHECK-SSE1-NEXT:    andb %bl, %al
-; CHECK-SSE1-NEXT:    notb %bl
-; CHECK-SSE1-NEXT:    andb 30(%rdx), %bl
-; CHECK-SSE1-NEXT:    orb %al, %bl
+; CHECK-SSE1-NEXT:    movb 9(%rdx), %bpl
+; CHECK-SSE1-NEXT:    movb 8(%rdx), %r14b
+; CHECK-SSE1-NEXT:    movb 7(%rdx), %r15b
+; CHECK-SSE1-NEXT:    movb 6(%rdx), %r12b
+; CHECK-SSE1-NEXT:    movb 5(%rdx), %sil
+; CHECK-SSE1-NEXT:    movb 4(%rdx), %r13b
+; CHECK-SSE1-NEXT:    movb 3(%rdx), %dl
+; CHECK-SSE1-NEXT:    movb 2(%r8), %dil
+; CHECK-SSE1-NEXT:    movb (%r8), %al
+; CHECK-SSE1-NEXT:    movb 1(%r8), %cl
+; CHECK-SSE1-NEXT:    movb (%r9), %bl
+; CHECK-SSE1-NEXT:    xorb %al, %bl
+; CHECK-SSE1-NEXT:    andb (%r10), %bl
+; CHECK-SSE1-NEXT:    xorb %al, %bl
 ; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 29(%r12), %al
-; CHECK-SSE1-NEXT:    andb %sil, %al
-; CHECK-SSE1-NEXT:    notb %sil
-; CHECK-SSE1-NEXT:    andb 29(%rdx), %sil
-; CHECK-SSE1-NEXT:    orb %al, %sil
-; CHECK-SSE1-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 28(%r12), %al
-; CHECK-SSE1-NEXT:    andb %dil, %al
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    andb 28(%rdx), %dil
-; CHECK-SSE1-NEXT:    orb %al, %dil
-; CHECK-SSE1-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 27(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r8b, %al
-; CHECK-SSE1-NEXT:    notb %r8b
-; CHECK-SSE1-NEXT:    andb 27(%rdx), %r8b
-; CHECK-SSE1-NEXT:    orb %al, %r8b
-; CHECK-SSE1-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 26(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r13b, %al
-; CHECK-SSE1-NEXT:    notb %r13b
-; CHECK-SSE1-NEXT:    andb 26(%rdx), %r13b
-; CHECK-SSE1-NEXT:    orb %al, %r13b
-; CHECK-SSE1-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 25(%r12), %al
-; CHECK-SSE1-NEXT:    andb %bpl, %al
-; CHECK-SSE1-NEXT:    notb %bpl
-; CHECK-SSE1-NEXT:    andb 25(%rdx), %bpl
-; CHECK-SSE1-NEXT:    orb %al, %bpl
-; CHECK-SSE1-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 24(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r11b, %al
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    andb 24(%rdx), %r11b
-; CHECK-SSE1-NEXT:    orb %al, %r11b
-; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 23(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r10b, %al
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    andb 23(%rdx), %r10b
-; CHECK-SSE1-NEXT:    orb %al, %r10b
-; CHECK-SSE1-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 22(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r9b, %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb 22(%rdx), %r9b
-; CHECK-SSE1-NEXT:    orb %al, %r9b
-; CHECK-SSE1-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 21(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r14b, %al
-; CHECK-SSE1-NEXT:    notb %r14b
-; CHECK-SSE1-NEXT:    andb 21(%rdx), %r14b
-; CHECK-SSE1-NEXT:    orb %al, %r14b
-; CHECK-SSE1-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 20(%r12), %al
+; CHECK-SSE1-NEXT:    movb 1(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 1(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 2(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 2(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 3(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    andb 3(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 4(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %r13b, %al
+; CHECK-SSE1-NEXT:    andb 4(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %r13b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 5(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %sil, %al
+; CHECK-SSE1-NEXT:    andb 5(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %sil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 6(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
+; CHECK-SSE1-NEXT:    andb 6(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 7(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
+; CHECK-SSE1-NEXT:    andb 7(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 8(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    andb 8(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 9(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %bpl, %al
+; CHECK-SSE1-NEXT:    andb 9(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %bpl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 10(%r9), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 20(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 19(%r12), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 10(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 11(%r9), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 19(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 18(%r12), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 11(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 12(%r9), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 18(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 17(%r12), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 12(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 13(%r9), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    movq %rdx, %rbx
-; CHECK-SSE1-NEXT:    andb 17(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 16(%r12), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 13(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 14(%r9), %al
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 14(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 15(%r9), %al
 ; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 16(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    andb 15(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movb 16(%r8), %al
+; CHECK-SSE1-NEXT:    movb 16(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 16(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 15(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 15(%r12), %al
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 15(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb 17(%r8), %al
+; CHECK-SSE1-NEXT:    movb 17(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 17(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 14(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 14(%r12), %al
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 14(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb 18(%r8), %al
+; CHECK-SSE1-NEXT:    movb 18(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 18(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 13(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 13(%r12), %al
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 13(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb 19(%r8), %al
+; CHECK-SSE1-NEXT:    movb 19(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 19(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 12(%r15), %cl
-; CHECK-SSE1-NEXT:    movb 12(%r12), %al
-; CHECK-SSE1-NEXT:    andb %cl, %al
-; CHECK-SSE1-NEXT:    notb %cl
-; CHECK-SSE1-NEXT:    andb 12(%rdx), %cl
-; CHECK-SSE1-NEXT:    orb %al, %cl
+; CHECK-SSE1-NEXT:    movb 20(%r8), %al
+; CHECK-SSE1-NEXT:    movb 20(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 20(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movb 11(%r15), %r13b
-; CHECK-SSE1-NEXT:    movb 11(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r13b, %al
-; CHECK-SSE1-NEXT:    notb %r13b
-; CHECK-SSE1-NEXT:    andb 11(%rdx), %r13b
-; CHECK-SSE1-NEXT:    orb %al, %r13b
-; CHECK-SSE1-NEXT:    movb 10(%r15), %r14b
-; CHECK-SSE1-NEXT:    movb 10(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r14b, %al
-; CHECK-SSE1-NEXT:    notb %r14b
-; CHECK-SSE1-NEXT:    andb 10(%rdx), %r14b
-; CHECK-SSE1-NEXT:    orb %al, %r14b
-; CHECK-SSE1-NEXT:    movb 9(%r15), %bpl
-; CHECK-SSE1-NEXT:    movb 9(%r12), %al
-; CHECK-SSE1-NEXT:    andb %bpl, %al
-; CHECK-SSE1-NEXT:    notb %bpl
-; CHECK-SSE1-NEXT:    andb 9(%rdx), %bpl
-; CHECK-SSE1-NEXT:    orb %al, %bpl
-; CHECK-SSE1-NEXT:    movb 8(%r15), %r11b
-; CHECK-SSE1-NEXT:    movb 8(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r11b, %al
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    andb 8(%rdx), %r11b
-; CHECK-SSE1-NEXT:    orb %al, %r11b
-; CHECK-SSE1-NEXT:    movb 7(%r15), %r10b
-; CHECK-SSE1-NEXT:    movb 7(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r10b, %al
-; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    andb 7(%rdx), %r10b
-; CHECK-SSE1-NEXT:    orb %al, %r10b
-; CHECK-SSE1-NEXT:    movb 6(%r15), %r9b
-; CHECK-SSE1-NEXT:    movb 6(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r9b, %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb 6(%rdx), %r9b
-; CHECK-SSE1-NEXT:    orb %al, %r9b
-; CHECK-SSE1-NEXT:    movb 5(%r15), %r8b
-; CHECK-SSE1-NEXT:    movb 5(%r12), %al
-; CHECK-SSE1-NEXT:    andb %r8b, %al
-; CHECK-SSE1-NEXT:    notb %r8b
-; CHECK-SSE1-NEXT:    andb 5(%rdx), %r8b
-; CHECK-SSE1-NEXT:    orb %al, %r8b
-; CHECK-SSE1-NEXT:    movb 4(%r15), %dil
-; CHECK-SSE1-NEXT:    movb 4(%r12), %al
-; CHECK-SSE1-NEXT:    andb %dil, %al
-; CHECK-SSE1-NEXT:    notb %dil
-; CHECK-SSE1-NEXT:    andb 4(%rdx), %dil
-; CHECK-SSE1-NEXT:    orb %al, %dil
-; CHECK-SSE1-NEXT:    movb 3(%r15), %sil
-; CHECK-SSE1-NEXT:    movb 3(%r12), %al
-; CHECK-SSE1-NEXT:    andb %sil, %al
-; CHECK-SSE1-NEXT:    notb %sil
-; CHECK-SSE1-NEXT:    andb 3(%rdx), %sil
-; CHECK-SSE1-NEXT:    orb %al, %sil
-; CHECK-SSE1-NEXT:    movb 2(%r15), %dl
-; CHECK-SSE1-NEXT:    movb 2(%r12), %al
-; CHECK-SSE1-NEXT:    andb %dl, %al
-; CHECK-SSE1-NEXT:    notb %dl
-; CHECK-SSE1-NEXT:    andb 2(%rbx), %dl
-; CHECK-SSE1-NEXT:    orb %al, %dl
-; CHECK-SSE1-NEXT:    movb 1(%r15), %al
-; CHECK-SSE1-NEXT:    movb 1(%r12), %cl
-; CHECK-SSE1-NEXT:    andb %al, %cl
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb 1(%rbx), %al
-; CHECK-SSE1-NEXT:    orb %cl, %al
-; CHECK-SSE1-NEXT:    movb (%r15), %r15b
-; CHECK-SSE1-NEXT:    movb (%r12), %cl
-; CHECK-SSE1-NEXT:    andb %r15b, %cl
-; CHECK-SSE1-NEXT:    notb %r15b
-; CHECK-SSE1-NEXT:    andb (%rbx), %r15b
-; CHECK-SSE1-NEXT:    orb %cl, %r15b
-; CHECK-SSE1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 31(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 30(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 29(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 28(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 27(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 26(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 25(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 24(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 23(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 22(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 21(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 20(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 19(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 18(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 17(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 16(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 15(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 14(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 13(%rcx)
-; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload
-; CHECK-SSE1-NEXT:    movb %bl, 12(%rcx)
-; CHECK-SSE1-NEXT:    movb %r13b, 11(%rcx)
-; CHECK-SSE1-NEXT:    movb %r14b, 10(%rcx)
-; CHECK-SSE1-NEXT:    movb %bpl, 9(%rcx)
-; CHECK-SSE1-NEXT:    movb %r11b, 8(%rcx)
-; CHECK-SSE1-NEXT:    movb %r10b, 7(%rcx)
-; CHECK-SSE1-NEXT:    movb %r9b, 6(%rcx)
-; CHECK-SSE1-NEXT:    movb %r8b, 5(%rcx)
-; CHECK-SSE1-NEXT:    movb %dil, 4(%rcx)
-; CHECK-SSE1-NEXT:    movb %sil, 3(%rcx)
-; CHECK-SSE1-NEXT:    movb %dl, 2(%rcx)
-; CHECK-SSE1-NEXT:    movb %al, 1(%rcx)
-; CHECK-SSE1-NEXT:    movb %r15b, (%rcx)
-; CHECK-SSE1-NEXT:    movq %rcx, %rax
+; CHECK-SSE1-NEXT:    movb 21(%r8), %al
+; CHECK-SSE1-NEXT:    movb 21(%r9), %r13b
+; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    andb 21(%r10), %r13b
+; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    movb 22(%r8), %al
+; CHECK-SSE1-NEXT:    movb 22(%r9), %r12b
+; CHECK-SSE1-NEXT:    xorb %al, %r12b
+; CHECK-SSE1-NEXT:    andb 22(%r10), %r12b
+; CHECK-SSE1-NEXT:    xorb %al, %r12b
+; CHECK-SSE1-NEXT:    movb 23(%r8), %al
+; CHECK-SSE1-NEXT:    movb 23(%r9), %r15b
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    andb 23(%r10), %r15b
+; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    movb 24(%r8), %al
+; CHECK-SSE1-NEXT:    movb 24(%r9), %r14b
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    andb 24(%r10), %r14b
+; CHECK-SSE1-NEXT:    xorb %al, %r14b
+; CHECK-SSE1-NEXT:    movb 25(%r8), %al
+; CHECK-SSE1-NEXT:    movb 25(%r9), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    andb 25(%r10), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    movb 26(%r8), %al
+; CHECK-SSE1-NEXT:    movb 26(%r9), %dil
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    andb 26(%r10), %dil
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    movb 27(%r8), %al
+; CHECK-SSE1-NEXT:    movb 27(%r9), %sil
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    andb 27(%r10), %sil
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    movb 28(%r8), %al
+; CHECK-SSE1-NEXT:    movb 28(%r9), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    andb 28(%r10), %dl
+; CHECK-SSE1-NEXT:    xorb %al, %dl
+; CHECK-SSE1-NEXT:    movb 29(%r8), %al
+; CHECK-SSE1-NEXT:    movb 29(%r9), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    andb 29(%r10), %cl
+; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    movb 30(%r8), %bl
+; CHECK-SSE1-NEXT:    movb 30(%r9), %al
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    andb 30(%r10), %al
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    movb 31(%r8), %r8b
+; CHECK-SSE1-NEXT:    movb 31(%r9), %bl
+; CHECK-SSE1-NEXT:    xorb %r8b, %bl
+; CHECK-SSE1-NEXT:    andb 31(%r10), %bl
+; CHECK-SSE1-NEXT:    xorb %r8b, %bl
+; CHECK-SSE1-NEXT:    movb %bl, 31(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 30(%r11)
+; CHECK-SSE1-NEXT:    movb %cl, 29(%r11)
+; CHECK-SSE1-NEXT:    movb %dl, 28(%r11)
+; CHECK-SSE1-NEXT:    movb %sil, 27(%r11)
+; CHECK-SSE1-NEXT:    movb %dil, 26(%r11)
+; CHECK-SSE1-NEXT:    movb %bpl, 25(%r11)
+; CHECK-SSE1-NEXT:    movb %r14b, 24(%r11)
+; CHECK-SSE1-NEXT:    movb %r15b, 23(%r11)
+; CHECK-SSE1-NEXT:    movb %r12b, 22(%r11)
+; CHECK-SSE1-NEXT:    movb %r13b, 21(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 20(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 19(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 18(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 17(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 16(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 15(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 14(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 13(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 12(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 11(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 10(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 9(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 8(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 7(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 6(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 5(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 4(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 3(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 2(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, 1(%r11)
+; CHECK-SSE1-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
+; CHECK-SSE1-NEXT:    movb %al, (%r11)
+; CHECK-SSE1-NEXT:    movq %r11, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -1942,135 +1759,113 @@ define <16 x i16> @out_v16i16(<16 x i16> *%px, <16 x i16> *%py, <16 x i16> *%pma
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r9
-; CHECK-BASELINE-NEXT:    movq %rdx, %r10
-; CHECK-BASELINE-NEXT:    movq %rsi, %r8
-; CHECK-BASELINE-NEXT:    movq %rdi, %r11
-; CHECK-BASELINE-NEXT:    movl 12(%rcx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 14(%rcx), %edx
-; CHECK-BASELINE-NEXT:    movl 16(%rcx), %esi
-; CHECK-BASELINE-NEXT:    movzwl 18(%rcx), %edi
-; CHECK-BASELINE-NEXT:    movl 20(%rcx), %ecx
-; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %r15d
-; CHECK-BASELINE-NEXT:    movl 24(%r9), %r12d
-; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %r14d
-; CHECK-BASELINE-NEXT:    movl 28(%r9), %ebx
-; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %ebp
-; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %r13d
-; CHECK-BASELINE-NEXT:    andw %bp, %r13w
-; CHECK-BASELINE-NEXT:    notl %ebp
-; CHECK-BASELINE-NEXT:    andw 30(%r10), %bp
-; CHECK-BASELINE-NEXT:    orl %r13d, %ebp
-; CHECK-BASELINE-NEXT:    movzwl 28(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %bx, %ax
-; CHECK-BASELINE-NEXT:    notl %ebx
-; CHECK-BASELINE-NEXT:    andw 28(%r10), %bx
-; CHECK-BASELINE-NEXT:    orl %eax, %ebx
-; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %r14w, %ax
-; CHECK-BASELINE-NEXT:    notl %r14d
-; CHECK-BASELINE-NEXT:    andw 26(%r10), %r14w
-; CHECK-BASELINE-NEXT:    orl %eax, %r14d
-; CHECK-BASELINE-NEXT:    movzwl 24(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %r12w, %ax
-; CHECK-BASELINE-NEXT:    notl %r12d
-; CHECK-BASELINE-NEXT:    andw 24(%r10), %r12w
-; CHECK-BASELINE-NEXT:    orl %eax, %r12d
-; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %r15w, %ax
-; CHECK-BASELINE-NEXT:    notl %r15d
-; CHECK-BASELINE-NEXT:    andw 22(%r10), %r15w
-; CHECK-BASELINE-NEXT:    orl %eax, %r15d
-; CHECK-BASELINE-NEXT:    movzwl 20(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %cx, %ax
-; CHECK-BASELINE-NEXT:    notl %ecx
-; CHECK-BASELINE-NEXT:    andw 20(%r10), %cx
-; CHECK-BASELINE-NEXT:    orl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 18(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %di, %ax
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    andw 18(%r10), %di
-; CHECK-BASELINE-NEXT:    orl %eax, %edi
-; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 16(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %si, %ax
-; CHECK-BASELINE-NEXT:    notl %esi
-; CHECK-BASELINE-NEXT:    andw 16(%r10), %si
-; CHECK-BASELINE-NEXT:    orl %eax, %esi
-; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 14(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %dx, %ax
-; CHECK-BASELINE-NEXT:    notl %edx
-; CHECK-BASELINE-NEXT:    andw 14(%r10), %dx
-; CHECK-BASELINE-NEXT:    orl %eax, %edx
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 12(%r8), %eax
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw %cx, %ax
-; CHECK-BASELINE-NEXT:    notl %ecx
-; CHECK-BASELINE-NEXT:    andw 12(%r10), %cx
-; CHECK-BASELINE-NEXT:    orl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %r13d
-; CHECK-BASELINE-NEXT:    movzwl 10(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %r13w, %ax
-; CHECK-BASELINE-NEXT:    notl %r13d
-; CHECK-BASELINE-NEXT:    andw 10(%r10), %r13w
-; CHECK-BASELINE-NEXT:    orl %eax, %r13d
-; CHECK-BASELINE-NEXT:    movl 8(%r9), %edi
-; CHECK-BASELINE-NEXT:    movzwl 8(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %di, %ax
-; CHECK-BASELINE-NEXT:    notl %edi
-; CHECK-BASELINE-NEXT:    andw 8(%r10), %di
-; CHECK-BASELINE-NEXT:    orl %eax, %edi
-; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %esi
-; CHECK-BASELINE-NEXT:    movzwl 6(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %si, %ax
-; CHECK-BASELINE-NEXT:    notl %esi
-; CHECK-BASELINE-NEXT:    andw 6(%r10), %si
-; CHECK-BASELINE-NEXT:    orl %eax, %esi
-; CHECK-BASELINE-NEXT:    movl 4(%r9), %edx
-; CHECK-BASELINE-NEXT:    movzwl 4(%r8), %eax
-; CHECK-BASELINE-NEXT:    andw %dx, %ax
-; CHECK-BASELINE-NEXT:    notl %edx
-; CHECK-BASELINE-NEXT:    andw 4(%r10), %dx
-; CHECK-BASELINE-NEXT:    orl %eax, %edx
-; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %ecx
-; CHECK-BASELINE-NEXT:    andw %ax, %cx
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    andw 2(%r10), %ax
-; CHECK-BASELINE-NEXT:    orl %ecx, %eax
-; CHECK-BASELINE-NEXT:    movl (%r9), %r9d
-; CHECK-BASELINE-NEXT:    movzwl (%r8), %ecx
-; CHECK-BASELINE-NEXT:    andw %r9w, %cx
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    andw (%r10), %r9w
-; CHECK-BASELINE-NEXT:    orl %ecx, %r9d
-; CHECK-BASELINE-NEXT:    movw %bp, 30(%r11)
-; CHECK-BASELINE-NEXT:    movw %bx, 28(%r11)
-; CHECK-BASELINE-NEXT:    movw %r14w, 26(%r11)
-; CHECK-BASELINE-NEXT:    movw %r12w, 24(%r11)
-; CHECK-BASELINE-NEXT:    movw %r15w, 22(%r11)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %cx, 20(%r11)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %cx, 18(%r11)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %cx, 16(%r11)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %cx, 14(%r11)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %cx, 12(%r11)
-; CHECK-BASELINE-NEXT:    movw %r13w, 10(%r11)
-; CHECK-BASELINE-NEXT:    movw %di, 8(%r11)
-; CHECK-BASELINE-NEXT:    movw %si, 6(%r11)
-; CHECK-BASELINE-NEXT:    movw %dx, 4(%r11)
-; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
-; CHECK-BASELINE-NEXT:    movw %r9w, (%r11)
-; CHECK-BASELINE-NEXT:    movq %r11, %rax
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl (%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bp, %ax
+; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
+; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
+; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
+; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
+; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
+; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bx, %ax
+; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
+; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
+; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
+; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
+; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
+; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
+; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
+; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bx, %ax
+; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
+; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw %ax, %r11w
+; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r11w
+; CHECK-BASELINE-NEXT:    xorl %r11d, %eax
+; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorw %r11w, %bp
+; CHECK-BASELINE-NEXT:    andw 28(%rcx), %bp
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r11d
+; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorw %dx, %si
+; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
+; CHECK-BASELINE-NEXT:    xorl %esi, %edx
+; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r11w, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bx, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r13w, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r12w, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -2087,135 +1882,113 @@ define <16 x i16> @out_v16i16(<16 x i16> *%px, <16 x i16> *%py, <16 x i16> *%pma
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r9
-; CHECK-SSE1-NEXT:    movq %rdx, %r10
-; CHECK-SSE1-NEXT:    movq %rsi, %r8
-; CHECK-SSE1-NEXT:    movq %rdi, %r11
-; CHECK-SSE1-NEXT:    movl 12(%rcx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 14(%rcx), %edx
-; CHECK-SSE1-NEXT:    movl 16(%rcx), %esi
-; CHECK-SSE1-NEXT:    movzwl 18(%rcx), %edi
-; CHECK-SSE1-NEXT:    movl 20(%rcx), %ecx
-; CHECK-SSE1-NEXT:    movzwl 22(%r9), %r15d
-; CHECK-SSE1-NEXT:    movl 24(%r9), %r12d
-; CHECK-SSE1-NEXT:    movzwl 26(%r9), %r14d
-; CHECK-SSE1-NEXT:    movl 28(%r9), %ebx
-; CHECK-SSE1-NEXT:    movzwl 30(%r9), %ebp
-; CHECK-SSE1-NEXT:    movzwl 30(%r8), %r13d
-; CHECK-SSE1-NEXT:    andw %bp, %r13w
-; CHECK-SSE1-NEXT:    notl %ebp
-; CHECK-SSE1-NEXT:    andw 30(%r10), %bp
-; CHECK-SSE1-NEXT:    orl %r13d, %ebp
-; CHECK-SSE1-NEXT:    movzwl 28(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %bx, %ax
-; CHECK-SSE1-NEXT:    notl %ebx
-; CHECK-SSE1-NEXT:    andw 28(%r10), %bx
-; CHECK-SSE1-NEXT:    orl %eax, %ebx
-; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %r14w, %ax
-; CHECK-SSE1-NEXT:    notl %r14d
-; CHECK-SSE1-NEXT:    andw 26(%r10), %r14w
-; CHECK-SSE1-NEXT:    orl %eax, %r14d
-; CHECK-SSE1-NEXT:    movzwl 24(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %r12w, %ax
-; CHECK-SSE1-NEXT:    notl %r12d
-; CHECK-SSE1-NEXT:    andw 24(%r10), %r12w
-; CHECK-SSE1-NEXT:    orl %eax, %r12d
-; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %r15w, %ax
-; CHECK-SSE1-NEXT:    notl %r15d
-; CHECK-SSE1-NEXT:    andw 22(%r10), %r15w
-; CHECK-SSE1-NEXT:    orl %eax, %r15d
-; CHECK-SSE1-NEXT:    movzwl 20(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %cx, %ax
-; CHECK-SSE1-NEXT:    notl %ecx
-; CHECK-SSE1-NEXT:    andw 20(%r10), %cx
-; CHECK-SSE1-NEXT:    orl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 18(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %di, %ax
-; CHECK-SSE1-NEXT:    notl %edi
-; CHECK-SSE1-NEXT:    andw 18(%r10), %di
-; CHECK-SSE1-NEXT:    orl %eax, %edi
-; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 16(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %si, %ax
-; CHECK-SSE1-NEXT:    notl %esi
-; CHECK-SSE1-NEXT:    andw 16(%r10), %si
-; CHECK-SSE1-NEXT:    orl %eax, %esi
-; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 14(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %dx, %ax
-; CHECK-SSE1-NEXT:    notl %edx
-; CHECK-SSE1-NEXT:    andw 14(%r10), %dx
-; CHECK-SSE1-NEXT:    orl %eax, %edx
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 12(%r8), %eax
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw %cx, %ax
-; CHECK-SSE1-NEXT:    notl %ecx
-; CHECK-SSE1-NEXT:    andw 12(%r10), %cx
-; CHECK-SSE1-NEXT:    orl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%r9), %r13d
-; CHECK-SSE1-NEXT:    movzwl 10(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %r13w, %ax
-; CHECK-SSE1-NEXT:    notl %r13d
-; CHECK-SSE1-NEXT:    andw 10(%r10), %r13w
-; CHECK-SSE1-NEXT:    orl %eax, %r13d
-; CHECK-SSE1-NEXT:    movl 8(%r9), %edi
-; CHECK-SSE1-NEXT:    movzwl 8(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %di, %ax
-; CHECK-SSE1-NEXT:    notl %edi
-; CHECK-SSE1-NEXT:    andw 8(%r10), %di
-; CHECK-SSE1-NEXT:    orl %eax, %edi
-; CHECK-SSE1-NEXT:    movzwl 6(%r9), %esi
-; CHECK-SSE1-NEXT:    movzwl 6(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %si, %ax
-; CHECK-SSE1-NEXT:    notl %esi
-; CHECK-SSE1-NEXT:    andw 6(%r10), %si
-; CHECK-SSE1-NEXT:    orl %eax, %esi
-; CHECK-SSE1-NEXT:    movl 4(%r9), %edx
-; CHECK-SSE1-NEXT:    movzwl 4(%r8), %eax
-; CHECK-SSE1-NEXT:    andw %dx, %ax
-; CHECK-SSE1-NEXT:    notl %edx
-; CHECK-SSE1-NEXT:    andw 4(%r10), %dx
-; CHECK-SSE1-NEXT:    orl %eax, %edx
-; CHECK-SSE1-NEXT:    movzwl 2(%r9), %eax
-; CHECK-SSE1-NEXT:    movzwl 2(%r8), %ecx
-; CHECK-SSE1-NEXT:    andw %ax, %cx
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    andw 2(%r10), %ax
-; CHECK-SSE1-NEXT:    orl %ecx, %eax
-; CHECK-SSE1-NEXT:    movl (%r9), %r9d
-; CHECK-SSE1-NEXT:    movzwl (%r8), %ecx
-; CHECK-SSE1-NEXT:    andw %r9w, %cx
-; CHECK-SSE1-NEXT:    notl %r9d
-; CHECK-SSE1-NEXT:    andw (%r10), %r9w
-; CHECK-SSE1-NEXT:    orl %ecx, %r9d
-; CHECK-SSE1-NEXT:    movw %bp, 30(%r11)
-; CHECK-SSE1-NEXT:    movw %bx, 28(%r11)
-; CHECK-SSE1-NEXT:    movw %r14w, 26(%r11)
-; CHECK-SSE1-NEXT:    movw %r12w, 24(%r11)
-; CHECK-SSE1-NEXT:    movw %r15w, 22(%r11)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %cx, 20(%r11)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %cx, 18(%r11)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %cx, 16(%r11)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %cx, 14(%r11)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %cx, 12(%r11)
-; CHECK-SSE1-NEXT:    movw %r13w, 10(%r11)
-; CHECK-SSE1-NEXT:    movw %di, 8(%r11)
-; CHECK-SSE1-NEXT:    movw %si, 6(%r11)
-; CHECK-SSE1-NEXT:    movw %dx, 4(%r11)
-; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
-; CHECK-SSE1-NEXT:    movw %r9w, (%r11)
-; CHECK-SSE1-NEXT:    movq %r11, %rax
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl (%rdx), %ebp
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bp, %ax
+; CHECK-SSE1-NEXT:    andw (%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %ebp
+; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r10w, %ax
+; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r10d
+; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r12w, %ax
+; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r12d
+; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bx, %ax
+; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %ebx
+; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r13w, %ax
+; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r13d
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r11w, %ax
+; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r11d
+; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r14w, %ax
+; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r14d
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r15w, %ax
+; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r15d
+; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bx, %ax
+; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %ebx
+; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw %ax, %r11w
+; CHECK-SSE1-NEXT:    andw 26(%rcx), %r11w
+; CHECK-SSE1-NEXT:    xorl %r11d, %eax
+; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorw %r11w, %bp
+; CHECK-SSE1-NEXT:    andw 28(%rcx), %bp
+; CHECK-SSE1-NEXT:    xorl %ebp, %r11d
+; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorw %dx, %si
+; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
+; CHECK-SSE1-NEXT:    xorl %esi, %edx
+; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
+; CHECK-SSE1-NEXT:    movw %r11w, 28(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
+; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-SSE1-NEXT:    movw %bx, 20(%rdi)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, 14(%rdi)
+; CHECK-SSE1-NEXT:    movw %r13w, 12(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %r12w, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 2(%rdi)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -2260,65 +2033,59 @@ define <8 x i32> @out_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) n
 ; CHECK-BASELINE-NEXT:    pushq %rbp
 ; CHECK-BASELINE-NEXT:    pushq %r15
 ; CHECK-BASELINE-NEXT:    pushq %r14
+; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r8d
-; CHECK-BASELINE-NEXT:    movl 8(%rcx), %r9d
-; CHECK-BASELINE-NEXT:    movl 12(%rcx), %r10d
-; CHECK-BASELINE-NEXT:    movl 16(%rcx), %r11d
-; CHECK-BASELINE-NEXT:    movl 20(%rcx), %r15d
-; CHECK-BASELINE-NEXT:    movl 24(%rcx), %ebx
-; CHECK-BASELINE-NEXT:    movl 28(%rcx), %ebp
-; CHECK-BASELINE-NEXT:    movl 28(%rsi), %r14d
-; CHECK-BASELINE-NEXT:    andl %ebp, %r14d
-; CHECK-BASELINE-NEXT:    notl %ebp
-; CHECK-BASELINE-NEXT:    andl 28(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    orl %r14d, %ebp
-; CHECK-BASELINE-NEXT:    movl 24(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %ebx, %edi
-; CHECK-BASELINE-NEXT:    notl %ebx
-; CHECK-BASELINE-NEXT:    andl 24(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    orl %edi, %ebx
+; CHECK-BASELINE-NEXT:    movl 28(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movl 24(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl (%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movl (%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorl %edi, %r11d
+; CHECK-BASELINE-NEXT:    andl (%rcx), %r11d
+; CHECK-BASELINE-NEXT:    xorl %edi, %r11d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorl %edx, %r15d
+; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r15d
+; CHECK-BASELINE-NEXT:    xorl %edx, %r15d
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r12d
+; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r12d
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r12d
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorl %ebx, %ebp
+; CHECK-BASELINE-NEXT:    andl 12(%rcx), %ebp
+; CHECK-BASELINE-NEXT:    xorl %ebx, %ebp
+; CHECK-BASELINE-NEXT:    movl 16(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
+; CHECK-BASELINE-NEXT:    andl 16(%rcx), %ebx
+; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
 ; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %r15d, %edi
-; CHECK-BASELINE-NEXT:    notl %r15d
-; CHECK-BASELINE-NEXT:    andl 20(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    orl %edi, %r15d
-; CHECK-BASELINE-NEXT:    movl 16(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %r11d, %edi
-; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    andl 16(%rdx), %r11d
-; CHECK-BASELINE-NEXT:    orl %edi, %r11d
-; CHECK-BASELINE-NEXT:    movl 12(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %r10d, %edi
-; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    andl 12(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    orl %edi, %r10d
-; CHECK-BASELINE-NEXT:    movl 8(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %r9d, %edi
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    andl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    orl %edi, %r9d
-; CHECK-BASELINE-NEXT:    movl 4(%rsi), %edi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %r8d
-; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    orl %edi, %r8d
-; CHECK-BASELINE-NEXT:    movl (%rcx), %ecx
-; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
-; CHECK-BASELINE-NEXT:    andl %ecx, %esi
-; CHECK-BASELINE-NEXT:    notl %ecx
-; CHECK-BASELINE-NEXT:    andl (%rdx), %ecx
-; CHECK-BASELINE-NEXT:    orl %esi, %ecx
-; CHECK-BASELINE-NEXT:    movl %ebp, 28(%rax)
-; CHECK-BASELINE-NEXT:    movl %ebx, 24(%rax)
-; CHECK-BASELINE-NEXT:    movl %r15d, 20(%rax)
-; CHECK-BASELINE-NEXT:    movl %r11d, 16(%rax)
-; CHECK-BASELINE-NEXT:    movl %r10d, 12(%rax)
-; CHECK-BASELINE-NEXT:    movl %r9d, 8(%rax)
-; CHECK-BASELINE-NEXT:    movl %r8d, 4(%rax)
-; CHECK-BASELINE-NEXT:    movl %ecx, (%rax)
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    andl 20(%rcx), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
+; CHECK-BASELINE-NEXT:    movl 24(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
+; CHECK-BASELINE-NEXT:    andl 24(%rcx), %edx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
+; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
+; CHECK-BASELINE-NEXT:    movl %esi, 28(%rax)
+; CHECK-BASELINE-NEXT:    movl %edx, 24(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 20(%rax)
+; CHECK-BASELINE-NEXT:    movl %ebx, 16(%rax)
+; CHECK-BASELINE-NEXT:    movl %ebp, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %r12d, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %r15d, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %r11d, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %r15
 ; CHECK-BASELINE-NEXT:    popq %rbp
@@ -2329,65 +2096,59 @@ define <8 x i32> @out_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) n
 ; CHECK-SSE1-NEXT:    pushq %rbp
 ; CHECK-SSE1-NEXT:    pushq %r15
 ; CHECK-SSE1-NEXT:    pushq %r14
+; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl 4(%rcx), %r8d
-; CHECK-SSE1-NEXT:    movl 8(%rcx), %r9d
-; CHECK-SSE1-NEXT:    movl 12(%rcx), %r10d
-; CHECK-SSE1-NEXT:    movl 16(%rcx), %r11d
-; CHECK-SSE1-NEXT:    movl 20(%rcx), %r15d
-; CHECK-SSE1-NEXT:    movl 24(%rcx), %ebx
-; CHECK-SSE1-NEXT:    movl 28(%rcx), %ebp
-; CHECK-SSE1-NEXT:    movl 28(%rsi), %r14d
-; CHECK-SSE1-NEXT:    andl %ebp, %r14d
-; CHECK-SSE1-NEXT:    notl %ebp
-; CHECK-SSE1-NEXT:    andl 28(%rdx), %ebp
-; CHECK-SSE1-NEXT:    orl %r14d, %ebp
-; CHECK-SSE1-NEXT:    movl 24(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %ebx, %edi
-; CHECK-SSE1-NEXT:    notl %ebx
-; CHECK-SSE1-NEXT:    andl 24(%rdx), %ebx
-; CHECK-SSE1-NEXT:    orl %edi, %ebx
+; CHECK-SSE1-NEXT:    movl 28(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movl 24(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl (%rdx), %edi
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %edx
+; CHECK-SSE1-NEXT:    movl (%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorl %edi, %r11d
+; CHECK-SSE1-NEXT:    andl (%rcx), %r11d
+; CHECK-SSE1-NEXT:    xorl %edi, %r11d
+; CHECK-SSE1-NEXT:    movl 4(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorl %edx, %r15d
+; CHECK-SSE1-NEXT:    andl 4(%rcx), %r15d
+; CHECK-SSE1-NEXT:    xorl %edx, %r15d
+; CHECK-SSE1-NEXT:    movl 8(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorl %ebp, %r12d
+; CHECK-SSE1-NEXT:    andl 8(%rcx), %r12d
+; CHECK-SSE1-NEXT:    xorl %ebp, %r12d
+; CHECK-SSE1-NEXT:    movl 12(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorl %ebx, %ebp
+; CHECK-SSE1-NEXT:    andl 12(%rcx), %ebp
+; CHECK-SSE1-NEXT:    xorl %ebx, %ebp
+; CHECK-SSE1-NEXT:    movl 16(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
+; CHECK-SSE1-NEXT:    andl 16(%rcx), %ebx
+; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
 ; CHECK-SSE1-NEXT:    movl 20(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %r15d, %edi
-; CHECK-SSE1-NEXT:    notl %r15d
-; CHECK-SSE1-NEXT:    andl 20(%rdx), %r15d
-; CHECK-SSE1-NEXT:    orl %edi, %r15d
-; CHECK-SSE1-NEXT:    movl 16(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %r11d, %edi
-; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    andl 16(%rdx), %r11d
-; CHECK-SSE1-NEXT:    orl %edi, %r11d
-; CHECK-SSE1-NEXT:    movl 12(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %r10d, %edi
-; CHECK-SSE1-NEXT:    notl %r10d
-; CHECK-SSE1-NEXT:    andl 12(%rdx), %r10d
-; CHECK-SSE1-NEXT:    orl %edi, %r10d
-; CHECK-SSE1-NEXT:    movl 8(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %r9d, %edi
-; CHECK-SSE1-NEXT:    notl %r9d
-; CHECK-SSE1-NEXT:    andl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT:    orl %edi, %r9d
-; CHECK-SSE1-NEXT:    movl 4(%rsi), %edi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %r8d
-; CHECK-SSE1-NEXT:    andl 4(%rdx), %r8d
-; CHECK-SSE1-NEXT:    orl %edi, %r8d
-; CHECK-SSE1-NEXT:    movl (%rcx), %ecx
-; CHECK-SSE1-NEXT:    movl (%rsi), %esi
-; CHECK-SSE1-NEXT:    andl %ecx, %esi
-; CHECK-SSE1-NEXT:    notl %ecx
-; CHECK-SSE1-NEXT:    andl (%rdx), %ecx
-; CHECK-SSE1-NEXT:    orl %esi, %ecx
-; CHECK-SSE1-NEXT:    movl %ebp, 28(%rax)
-; CHECK-SSE1-NEXT:    movl %ebx, 24(%rax)
-; CHECK-SSE1-NEXT:    movl %r15d, 20(%rax)
-; CHECK-SSE1-NEXT:    movl %r11d, 16(%rax)
-; CHECK-SSE1-NEXT:    movl %r10d, 12(%rax)
-; CHECK-SSE1-NEXT:    movl %r9d, 8(%rax)
-; CHECK-SSE1-NEXT:    movl %r8d, 4(%rax)
-; CHECK-SSE1-NEXT:    movl %ecx, (%rax)
+; CHECK-SSE1-NEXT:    xorl %r10d, %edi
+; CHECK-SSE1-NEXT:    andl 20(%rcx), %edi
+; CHECK-SSE1-NEXT:    xorl %r10d, %edi
+; CHECK-SSE1-NEXT:    movl 24(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorl %r9d, %edx
+; CHECK-SSE1-NEXT:    andl 24(%rcx), %edx
+; CHECK-SSE1-NEXT:    xorl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorl %r8d, %esi
+; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
+; CHECK-SSE1-NEXT:    xorl %r8d, %esi
+; CHECK-SSE1-NEXT:    movl %esi, 28(%rax)
+; CHECK-SSE1-NEXT:    movl %edx, 24(%rax)
+; CHECK-SSE1-NEXT:    movl %edi, 20(%rax)
+; CHECK-SSE1-NEXT:    movl %ebx, 16(%rax)
+; CHECK-SSE1-NEXT:    movl %ebp, 12(%rax)
+; CHECK-SSE1-NEXT:    movl %r12d, 8(%rax)
+; CHECK-SSE1-NEXT:    movl %r15d, 4(%rax)
+; CHECK-SSE1-NEXT:    movl %r11d, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %r15
 ; CHECK-SSE1-NEXT:    popq %rbp
@@ -2426,72 +2187,60 @@ define <8 x i32> @out_v8i32(<8 x i32> *%px, <8 x i32> *%py, <8 x i32> *%pmask) n
 define <4 x i64> @out_v4i64(<4 x i64> *%px, <4 x i64> *%py, <4 x i64> *%pmask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i64:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movq (%rcx), %r8
-; CHECK-BASELINE-NEXT:    movq 8(%rcx), %r9
-; CHECK-BASELINE-NEXT:    movq 16(%rcx), %rdi
-; CHECK-BASELINE-NEXT:    movq 24(%rcx), %rcx
-; CHECK-BASELINE-NEXT:    movq 24(%rsi), %r10
-; CHECK-BASELINE-NEXT:    andq %rcx, %r10
-; CHECK-BASELINE-NEXT:    movq 16(%rsi), %r11
-; CHECK-BASELINE-NEXT:    andq %rdi, %r11
-; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rbx
-; CHECK-BASELINE-NEXT:    andq %r9, %rbx
-; CHECK-BASELINE-NEXT:    movq (%rsi), %rsi
-; CHECK-BASELINE-NEXT:    andq %r8, %rsi
-; CHECK-BASELINE-NEXT:    notq %r8
-; CHECK-BASELINE-NEXT:    notq %r9
-; CHECK-BASELINE-NEXT:    notq %rdi
-; CHECK-BASELINE-NEXT:    notq %rcx
-; CHECK-BASELINE-NEXT:    andq 24(%rdx), %rcx
-; CHECK-BASELINE-NEXT:    orq %r10, %rcx
-; CHECK-BASELINE-NEXT:    andq 16(%rdx), %rdi
-; CHECK-BASELINE-NEXT:    orq %r11, %rdi
-; CHECK-BASELINE-NEXT:    andq 8(%rdx), %r9
-; CHECK-BASELINE-NEXT:    orq %rbx, %r9
-; CHECK-BASELINE-NEXT:    andq (%rdx), %r8
-; CHECK-BASELINE-NEXT:    orq %rsi, %r8
-; CHECK-BASELINE-NEXT:    movq %rcx, 24(%rax)
-; CHECK-BASELINE-NEXT:    movq %rdi, 16(%rax)
-; CHECK-BASELINE-NEXT:    movq %r9, 8(%rax)
-; CHECK-BASELINE-NEXT:    movq %r8, (%rax)
-; CHECK-BASELINE-NEXT:    popq %rbx
+; CHECK-BASELINE-NEXT:    movq 24(%rdx), %r8
+; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r9
+; CHECK-BASELINE-NEXT:    movq (%rdx), %rdi
+; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
+; CHECK-BASELINE-NEXT:    movq (%rsi), %r11
+; CHECK-BASELINE-NEXT:    xorq %rdi, %r11
+; CHECK-BASELINE-NEXT:    andq (%rcx), %r11
+; CHECK-BASELINE-NEXT:    xorq %rdi, %r11
+; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rdi
+; CHECK-BASELINE-NEXT:    xorq %r10, %rdi
+; CHECK-BASELINE-NEXT:    andq 8(%rcx), %rdi
+; CHECK-BASELINE-NEXT:    xorq %r10, %rdi
+; CHECK-BASELINE-NEXT:    movq 16(%rsi), %rdx
+; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
+; CHECK-BASELINE-NEXT:    andq 16(%rcx), %rdx
+; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
+; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
+; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
+; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
+; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
+; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
+; CHECK-BASELINE-NEXT:    movq %rdx, 16(%rax)
+; CHECK-BASELINE-NEXT:    movq %rdi, 8(%rax)
+; CHECK-BASELINE-NEXT:    movq %r11, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i64:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movq (%rcx), %r8
-; CHECK-SSE1-NEXT:    movq 8(%rcx), %r9
-; CHECK-SSE1-NEXT:    movq 16(%rcx), %rdi
-; CHECK-SSE1-NEXT:    movq 24(%rcx), %rcx
-; CHECK-SSE1-NEXT:    movq 24(%rsi), %r10
-; CHECK-SSE1-NEXT:    andq %rcx, %r10
-; CHECK-SSE1-NEXT:    movq 16(%rsi), %r11
-; CHECK-SSE1-NEXT:    andq %rdi, %r11
-; CHECK-SSE1-NEXT:    movq 8(%rsi), %rbx
-; CHECK-SSE1-NEXT:    andq %r9, %rbx
-; CHECK-SSE1-NEXT:    movq (%rsi), %rsi
-; CHECK-SSE1-NEXT:    andq %r8, %rsi
-; CHECK-SSE1-NEXT:    notq %r8
-; CHECK-SSE1-NEXT:    notq %r9
-; CHECK-SSE1-NEXT:    notq %rdi
-; CHECK-SSE1-NEXT:    notq %rcx
-; CHECK-SSE1-NEXT:    andq 24(%rdx), %rcx
-; CHECK-SSE1-NEXT:    orq %r10, %rcx
-; CHECK-SSE1-NEXT:    andq 16(%rdx), %rdi
-; CHECK-SSE1-NEXT:    orq %r11, %rdi
-; CHECK-SSE1-NEXT:    andq 8(%rdx), %r9
-; CHECK-SSE1-NEXT:    orq %rbx, %r9
-; CHECK-SSE1-NEXT:    andq (%rdx), %r8
-; CHECK-SSE1-NEXT:    orq %rsi, %r8
-; CHECK-SSE1-NEXT:    movq %rcx, 24(%rax)
-; CHECK-SSE1-NEXT:    movq %rdi, 16(%rax)
-; CHECK-SSE1-NEXT:    movq %r9, 8(%rax)
-; CHECK-SSE1-NEXT:    movq %r8, (%rax)
-; CHECK-SSE1-NEXT:    popq %rbx
+; CHECK-SSE1-NEXT:    movq 24(%rdx), %r8
+; CHECK-SSE1-NEXT:    movq 16(%rdx), %r9
+; CHECK-SSE1-NEXT:    movq (%rdx), %rdi
+; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
+; CHECK-SSE1-NEXT:    movq (%rsi), %r11
+; CHECK-SSE1-NEXT:    xorq %rdi, %r11
+; CHECK-SSE1-NEXT:    andq (%rcx), %r11
+; CHECK-SSE1-NEXT:    xorq %rdi, %r11
+; CHECK-SSE1-NEXT:    movq 8(%rsi), %rdi
+; CHECK-SSE1-NEXT:    xorq %r10, %rdi
+; CHECK-SSE1-NEXT:    andq 8(%rcx), %rdi
+; CHECK-SSE1-NEXT:    xorq %r10, %rdi
+; CHECK-SSE1-NEXT:    movq 16(%rsi), %rdx
+; CHECK-SSE1-NEXT:    xorq %r9, %rdx
+; CHECK-SSE1-NEXT:    andq 16(%rcx), %rdx
+; CHECK-SSE1-NEXT:    xorq %r9, %rdx
+; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
+; CHECK-SSE1-NEXT:    xorq %r8, %rsi
+; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
+; CHECK-SSE1-NEXT:    xorq %r8, %rsi
+; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
+; CHECK-SSE1-NEXT:    movq %rdx, 16(%rax)
+; CHECK-SSE1-NEXT:    movq %rdi, 8(%rax)
+; CHECK-SSE1-NEXT:    movq %r11, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i64:


        


More information about the llvm-branch-commits mailing list