[llvm] [ARM] LHS and RHS should be frozen for LowerCMP (PR #159993)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 22 09:11:57 PDT 2025


https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/159993

>From 942b079a52510a7910fbbf37322d528d3f2a0499 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sun, 21 Sep 2025 13:38:48 -0400
Subject: [PATCH 1/2] [TargetLowering][ARM] Freeze operands in UCMP

The expansion uses LHS and RHS multiple times (once per comparison). If either
operand is undef or poison, each use may observe a different value, so the two
compares can disagree and produce a result no single comparison could return.
Freezing the operands first pins each one to a single value for every use.
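As a rough illustration (not the actual SelectionDAG expansion, which lives in
TargetLowering::expandCMP, and with a hypothetical function name), the IR-level
equivalent of the lowering looks like this; without the freezes, an undef %x
could let %gt and %lt both come out true, yielding a value outside {-1, 0, 1}:

define i8 @ucmp_expansion(i32 %x, i32 %y) {
  ; Pin each operand to one concrete value so both compares agree.
  %fx = freeze i32 %x
  %fy = freeze i32 %y
  %gt = icmp ugt i32 %fx, %fy
  %lt = icmp ult i32 %fx, %fy
  %g = zext i1 %gt to i8
  %l = zext i1 %lt to i8
  ; (%x > %y) - (%x < %y): 1, 0, or -1.
  %r = sub i8 %g, %l
  ret i8 %r
}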
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |    8 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |    4 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   13 +-
 .../GlobalISel/legalize-threeway-cmp.mir      |   34 +-
 llvm/test/CodeGen/AArch64/freeze.ll           |   32 +-
 llvm/test/CodeGen/AArch64/ucmp.ll             |    8 +-
 llvm/test/CodeGen/ARM/scmp.ll                 |   30 +-
 llvm/test/CodeGen/ARM/ucmp.ll                 |   30 +-
 llvm/test/CodeGen/PowerPC/ucmp.ll             |   12 +-
 llvm/test/CodeGen/SystemZ/ucmp.ll             |    4 +-
 llvm/test/CodeGen/Thumb/scmp.ll               |   60 +-
 llvm/test/CodeGen/Thumb/ucmp.ll               |   60 +-
 llvm/test/CodeGen/X86/scmp.ll                 |  880 ++++++-----
 llvm/test/CodeGen/X86/ucmp.ll                 | 1372 +++++++++--------
 14 files changed, 1317 insertions(+), 1230 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index f3e036ed1b947..3be957378286f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8582,6 +8582,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
   LLT DstTy = MRI.getType(Dst);
   LLT SrcTy = MRI.getType(Cmp->getReg(1));
   LLT CmpTy = DstTy.changeElementSize(1);
+  auto LHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getLHSReg());
+  auto RHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getRHSReg());
 
   CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                        ? CmpInst::Predicate::ICMP_SLT
@@ -8591,10 +8593,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
                                        : CmpInst::Predicate::ICMP_UGT;
 
   auto Zero = MIRBuilder.buildConstant(DstTy, 0);
-  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
-  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
-                                   Cmp->getRHSReg());
+  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, LHS, RHS);
+  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, LHS, RHS);
 
   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
   auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 80500e48351e4..02f85cfc9262e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10956,8 +10956,8 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
 
 SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
   unsigned Opcode = Node->getOpcode();
-  SDValue LHS = Node->getOperand(0);
-  SDValue RHS = Node->getOperand(1);
+  SDValue LHS = DAG.getFreeze(Node->getOperand(0));
+  SDValue RHS = DAG.getFreeze(Node->getOperand(1));
   EVT VT = LHS.getValueType();
   EVT ResVT = Node->getValueType(0);
   EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9052cbfa89deb..01ab006d288fa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10479,6 +10479,9 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
 
   // Special case for Thumb1 UCMP only
   if (!IsSigned && Subtarget->isThumb1Only()) {
+    LHS = DAG.getFreeze(LHS);
+    RHS = DAG.getFreeze(RHS);
+
     // For Thumb unsigned comparison, use this sequence:
     // subs r2, r0, r1   ; r2 = LHS - RHS, sets flags
     // sbc r2, r2        ; r2 = r2 - r2 - !carry
@@ -10511,10 +10514,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
     // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
     SDValue Result =
         DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
-    if (Op.getValueType() != MVT::i32)
-      Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
-
-    return Result;
+    return DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
   }
 
   // For the ARM assembly pattern:
@@ -10582,10 +10582,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
                                 LTCondValue, Flags);
 
-  if (Op.getValueType() != MVT::i32)
-    Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
-
-  return Result2;
+  return DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
 }
 
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
index ae16e40671785..e1c63005ee9d2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
@@ -7,8 +7,10 @@ body:             |
     ; CHECK-LABEL: name: test_scmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[FREEZE]](s64), [[FREEZE1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[FREEZE]](s64), [[FREEZE1]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -30,8 +32,10 @@ body:             |
     ; CHECK-LABEL: name: test_ucmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE1]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -61,8 +65,10 @@ body:             |
     ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $w3
     ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[FREEZE]](<4 x s32>), [[FREEZE1]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[FREEZE]](<4 x s32>), [[FREEZE1]]
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s16>) = G_SUB [[TRUNC]], [[TRUNC1]]
@@ -92,13 +98,17 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]]
+    ; CHECK-NEXT: [[FREEZE2:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[FREEZE3:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]]
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE2]]
+    ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE1]](s64), [[FREEZE3]]
+    ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]]
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]]
-    ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
-    ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]]
-    ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+    ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE2]]
+    ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE1]](s64), [[FREEZE3]]
+    ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]]
     ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP3]], [[ICMP4]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fae3bbe2dcfba..cdc7fc12404ca 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -522,16 +522,28 @@ define i32 @freeze_scmp(i32 %a0) nounwind {
 }
 
 define i32 @freeze_ucmp(i32 %a0) nounwind {
-; CHECK-LABEL: freeze_ucmp:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEXT:    cmp w8, w0
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    csinv w8, w8, wzr, hs
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    csinv w0, w8, wzr, hs
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: freeze_ucmp:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #2 // =0x2
+; CHECK-SD-NEXT:    cmp w8, w0
+; CHECK-SD-NEXT:    cset w8, hi
+; CHECK-SD-NEXT:    csinv w8, w8, wzr, hs
+; CHECK-SD-NEXT:    cmp w8, #1
+; CHECK-SD-NEXT:    cset w8, hi
+; CHECK-SD-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: freeze_ucmp:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov w9, #1 // =0x1
+; CHECK-GI-NEXT:    cmp w8, w0
+; CHECK-GI-NEXT:    cset w8, hi
+; CHECK-GI-NEXT:    csinv w8, w8, wzr, hs
+; CHECK-GI-NEXT:    cmp w8, w9
+; CHECK-GI-NEXT:    cset w8, hi
+; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
+; CHECK-GI-NEXT:    ret
   %x = call i32 @llvm.ucmp.i32(i32 2, i32 %a0)
   %y = freeze i32 %x
   %z = call i32 @llvm.ucmp.i32(i32 %y, i32 1)
diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll
index af8225307fedd..6b5bcfa400230 100644
--- a/llvm/test/CodeGen/AArch64/ucmp.ll
+++ b/llvm/test/CodeGen/AArch64/ucmp.ll
@@ -13,8 +13,8 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
 ;
 ; CHECK-GI-LABEL: ucmp.8.8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    and w8, w0, #0xff
-; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    uxtb w8, w0
+; CHECK-GI-NEXT:    uxtb w9, w1
 ; CHECK-GI-NEXT:    cmp w8, w9
 ; CHECK-GI-NEXT:    cset w8, hi
 ; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
@@ -34,8 +34,8 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
 ;
 ; CHECK-GI-LABEL: ucmp.8.16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    and w8, w0, #0xffff
-; CHECK-GI-NEXT:    and w9, w1, #0xffff
+; CHECK-GI-NEXT:    uxth w8, w0
+; CHECK-GI-NEXT:    uxth w9, w1
 ; CHECK-GI-NEXT:    cmp w8, w9
 ; CHECK-GI-NEXT:    cset w8, hi
 ; CHECK-GI-NEXT:    csinv w0, w8, wzr, hs
diff --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll
index 9189aee6aaf43..07a08f46ee1ca 100644
--- a/llvm/test/CodeGen/ARM/scmp.ll
+++ b/llvm/test/CodeGen/ARM/scmp.ll
@@ -58,23 +58,23 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT:    ldr r4, [sp, #24]
-; CHECK-NEXT:    mov r5, #0
-; CHECK-NEXT:    ldr r6, [sp, #28]
-; CHECK-NEXT:    subs r7, r0, r4
-; CHECK-NEXT:    ldr r12, [sp, #32]
-; CHECK-NEXT:    sbcs r7, r1, r6
-; CHECK-NEXT:    ldr lr, [sp, #36]
-; CHECK-NEXT:    sbcs r7, r2, r12
-; CHECK-NEXT:    sbcs r7, r3, lr
+; CHECK-NEXT:    ldr r5, [sp, #24]
+; CHECK-NEXT:    mov r6, #0
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    subs r7, r0, r5
+; CHECK-NEXT:    ldr lr, [sp, #32]
+; CHECK-NEXT:    sbcs r7, r1, r4
+; CHECK-NEXT:    ldr r12, [sp, #36]
+; CHECK-NEXT:    sbcs r7, r2, lr
+; CHECK-NEXT:    sbcs r7, r3, r12
 ; CHECK-NEXT:    mov r7, #0
 ; CHECK-NEXT:    movwlt r7, #1
-; CHECK-NEXT:    subs r0, r4, r0
-; CHECK-NEXT:    sbcs r0, r6, r1
-; CHECK-NEXT:    sbcs r0, r12, r2
-; CHECK-NEXT:    sbcs r0, lr, r3
-; CHECK-NEXT:    movwlt r5, #1
-; CHECK-NEXT:    sub r0, r5, r7
+; CHECK-NEXT:    subs r0, r5, r0
+; CHECK-NEXT:    sbcs r0, r4, r1
+; CHECK-NEXT:    sbcs r0, lr, r2
+; CHECK-NEXT:    sbcs r0, r12, r3
+; CHECK-NEXT:    movwlt r6, #1
+; CHECK-NEXT:    sub r0, r6, r7
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
   ret i8 %1
diff --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll
index bb0201454d1ea..a15cc4cca0d39 100644
--- a/llvm/test/CodeGen/ARM/ucmp.ll
+++ b/llvm/test/CodeGen/ARM/ucmp.ll
@@ -58,23 +58,23 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT:    ldr r4, [sp, #24]
-; CHECK-NEXT:    mov r5, #0
-; CHECK-NEXT:    ldr r6, [sp, #28]
-; CHECK-NEXT:    subs r7, r0, r4
-; CHECK-NEXT:    ldr r12, [sp, #32]
-; CHECK-NEXT:    sbcs r7, r1, r6
-; CHECK-NEXT:    ldr lr, [sp, #36]
-; CHECK-NEXT:    sbcs r7, r2, r12
-; CHECK-NEXT:    sbcs r7, r3, lr
+; CHECK-NEXT:    ldr r5, [sp, #24]
+; CHECK-NEXT:    mov r6, #0
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    subs r7, r0, r5
+; CHECK-NEXT:    ldr lr, [sp, #32]
+; CHECK-NEXT:    sbcs r7, r1, r4
+; CHECK-NEXT:    ldr r12, [sp, #36]
+; CHECK-NEXT:    sbcs r7, r2, lr
+; CHECK-NEXT:    sbcs r7, r3, r12
 ; CHECK-NEXT:    mov r7, #0
 ; CHECK-NEXT:    movwlo r7, #1
-; CHECK-NEXT:    subs r0, r4, r0
-; CHECK-NEXT:    sbcs r0, r6, r1
-; CHECK-NEXT:    sbcs r0, r12, r2
-; CHECK-NEXT:    sbcs r0, lr, r3
-; CHECK-NEXT:    movwlo r5, #1
-; CHECK-NEXT:    sub r0, r5, r7
+; CHECK-NEXT:    subs r0, r5, r0
+; CHECK-NEXT:    sbcs r0, r4, r1
+; CHECK-NEXT:    sbcs r0, lr, r2
+; CHECK-NEXT:    sbcs r0, r12, r3
+; CHECK-NEXT:    movwlo r6, #1
+; CHECK-NEXT:    sub r0, r6, r7
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
   ret i8 %1
diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll
index d2dff6e7e05c8..22faf9cbd9c24 100644
--- a/llvm/test/CodeGen/PowerPC/ucmp.ll
+++ b/llvm/test/CodeGen/PowerPC/ucmp.ll
@@ -4,11 +4,13 @@
 define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp_8_8:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrldi 5, 4, 32
+; CHECK-NEXT:    clrldi 6, 3, 32
+; CHECK-NEXT:    sub 5, 5, 6
 ; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    sub 5, 4, 3
 ; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
 ; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
 ; CHECK-NEXT:    isellt 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
@@ -18,11 +20,13 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp_8_16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrldi 5, 4, 32
+; CHECK-NEXT:    clrldi 6, 3, 32
+; CHECK-NEXT:    sub 5, 5, 6
 ; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    sub 5, 4, 3
 ; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
 ; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
 ; CHECK-NEXT:    isellt 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
diff --git a/llvm/test/CodeGen/SystemZ/ucmp.ll b/llvm/test/CodeGen/SystemZ/ucmp.ll
index 4175cd7850a98..786f5610c2d1f 100644
--- a/llvm/test/CodeGen/SystemZ/ucmp.ll
+++ b/llvm/test/CodeGen/SystemZ/ucmp.ll
@@ -4,7 +4,7 @@
 define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp.8.8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    clr %r2, %r3
 ; CHECK-NEXT:    lhi %r2, 0
 ; CHECK-NEXT:    lochih %r2, 1
 ; CHECK-NEXT:    lochil %r2, -1
@@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp.8.16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    clr %r2, %r3
 ; CHECK-NEXT:    lhi %r2, 0
 ; CHECK-NEXT:    lochih %r2, 1
 ; CHECK-NEXT:    lochil %r2, -1
diff --git a/llvm/test/CodeGen/Thumb/scmp.ll b/llvm/test/CodeGen/Thumb/scmp.ll
index c0024492b3a6d..cf73771d3c426 100644
--- a/llvm/test/CodeGen/Thumb/scmp.ll
+++ b/llvm/test/CodeGen/Thumb/scmp.ll
@@ -184,44 +184,50 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
 ; THUMB1:       @ %bb.0:
 ; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
 ; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
-; THUMB1-NEXT:    .pad #20
-; THUMB1-NEXT:    sub sp, #20
-; THUMB1-NEXT:    str r3, [sp, #16] @ 4-byte Spill
-; THUMB1-NEXT:    movs r3, #1
-; THUMB1-NEXT:    str r3, [sp] @ 4-byte Spill
-; THUMB1-NEXT:    movs r3, #0
-; THUMB1-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; THUMB1-NEXT:    ldr r6, [sp, #52]
-; THUMB1-NEXT:    add r7, sp, #40
-; THUMB1-NEXT:    ldm r7, {r3, r5, r7}
-; THUMB1-NEXT:    subs r4, r0, r3
-; THUMB1-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT:    .pad #36
+; THUMB1-NEXT:    sub sp, #36
+; THUMB1-NEXT:    ldr r4, [sp, #68]
+; THUMB1-NEXT:    str r4, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT:    add r7, sp, #56
+; THUMB1-NEXT:    ldm r7, {r5, r6, r7}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    str r4, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT:    movs r4, #0
+; THUMB1-NEXT:    str r4, [sp, #24] @ 4-byte Spill
+; THUMB1-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; THUMB1-NEXT:    str r5, [sp, #12] @ 4-byte Spill
+; THUMB1-NEXT:    subs r4, r0, r5
+; THUMB1-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; THUMB1-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; THUMB1-NEXT:    mov r4, r1
-; THUMB1-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r4, r5
-; THUMB1-NEXT:    str r2, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT:    sbcs r4, r6
+; THUMB1-NEXT:    str r2, [sp, #28] @ 4-byte Spill
 ; THUMB1-NEXT:    mov r4, r2
+; THUMB1-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
 ; THUMB1-NEXT:    sbcs r4, r7
-; THUMB1-NEXT:    ldr r4, [sp, #16] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r4, r6
-; THUMB1-NEXT:    mov r2, r1
+; THUMB1-NEXT:    str r3, [sp, #32] @ 4-byte Spill
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    sbcs r4, r5
+; THUMB1-NEXT:    mov r1, r2
 ; THUMB1-NEXT:    blt .LBB4_2
 ; THUMB1-NEXT:  @ %bb.1:
-; THUMB1-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; THUMB1-NEXT:  .LBB4_2:
-; THUMB1-NEXT:    subs r0, r3, r0
-; THUMB1-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r5, r0
-; THUMB1-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r7, r0
 ; THUMB1-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    subs r0, r3, r0
+; THUMB1-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; THUMB1-NEXT:    sbcs r6, r0
+; THUMB1-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; THUMB1-NEXT:    sbcs r7, r0
+; THUMB1-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
+; THUMB1-NEXT:    sbcs r5, r0
 ; THUMB1-NEXT:    blt .LBB4_4
 ; THUMB1-NEXT:  @ %bb.3:
-; THUMB1-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r2, [sp, #24] @ 4-byte Reload
 ; THUMB1-NEXT:  .LBB4_4:
-; THUMB1-NEXT:    subs r0, r1, r2
-; THUMB1-NEXT:    add sp, #20
+; THUMB1-NEXT:    subs r0, r2, r1
+; THUMB1-NEXT:    add sp, #36
 ; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: scmp_8_128:
diff --git a/llvm/test/CodeGen/Thumb/ucmp.ll b/llvm/test/CodeGen/Thumb/ucmp.ll
index 5d0f57e2a9d72..e10a162ed0474 100644
--- a/llvm/test/CodeGen/Thumb/ucmp.ll
+++ b/llvm/test/CodeGen/Thumb/ucmp.ll
@@ -151,44 +151,50 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
 ; THUMB1:       @ %bb.0:
 ; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
 ; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
-; THUMB1-NEXT:    .pad #20
-; THUMB1-NEXT:    sub sp, #20
-; THUMB1-NEXT:    str r3, [sp, #16] @ 4-byte Spill
-; THUMB1-NEXT:    movs r3, #1
-; THUMB1-NEXT:    str r3, [sp] @ 4-byte Spill
-; THUMB1-NEXT:    movs r3, #0
-; THUMB1-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; THUMB1-NEXT:    ldr r6, [sp, #52]
-; THUMB1-NEXT:    add r7, sp, #40
-; THUMB1-NEXT:    ldm r7, {r3, r5, r7}
-; THUMB1-NEXT:    subs r4, r0, r3
-; THUMB1-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT:    .pad #36
+; THUMB1-NEXT:    sub sp, #36
+; THUMB1-NEXT:    ldr r4, [sp, #68]
+; THUMB1-NEXT:    str r4, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT:    add r7, sp, #56
+; THUMB1-NEXT:    ldm r7, {r5, r6, r7}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    str r4, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT:    movs r4, #0
+; THUMB1-NEXT:    str r4, [sp, #24] @ 4-byte Spill
+; THUMB1-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; THUMB1-NEXT:    str r5, [sp, #12] @ 4-byte Spill
+; THUMB1-NEXT:    subs r4, r0, r5
+; THUMB1-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; THUMB1-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; THUMB1-NEXT:    mov r4, r1
-; THUMB1-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r4, r5
-; THUMB1-NEXT:    str r2, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT:    sbcs r4, r6
+; THUMB1-NEXT:    str r2, [sp, #28] @ 4-byte Spill
 ; THUMB1-NEXT:    mov r4, r2
+; THUMB1-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
 ; THUMB1-NEXT:    sbcs r4, r7
-; THUMB1-NEXT:    ldr r4, [sp, #16] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r4, r6
-; THUMB1-NEXT:    mov r2, r1
+; THUMB1-NEXT:    str r3, [sp, #32] @ 4-byte Spill
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    sbcs r4, r5
+; THUMB1-NEXT:    mov r1, r2
 ; THUMB1-NEXT:    blo .LBB4_2
 ; THUMB1-NEXT:  @ %bb.1:
-; THUMB1-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
 ; THUMB1-NEXT:  .LBB4_2:
-; THUMB1-NEXT:    subs r0, r3, r0
-; THUMB1-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r5, r0
-; THUMB1-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; THUMB1-NEXT:    sbcs r7, r0
 ; THUMB1-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    subs r0, r3, r0
+; THUMB1-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; THUMB1-NEXT:    sbcs r6, r0
+; THUMB1-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; THUMB1-NEXT:    sbcs r7, r0
+; THUMB1-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
+; THUMB1-NEXT:    sbcs r5, r0
 ; THUMB1-NEXT:    blo .LBB4_4
 ; THUMB1-NEXT:  @ %bb.3:
-; THUMB1-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT:    ldr r2, [sp, #24] @ 4-byte Reload
 ; THUMB1-NEXT:  .LBB4_4:
-; THUMB1-NEXT:    subs r0, r1, r2
-; THUMB1-NEXT:    add sp, #20
+; THUMB1-NEXT:    subs r0, r2, r1
+; THUMB1-NEXT:    add sp, #36
 ; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ucmp_8_128:
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 8a287229a1cb1..5a7a05d09763e 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -17,7 +17,7 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: scmp.8.8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    subb %cl, %al
@@ -38,7 +38,7 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
 ; X86-LABEL: scmp.8.16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    cmpw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    subb %cl, %al
@@ -59,7 +59,7 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: scmp.8.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    subb %cl, %al
@@ -167,7 +167,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: scmp.32.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
@@ -263,7 +263,7 @@ define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: scmp_narrow_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    subb %cl, %al
@@ -330,9 +330,9 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind {
 ;
 ; X86-LABEL: scmp_wide_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %cl, %dl
@@ -471,27 +471,27 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %dl
 ; X86-NEXT:    setg %dh
 ; X86-NEXT:    subb %dl, %dh
 ; X86-NEXT:    movsbl %dh, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %bl
 ; X86-NEXT:    setg %bh
 ; X86-NEXT:    subb %bl, %bh
 ; X86-NEXT:    movsbl %bh, %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %bl
 ; X86-NEXT:    setg %bh
 ; X86-NEXT:    subb %bl, %bh
 ; X86-NEXT:    movsbl %bh, %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %ch
 ; X86-NEXT:    subb %cl, %ch
@@ -628,31 +628,31 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setl %ch
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %ch, %cl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    setl %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    setl %dh
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %dh, %dl
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    setl %dh
 ; X86-NEXT:    setg %bl
-; X86-NEXT:    subb %ch, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setl %ch
+; X86-NEXT:    subb %dh, %bl
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    setl %dh
 ; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %ch, %bh
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %dh, %bh
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %ch
-; X86-NEXT:    subb %dl, %ch
+; X86-NEXT:    subb %cl, %ch
 ; X86-NEXT:    movb %ch, 3(%eax)
 ; X86-NEXT:    movb %bh, 2(%eax)
 ; X86-NEXT:    movb %bl, 1(%eax)
-; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    movb %dl, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -713,27 +713,27 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb %dl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %dl
 ; X86-NEXT:    setg %dh
 ; X86-NEXT:    subb %dl, %dh
 ; X86-NEXT:    movsbl %dh, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    cmpb %bl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %bl
 ; X86-NEXT:    setg %bh
 ; X86-NEXT:    subb %bl, %bh
 ; X86-NEXT:    movsbl %bh, %esi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    cmpb %ch, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %ch
 ; X86-NEXT:    setg %bl
 ; X86-NEXT:    subb %ch, %bl
 ; X86-NEXT:    movsbl %bl, %edi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    cmpb %cl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %ch
 ; X86-NEXT:    subb %cl, %ch
@@ -869,90 +869,90 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    cmpb %bh, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    cmpb %bl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    cmpb %dh, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    cmpb %ch, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    cmpb %ah, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    cmpb %dl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %bh
 ; X86-NEXT:    subb %al, %bh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %bl
 ; X86-NEXT:    subb %al, %bl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dh
 ; X86-NEXT:    subb %al, %dh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
 ; X86-NEXT:    movsbl %dl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
 ; X86-NEXT:    movsbl %dl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
 ; X86-NEXT:    movsbl %dl, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
 ; X86-NEXT:    movsbl %dl, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %ah
 ; X86-NEXT:    subb %al, %ah
 ; X86-NEXT:    movsbl %ah, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
@@ -999,154 +999,179 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; SSE2-LABEL: scmp_wide_vec_op:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rax
 ; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
-; SSE2-NEXT:    movq %xmm7, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %xmm7, %rcx
+; SSE2-NEXT:    movd %eax, %xmm7
+; SSE2-NEXT:    movq %xmm8, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm7
-; SSE2-NEXT:    movq %xmm6, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; SSE2-NEXT:    movq %xmm6, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %xmm6, %rcx
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
 ; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
 ; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm5, %rcx
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movq %xmm4, %rcx
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movd %eax, %xmm8
+; SSE2-NEXT:    movd %eax, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; SSE2-NEXT:    movq %xmm4, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    movd %eax, %xmm8
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
 ; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
 ; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm3, %rcx
 ; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movq %xmm2, %rcx
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movq %xmm2, %rcx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    setl %al
-; SSE2-NEXT:    setg %dl
-; SSE2-NEXT:    subb %al, %dl
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT:    movzbl %dl, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
 ; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
+; SSE2-NEXT:    movq %xmm1, %rcx
 ; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    setl %al
 ; SSE2-NEXT:    setg %cl
 ; SSE2-NEXT:    subb %al, %cl
 ; SSE2-NEXT:    movzbl %cl, %eax
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    setl %al
+; SSE2-NEXT:    setg %cl
+; SSE2-NEXT:    subb %al, %cl
+; SSE2-NEXT:    movzbl %cl, %eax
 ; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: scmp_wide_vec_op:
 ; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15
 ; SSE4-NEXT:    pextrq $1, %xmm0, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm15, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    movq %xmm0, %rcx
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT:    movq %xmm15, %rdx
+; SSE4-NEXT:    cmpq %rdx, %rcx
 ; SSE4-NEXT:    setl %cl
 ; SSE4-NEXT:    setg %dl
 ; SSE4-NEXT:    subb %cl, %dl
@@ -1154,98 +1179,112 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; SSE4-NEXT:    movd %ecx, %xmm0
 ; SSE4-NEXT:    pinsrb $1, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm1, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm14, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $2, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm1, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm14, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm2, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm13, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $4, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm2, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm13, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $5, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm3, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm12, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $6, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm3, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm12, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $7, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm4, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm11, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $8, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm4, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm11, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $9, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm5, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm10, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $10, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm5, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm10, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $11, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm6, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm9, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $12, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm6, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm9, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $13, %eax, %xmm0
 ; SSE4-NEXT:    movq %xmm7, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq %xmm8, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
 ; SSE4-NEXT:    movzbl %cl, %eax
 ; SSE4-NEXT:    pinsrb $14, %eax, %xmm0
 ; SSE4-NEXT:    pextrq $1, %xmm7, %rax
-; SSE4-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    pextrq $1, %xmm8, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
 ; SSE4-NEXT:    setl %al
 ; SSE4-NEXT:    setg %cl
 ; SSE4-NEXT:    subb %al, %cl
@@ -1767,58 +1806,71 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
 ; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT:    addb %dil, %dil
+; SSE2-NEXT:    sarb %dil
+; SSE2-NEXT:    addb %sil, %sil
+; SSE2-NEXT:    sarb %sil
+; SSE2-NEXT:    cmpb %dil, %sil
+; SSE2-NEXT:    setl %sil
+; SSE2-NEXT:    setg %dil
+; SSE2-NEXT:    subb %sil, %dil
+; SSE2-NEXT:    movsbq %dil, %rdi
+; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rdi, (%rax)
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    addb %bl, %bl
+; SSE2-NEXT:    sarb %bl
+; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    addb %sil, %sil
+; SSE2-NEXT:    sarb %sil
+; SSE2-NEXT:    cmpb %bl, %sil
+; SSE2-NEXT:    setl %sil
+; SSE2-NEXT:    setg %bl
+; SSE2-NEXT:    subb %sil, %bl
+; SSE2-NEXT:    movsbq %bl, %rbx
+; SSE2-NEXT:    movq %rbx, %r14
+; SSE2-NEXT:    sarq $63, %r14
 ; SSE2-NEXT:    addb %r15b, %r15b
 ; SSE2-NEXT:    sarb %r15b
+; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %esi
 ; SSE2-NEXT:    addb %sil, %sil
 ; SSE2-NEXT:    sarb %sil
 ; SSE2-NEXT:    cmpb %r15b, %sil
 ; SSE2-NEXT:    setl %sil
 ; SSE2-NEXT:    setg %r15b
 ; SSE2-NEXT:    subb %sil, %r15b
-; SSE2-NEXT:    movsbq %r15b, %rsi
-; SSE2-NEXT:    movq %rsi, (%rax)
-; SSE2-NEXT:    movq %rsi, %xmm0
-; SSE2-NEXT:    sarq $63, %rsi
-; SSE2-NEXT:    addb %r14b, %r14b
-; SSE2-NEXT:    sarb %r14b
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; SSE2-NEXT:    addb %r15b, %r15b
-; SSE2-NEXT:    sarb %r15b
-; SSE2-NEXT:    cmpb %r14b, %r15b
-; SSE2-NEXT:    setl %r14b
-; SSE2-NEXT:    setg %r15b
-; SSE2-NEXT:    subb %r14b, %r15b
-; SSE2-NEXT:    movsbq %r15b, %r14
-; SSE2-NEXT:    movq %r14, %r15
-; SSE2-NEXT:    sarq $63, %r15
-; SSE2-NEXT:    addb %bpl, %bpl
-; SSE2-NEXT:    sarb %bpl
+; SSE2-NEXT:    movsbq %r15b, %r15
+; SSE2-NEXT:    movq %r15, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    addb %r12b, %r12b
+; SSE2-NEXT:    sarb %r12b
 ; SSE2-NEXT:    addb %dl, %dl
 ; SSE2-NEXT:    sarb %dl
-; SSE2-NEXT:    cmpb %bpl, %dl
+; SSE2-NEXT:    cmpb %r12b, %dl
 ; SSE2-NEXT:    setl %dl
-; SSE2-NEXT:    setg %bpl
-; SSE2-NEXT:    subb %dl, %bpl
-; SSE2-NEXT:    movsbq %bpl, %rdx
-; SSE2-NEXT:    movq %rdx, %r12
-; SSE2-NEXT:    sarq $63, %r12
-; SSE2-NEXT:    addb %bl, %bl
-; SSE2-NEXT:    sarb %bl
+; SSE2-NEXT:    setg %sil
+; SSE2-NEXT:    subb %dl, %sil
+; SSE2-NEXT:    movsbq %sil, %rdx
+; SSE2-NEXT:    movq %rdx, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    addb %bpl, %bpl
+; SSE2-NEXT:    sarb %bpl
 ; SSE2-NEXT:    addb %cl, %cl
 ; SSE2-NEXT:    sarb %cl
-; SSE2-NEXT:    cmpb %bl, %cl
+; SSE2-NEXT:    cmpb %bpl, %cl
 ; SSE2-NEXT:    setl %cl
-; SSE2-NEXT:    setg %bl
-; SSE2-NEXT:    subb %cl, %bl
-; SSE2-NEXT:    movsbq %bl, %rbx
-; SSE2-NEXT:    movq %rbx, %rcx
+; SSE2-NEXT:    setg %bpl
+; SSE2-NEXT:    subb %cl, %bpl
+; SSE2-NEXT:    movsbq %bpl, %r12
+; SSE2-NEXT:    movq %r12, %rcx
 ; SSE2-NEXT:    sarq $63, %rcx
 ; SSE2-NEXT:    addb %r11b, %r11b
 ; SSE2-NEXT:    sarb %r11b
@@ -1828,9 +1880,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; SSE2-NEXT:    setl %r8b
 ; SSE2-NEXT:    setg %r11b
 ; SSE2-NEXT:    subb %r8b, %r11b
-; SSE2-NEXT:    movsbq %r11b, %r8
-; SSE2-NEXT:    movq %r8, %r11
-; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movsbq %r11b, %rsi
+; SSE2-NEXT:    movq %rsi, %r8
+; SSE2-NEXT:    sarq $63, %r8
 ; SSE2-NEXT:    addb %r10b, %r10b
 ; SSE2-NEXT:    sarb %r10b
 ; SSE2-NEXT:    addb %r9b, %r9b
@@ -1842,68 +1894,59 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; SSE2-NEXT:    movsbq %r10b, %r9
 ; SSE2-NEXT:    movq %r9, %r10
 ; SSE2-NEXT:    sarq $63, %r10
-; SSE2-NEXT:    addb %dil, %dil
-; SSE2-NEXT:    sarb %dil
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT:    addb %bpl, %bpl
-; SSE2-NEXT:    sarb %bpl
-; SSE2-NEXT:    cmpb %dil, %bpl
-; SSE2-NEXT:    setl %dil
-; SSE2-NEXT:    setg %bpl
-; SSE2-NEXT:    subb %dil, %bpl
-; SSE2-NEXT:    movsbq %bpl, %rdi
-; SSE2-NEXT:    movq %rdi, %r13
-; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %r10, %rbp
+; SSE2-NEXT:    shldq $20, %r9, %rbp
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero
+; SSE2-NEXT:    movq %r8, %r11
+; SSE2-NEXT:    shldq $31, %rsi, %r11
+; SSE2-NEXT:    movq %rbp, 64(%rax)
+; SSE2-NEXT:    movq %rcx, %rbp
+; SSE2-NEXT:    shldq $42, %r12, %rbp
+; SSE2-NEXT:    movq %r11, 48(%rax)
+; SSE2-NEXT:    movq %rbp, 32(%rax)
+; SSE2-NEXT:    movabsq $9007199254738944, %r11 # imm = 0x1FFFFFFFFFF800
+; SSE2-NEXT:    andq %rdi, %r11
+; SSE2-NEXT:    shldq $53, %rdx, %rdi
+; SSE2-NEXT:    movq %rdi, 16(%rax)
 ; SSE2-NEXT:    movl %r13d, 96(%rax)
-; SSE2-NEXT:    movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF
-; SSE2-NEXT:    andq %r13, %rbp
-; SSE2-NEXT:    shldq $62, %rdi, %r13
+; SSE2-NEXT:    movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF
+; SSE2-NEXT:    andq %r13, %rdi
+; SSE2-NEXT:    shldq $62, %r15, %r13
 ; SSE2-NEXT:    movq %r13, 88(%rax)
-; SSE2-NEXT:    movq %r10, %r13
-; SSE2-NEXT:    shldq $20, %r9, %r13
-; SSE2-NEXT:    movq %r13, 64(%rax)
-; SSE2-NEXT:    movq %r11, %r13
-; SSE2-NEXT:    shldq $31, %r8, %r13
-; SSE2-NEXT:    movq %r13, 48(%rax)
-; SSE2-NEXT:    movq %rcx, %r13
-; SSE2-NEXT:    shldq $42, %rbx, %r13
-; SSE2-NEXT:    movq %r13, 32(%rax)
-; SSE2-NEXT:    movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; SSE2-NEXT:    andq %r12, %r13
-; SSE2-NEXT:    shldq $53, %rdx, %r12
-; SSE2-NEXT:    movq %r12, 16(%rax)
-; SSE2-NEXT:    movq %rbp, %r12
-; SSE2-NEXT:    shrq $48, %r12
-; SSE2-NEXT:    movb %r12b, 102(%rax)
-; SSE2-NEXT:    shrq $32, %rbp
-; SSE2-NEXT:    movw %bp, 100(%rax)
-; SSE2-NEXT:    movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
-; SSE2-NEXT:    andq %r12, %r15
-; SSE2-NEXT:    shldq $9, %r14, %r15
-; SSE2-NEXT:    shlq $62, %rdi
-; SSE2-NEXT:    orq %r15, %rdi
-; SSE2-NEXT:    movq %rdi, 80(%rax)
-; SSE2-NEXT:    shlq $42, %rbx
-; SSE2-NEXT:    shrq $11, %r13
-; SSE2-NEXT:    orq %rbx, %r13
-; SSE2-NEXT:    movq %r13, 24(%rax)
-; SSE2-NEXT:    shlq $9, %r14
+; SSE2-NEXT:    shlq $42, %r12
+; SSE2-NEXT:    shrq $11, %r11
+; SSE2-NEXT:    orq %r12, %r11
+; SSE2-NEXT:    movq %r11, 24(%rax)
+; SSE2-NEXT:    movq %rdi, %r11
+; SSE2-NEXT:    shrq $48, %r11
+; SSE2-NEXT:    movb %r11b, 102(%rax)
+; SSE2-NEXT:    shrq $32, %rdi
+; SSE2-NEXT:    movw %di, 100(%rax)
+; SSE2-NEXT:    movabsq $9007199254740991, %rdi # imm = 0x1FFFFFFFFFFFFF
+; SSE2-NEXT:    andq %rdi, %r14
+; SSE2-NEXT:    shldq $9, %rbx, %r14
+; SSE2-NEXT:    shlq $62, %r15
+; SSE2-NEXT:    orq %r14, %r15
+; SSE2-NEXT:    movq %r15, 80(%rax)
+; SSE2-NEXT:    shlq $9, %rbx
 ; SSE2-NEXT:    andl $511, %r10d # imm = 0x1FF
-; SSE2-NEXT:    orq %r14, %r10
+; SSE2-NEXT:    orq %rbx, %r10
 ; SSE2-NEXT:    movq %r10, 72(%rax)
 ; SSE2-NEXT:    shlq $20, %r9
-; SSE2-NEXT:    andl $1048575, %r11d # imm = 0xFFFFF
-; SSE2-NEXT:    orq %r9, %r11
-; SSE2-NEXT:    movq %r11, 56(%rax)
-; SSE2-NEXT:    shlq $31, %r8
+; SSE2-NEXT:    andl $1048575, %r8d # imm = 0xFFFFF
+; SSE2-NEXT:    orq %r9, %r8
+; SSE2-NEXT:    movq %r8, 56(%rax)
+; SSE2-NEXT:    shlq $31, %rsi
 ; SSE2-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; SSE2-NEXT:    orq %r8, %rcx
+; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    movq %rcx, 40(%rax)
-; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
+; SSE2-NEXT:    # xmm1 = mem[0],zero
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    andq %r12, %rcx
+; SSE2-NEXT:    andq %rdi, %rcx
 ; SSE2-NEXT:    shlq $53, %rdx
 ; SSE2-NEXT:    orq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, 8(%rax)
@@ -1924,140 +1967,143 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; SSE4-NEXT:    pushq %r12
 ; SSE4-NEXT:    pushq %rbx
 ; SSE4-NEXT:    movq %rdi, %rax
-; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
+; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
 ; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
 ; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; SSE4-NEXT:    addb %dil, %dil
+; SSE4-NEXT:    sarb %dil
+; SSE4-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; SSE4-NEXT:    addb %r10b, %r10b
+; SSE4-NEXT:    sarb %r10b
+; SSE4-NEXT:    cmpb %dil, %r10b
+; SSE4-NEXT:    setl %dil
+; SSE4-NEXT:    setg %r10b
+; SSE4-NEXT:    subb %dil, %r10b
+; SSE4-NEXT:    movsbq %r10b, %r13
+; SSE4-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    addb %r11b, %r11b
+; SSE4-NEXT:    sarb %r11b
+; SSE4-NEXT:    addb %sil, %sil
+; SSE4-NEXT:    sarb %sil
+; SSE4-NEXT:    cmpb %r11b, %sil
+; SSE4-NEXT:    setl %sil
+; SSE4-NEXT:    setg %r11b
+; SSE4-NEXT:    subb %sil, %r11b
+; SSE4-NEXT:    movsbq %r11b, %r11
+; SSE4-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r11
 ; SSE4-NEXT:    addb %r14b, %r14b
 ; SSE4-NEXT:    sarb %r14b
+; SSE4-NEXT:    movl {{[0-9]+}}(%rsp), %esi
 ; SSE4-NEXT:    addb %sil, %sil
 ; SSE4-NEXT:    sarb %sil
 ; SSE4-NEXT:    cmpb %r14b, %sil
 ; SSE4-NEXT:    setl %sil
 ; SSE4-NEXT:    setg %r14b
 ; SSE4-NEXT:    subb %sil, %r14b
-; SSE4-NEXT:    movsbq %r14b, %r14
-; SSE4-NEXT:    movq %r14, (%rax)
+; SSE4-NEXT:    movsbq %r14b, %rsi
+; SSE4-NEXT:    movq %rsi, %r14
 ; SSE4-NEXT:    sarq $63, %r14
 ; SSE4-NEXT:    addb %r15b, %r15b
 ; SSE4-NEXT:    sarb %r15b
-; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    addb %sil, %sil
-; SSE4-NEXT:    sarb %sil
-; SSE4-NEXT:    cmpb %r15b, %sil
-; SSE4-NEXT:    setl %sil
-; SSE4-NEXT:    setg %r15b
-; SSE4-NEXT:    subb %sil, %r15b
-; SSE4-NEXT:    movsbq %r15b, %rsi
-; SSE4-NEXT:    movq %rsi, %r15
-; SSE4-NEXT:    sarq $63, %r15
-; SSE4-NEXT:    addb %bpl, %bpl
-; SSE4-NEXT:    sarb %bpl
 ; SSE4-NEXT:    addb %dl, %dl
 ; SSE4-NEXT:    sarb %dl
-; SSE4-NEXT:    cmpb %bpl, %dl
+; SSE4-NEXT:    cmpb %r15b, %dl
 ; SSE4-NEXT:    setl %dl
-; SSE4-NEXT:    setg %bpl
-; SSE4-NEXT:    subb %dl, %bpl
-; SSE4-NEXT:    movsbq %bpl, %r12
-; SSE4-NEXT:    movq %r12, %r13
-; SSE4-NEXT:    sarq $63, %r13
-; SSE4-NEXT:    addb %bl, %bl
-; SSE4-NEXT:    sarb %bl
+; SSE4-NEXT:    setg %r15b
+; SSE4-NEXT:    subb %dl, %r15b
+; SSE4-NEXT:    movsbq %r15b, %r15
+; SSE4-NEXT:    movq %r15, %rdi
+; SSE4-NEXT:    sarq $63, %rdi
+; SSE4-NEXT:    addb %r12b, %r12b
+; SSE4-NEXT:    sarb %r12b
 ; SSE4-NEXT:    addb %cl, %cl
 ; SSE4-NEXT:    sarb %cl
-; SSE4-NEXT:    cmpb %bl, %cl
+; SSE4-NEXT:    cmpb %r12b, %cl
 ; SSE4-NEXT:    setl %cl
-; SSE4-NEXT:    setg %dl
-; SSE4-NEXT:    subb %cl, %dl
-; SSE4-NEXT:    movsbq %dl, %rbx
-; SSE4-NEXT:    movq %rbx, %rcx
+; SSE4-NEXT:    setg %r12b
+; SSE4-NEXT:    subb %cl, %r12b
+; SSE4-NEXT:    movsbq %r12b, %r12
+; SSE4-NEXT:    movq %r12, %rcx
 ; SSE4-NEXT:    sarq $63, %rcx
-; SSE4-NEXT:    addb %r11b, %r11b
-; SSE4-NEXT:    sarb %r11b
+; SSE4-NEXT:    addb %bpl, %bpl
+; SSE4-NEXT:    sarb %bpl
 ; SSE4-NEXT:    addb %r8b, %r8b
 ; SSE4-NEXT:    sarb %r8b
-; SSE4-NEXT:    cmpb %r11b, %r8b
-; SSE4-NEXT:    setl %dl
-; SSE4-NEXT:    setg %r8b
-; SSE4-NEXT:    subb %dl, %r8b
-; SSE4-NEXT:    movsbq %r8b, %rdx
-; SSE4-NEXT:    movq %rdx, %r8
-; SSE4-NEXT:    sarq $63, %r8
-; SSE4-NEXT:    addb %r10b, %r10b
-; SSE4-NEXT:    sarb %r10b
+; SSE4-NEXT:    cmpb %bpl, %r8b
+; SSE4-NEXT:    setl %r8b
+; SSE4-NEXT:    setg %bpl
+; SSE4-NEXT:    subb %r8b, %bpl
+; SSE4-NEXT:    movsbq %bpl, %r10
+; SSE4-NEXT:    movq %r10, %rbp
+; SSE4-NEXT:    sarq $63, %rbp
+; SSE4-NEXT:    addb %bl, %bl
+; SSE4-NEXT:    sarb %bl
 ; SSE4-NEXT:    addb %r9b, %r9b
 ; SSE4-NEXT:    sarb %r9b
-; SSE4-NEXT:    cmpb %r10b, %r9b
+; SSE4-NEXT:    cmpb %bl, %r9b
 ; SSE4-NEXT:    setl %r9b
-; SSE4-NEXT:    setg %r10b
-; SSE4-NEXT:    subb %r9b, %r10b
-; SSE4-NEXT:    movsbq %r10b, %r9
-; SSE4-NEXT:    movq %r9, %r10
-; SSE4-NEXT:    sarq $63, %r10
-; SSE4-NEXT:    addb %dil, %dil
-; SSE4-NEXT:    sarb %dil
-; SSE4-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE4-NEXT:    addb %r11b, %r11b
-; SSE4-NEXT:    sarb %r11b
-; SSE4-NEXT:    cmpb %dil, %r11b
-; SSE4-NEXT:    setl %dil
-; SSE4-NEXT:    setg %r11b
-; SSE4-NEXT:    subb %dil, %r11b
-; SSE4-NEXT:    movsbq %r11b, %rdi
-; SSE4-NEXT:    movq %rdi, %rbp
-; SSE4-NEXT:    sarq $63, %rbp
-; SSE4-NEXT:    movl %ebp, 96(%rax)
-; SSE4-NEXT:    movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF
-; SSE4-NEXT:    andq %rbp, %r11
-; SSE4-NEXT:    shldq $62, %rdi, %rbp
-; SSE4-NEXT:    movq %rbp, 88(%rax)
-; SSE4-NEXT:    movq %r10, %rbp
-; SSE4-NEXT:    shldq $20, %r9, %rbp
-; SSE4-NEXT:    movq %rbp, 64(%rax)
-; SSE4-NEXT:    movq %r8, %rbp
-; SSE4-NEXT:    shldq $31, %rdx, %rbp
-; SSE4-NEXT:    movq %rbp, 48(%rax)
-; SSE4-NEXT:    movq %rcx, %rbp
-; SSE4-NEXT:    shldq $42, %rbx, %rbp
-; SSE4-NEXT:    movq %rbp, 32(%rax)
-; SSE4-NEXT:    movabsq $9007199254738944, %rbp # imm = 0x1FFFFFFFFFF800
-; SSE4-NEXT:    andq %r13, %rbp
-; SSE4-NEXT:    shldq $53, %r12, %r13
-; SSE4-NEXT:    movq %r13, 16(%rax)
-; SSE4-NEXT:    movq %r11, %r13
-; SSE4-NEXT:    shrq $48, %r13
-; SSE4-NEXT:    movb %r13b, 102(%rax)
-; SSE4-NEXT:    shrq $32, %r11
-; SSE4-NEXT:    movw %r11w, 100(%rax)
-; SSE4-NEXT:    movabsq $9007199254740991, %r11 # imm = 0x1FFFFFFFFFFFFF
-; SSE4-NEXT:    andq %r11, %r15
-; SSE4-NEXT:    shldq $9, %rsi, %r15
-; SSE4-NEXT:    shlq $62, %rdi
-; SSE4-NEXT:    orq %r15, %rdi
-; SSE4-NEXT:    movq %rdi, 80(%rax)
-; SSE4-NEXT:    andq %r11, %r14
-; SSE4-NEXT:    shlq $53, %r12
-; SSE4-NEXT:    orq %r14, %r12
-; SSE4-NEXT:    movq %r12, 8(%rax)
-; SSE4-NEXT:    shlq $42, %rbx
-; SSE4-NEXT:    shrq $11, %rbp
-; SSE4-NEXT:    orq %rbx, %rbp
-; SSE4-NEXT:    movq %rbp, 24(%rax)
-; SSE4-NEXT:    shlq $9, %rsi
-; SSE4-NEXT:    andl $511, %r10d # imm = 0x1FF
-; SSE4-NEXT:    orq %rsi, %r10
-; SSE4-NEXT:    movq %r10, 72(%rax)
-; SSE4-NEXT:    shlq $20, %r9
-; SSE4-NEXT:    andl $1048575, %r8d # imm = 0xFFFFF
-; SSE4-NEXT:    orq %r9, %r8
-; SSE4-NEXT:    movq %r8, 56(%rax)
-; SSE4-NEXT:    shlq $31, %rdx
+; SSE4-NEXT:    setg %bl
+; SSE4-NEXT:    subb %r9b, %bl
+; SSE4-NEXT:    movsbq %bl, %rdx
+; SSE4-NEXT:    movq %rdx, %r9
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %r9, %rbx
+; SSE4-NEXT:    shldq $20, %rdx, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    movq %r8, (%rax)
+; SSE4-NEXT:    movq %rbp, %r8
+; SSE4-NEXT:    shldq $31, %r10, %r8
+; SSE4-NEXT:    movq %rbx, 64(%rax)
+; SSE4-NEXT:    movq %rcx, %rbx
+; SSE4-NEXT:    shldq $42, %r12, %rbx
+; SSE4-NEXT:    movq %r8, 48(%rax)
+; SSE4-NEXT:    movq %rbx, 32(%rax)
+; SSE4-NEXT:    movabsq $9007199254738944, %r8 # imm = 0x1FFFFFFFFFF800
+; SSE4-NEXT:    andq %rdi, %r8
+; SSE4-NEXT:    shldq $53, %r15, %rdi
+; SSE4-NEXT:    movq %rdi, 16(%rax)
+; SSE4-NEXT:    movl %r14d, 96(%rax)
+; SSE4-NEXT:    movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF
+; SSE4-NEXT:    andq %r14, %rdi
+; SSE4-NEXT:    shldq $62, %rsi, %r14
+; SSE4-NEXT:    movq %r14, 88(%rax)
+; SSE4-NEXT:    movabsq $9007199254740991, %rbx # imm = 0x1FFFFFFFFFFFFF
+; SSE4-NEXT:    andq %rbx, %r11
+; SSE4-NEXT:    shlq $53, %r15
+; SSE4-NEXT:    orq %r11, %r15
+; SSE4-NEXT:    movq %r15, 8(%rax)
+; SSE4-NEXT:    shlq $42, %r12
+; SSE4-NEXT:    shrq $11, %r8
+; SSE4-NEXT:    orq %r12, %r8
+; SSE4-NEXT:    movq %r8, 24(%rax)
+; SSE4-NEXT:    movq %rdi, %r8
+; SSE4-NEXT:    shrq $48, %r8
+; SSE4-NEXT:    movb %r8b, 102(%rax)
+; SSE4-NEXT:    shrq $32, %rdi
+; SSE4-NEXT:    movw %di, 100(%rax)
+; SSE4-NEXT:    andq %rbx, %r13
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE4-NEXT:    shldq $9, %rdi, %r13
+; SSE4-NEXT:    shlq $62, %rsi
+; SSE4-NEXT:    orq %r13, %rsi
+; SSE4-NEXT:    movq %rsi, 80(%rax)
+; SSE4-NEXT:    shlq $9, %rdi
+; SSE4-NEXT:    andl $511, %r9d # imm = 0x1FF
+; SSE4-NEXT:    orq %rdi, %r9
+; SSE4-NEXT:    movq %r9, 72(%rax)
+; SSE4-NEXT:    shlq $20, %rdx
+; SSE4-NEXT:    andl $1048575, %ebp # imm = 0xFFFFF
+; SSE4-NEXT:    orq %rdx, %rbp
+; SSE4-NEXT:    movq %rbp, 56(%rax)
+; SSE4-NEXT:    shlq $31, %r10
 ; SSE4-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; SSE4-NEXT:    orq %rdx, %rcx
+; SSE4-NEXT:    orq %r10, %rcx
 ; SSE4-NEXT:    movq %rcx, 40(%rax)
 ; SSE4-NEXT:    popq %rbx
 ; SSE4-NEXT:    popq %r12
@@ -2076,132 +2122,132 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; AVX-NEXT:    pushq %r12
 ; AVX-NEXT:    pushq %rbx
 ; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; AVX-NEXT:    addb %r14b, %r14b
-; AVX-NEXT:    sarb %r14b
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; AVX-NEXT:    addb %dil, %dil
+; AVX-NEXT:    sarb %dil
+; AVX-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT:    addb %r10b, %r10b
+; AVX-NEXT:    sarb %r10b
+; AVX-NEXT:    cmpb %dil, %r10b
+; AVX-NEXT:    setl %dil
+; AVX-NEXT:    setg %r10b
+; AVX-NEXT:    subb %dil, %r10b
+; AVX-NEXT:    movsbq %r10b, %rdi
+; AVX-NEXT:    movq %rdi, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    addb %bpl, %bpl
+; AVX-NEXT:    sarb %bpl
 ; AVX-NEXT:    addb %sil, %sil
 ; AVX-NEXT:    sarb %sil
-; AVX-NEXT:    cmpb %r14b, %sil
+; AVX-NEXT:    cmpb %bpl, %sil
 ; AVX-NEXT:    setl %sil
-; AVX-NEXT:    setg %r14b
-; AVX-NEXT:    subb %sil, %r14b
-; AVX-NEXT:    movsbq %r14b, %r14
-; AVX-NEXT:    movq %r14, (%rax)
-; AVX-NEXT:    sarq $63, %r14
+; AVX-NEXT:    setg %bpl
+; AVX-NEXT:    subb %sil, %bpl
+; AVX-NEXT:    movsbq %bpl, %r12
+; AVX-NEXT:    movq %r12, (%rax)
+; AVX-NEXT:    sarq $63, %r12
 ; AVX-NEXT:    addb %r15b, %r15b
 ; AVX-NEXT:    sarb %r15b
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; AVX-NEXT:    movl {{[0-9]+}}(%rsp), %esi
 ; AVX-NEXT:    addb %sil, %sil
 ; AVX-NEXT:    sarb %sil
 ; AVX-NEXT:    cmpb %r15b, %sil
 ; AVX-NEXT:    setl %sil
-; AVX-NEXT:    setg %r15b
-; AVX-NEXT:    subb %sil, %r15b
-; AVX-NEXT:    movsbq %r15b, %rsi
-; AVX-NEXT:    movq %rsi, %r12
-; AVX-NEXT:    sarq $63, %r12
-; AVX-NEXT:    addb %bpl, %bpl
-; AVX-NEXT:    sarb %bpl
+; AVX-NEXT:    setg %bpl
+; AVX-NEXT:    subb %sil, %bpl
+; AVX-NEXT:    movsbq %bpl, %rsi
+; AVX-NEXT:    movq %rsi, %r15
+; AVX-NEXT:    sarq $63, %r15
+; AVX-NEXT:    addb %r14b, %r14b
+; AVX-NEXT:    sarb %r14b
 ; AVX-NEXT:    addb %dl, %dl
 ; AVX-NEXT:    sarb %dl
-; AVX-NEXT:    cmpb %bpl, %dl
+; AVX-NEXT:    cmpb %r14b, %dl
 ; AVX-NEXT:    setl %dl
 ; AVX-NEXT:    setg %bpl
 ; AVX-NEXT:    subb %dl, %bpl
-; AVX-NEXT:    movsbq %bpl, %r15
-; AVX-NEXT:    movq %r15, %r13
-; AVX-NEXT:    sarq $63, %r13
-; AVX-NEXT:    addb %bl, %bl
-; AVX-NEXT:    sarb %bl
+; AVX-NEXT:    movsbq %bpl, %r14
+; AVX-NEXT:    movq %r14, %rbp
+; AVX-NEXT:    sarq $63, %rbp
+; AVX-NEXT:    addb %r13b, %r13b
+; AVX-NEXT:    sarb %r13b
 ; AVX-NEXT:    addb %cl, %cl
 ; AVX-NEXT:    sarb %cl
-; AVX-NEXT:    cmpb %bl, %cl
+; AVX-NEXT:    cmpb %r13b, %cl
 ; AVX-NEXT:    setl %cl
 ; AVX-NEXT:    setg %dl
 ; AVX-NEXT:    subb %cl, %dl
-; AVX-NEXT:    movsbq %dl, %rbx
-; AVX-NEXT:    movq %rbx, %rcx
+; AVX-NEXT:    movsbq %dl, %r13
+; AVX-NEXT:    movq %r13, %rcx
 ; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    addb %r11b, %r11b
-; AVX-NEXT:    sarb %r11b
+; AVX-NEXT:    addb %bl, %bl
+; AVX-NEXT:    sarb %bl
 ; AVX-NEXT:    addb %r8b, %r8b
 ; AVX-NEXT:    sarb %r8b
-; AVX-NEXT:    cmpb %r11b, %r8b
+; AVX-NEXT:    cmpb %bl, %r8b
 ; AVX-NEXT:    setl %dl
 ; AVX-NEXT:    setg %r8b
 ; AVX-NEXT:    subb %dl, %r8b
 ; AVX-NEXT:    movsbq %r8b, %rdx
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    sarq $63, %r8
-; AVX-NEXT:    addb %r10b, %r10b
-; AVX-NEXT:    sarb %r10b
+; AVX-NEXT:    addb %r11b, %r11b
+; AVX-NEXT:    sarb %r11b
 ; AVX-NEXT:    addb %r9b, %r9b
 ; AVX-NEXT:    sarb %r9b
-; AVX-NEXT:    cmpb %r10b, %r9b
+; AVX-NEXT:    cmpb %r11b, %r9b
 ; AVX-NEXT:    setl %r9b
-; AVX-NEXT:    setg %r10b
-; AVX-NEXT:    subb %r9b, %r10b
-; AVX-NEXT:    movsbq %r10b, %r9
-; AVX-NEXT:    movq %r9, %r10
-; AVX-NEXT:    sarq $63, %r10
-; AVX-NEXT:    addb %dil, %dil
-; AVX-NEXT:    sarb %dil
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; AVX-NEXT:    addb %r11b, %r11b
-; AVX-NEXT:    sarb %r11b
-; AVX-NEXT:    cmpb %dil, %r11b
-; AVX-NEXT:    setl %dil
 ; AVX-NEXT:    setg %r11b
-; AVX-NEXT:    subb %dil, %r11b
-; AVX-NEXT:    movsbq %r11b, %rdi
-; AVX-NEXT:    movq %rdi, %rbp
-; AVX-NEXT:    sarq $63, %rbp
-; AVX-NEXT:    movl %ebp, 96(%rax)
-; AVX-NEXT:    movb $51, %r11b
-; AVX-NEXT:    bzhiq %r11, %rbp, %r11
-; AVX-NEXT:    shldq $62, %rdi, %rbp
-; AVX-NEXT:    movq %rbp, 88(%rax)
-; AVX-NEXT:    movq %r10, %rbp
-; AVX-NEXT:    shldq $20, %r9, %rbp
-; AVX-NEXT:    movq %rbp, 64(%rax)
-; AVX-NEXT:    movq %r8, %rbp
-; AVX-NEXT:    shldq $31, %rdx, %rbp
-; AVX-NEXT:    movq %rbp, 48(%rax)
-; AVX-NEXT:    movq %rcx, %rbp
-; AVX-NEXT:    shldq $42, %rbx, %rbp
-; AVX-NEXT:    movq %rbp, 32(%rax)
-; AVX-NEXT:    movb $42, %bpl
-; AVX-NEXT:    bzhiq %rbp, %r13, %rbp
-; AVX-NEXT:    shldq $53, %r15, %r13
-; AVX-NEXT:    movq %r13, 16(%rax)
-; AVX-NEXT:    movq %r11, %r13
-; AVX-NEXT:    shrq $48, %r13
-; AVX-NEXT:    movb %r13b, 102(%rax)
-; AVX-NEXT:    shrq $32, %r11
-; AVX-NEXT:    movw %r11w, 100(%rax)
-; AVX-NEXT:    movb $53, %r11b
-; AVX-NEXT:    bzhiq %r11, %r12, %r12
-; AVX-NEXT:    shldq $9, %rsi, %r12
-; AVX-NEXT:    shlq $62, %rdi
-; AVX-NEXT:    orq %r12, %rdi
-; AVX-NEXT:    movq %rdi, 80(%rax)
-; AVX-NEXT:    shlq $42, %rbx
-; AVX-NEXT:    orq %rbp, %rbx
-; AVX-NEXT:    movq %rbx, 24(%rax)
-; AVX-NEXT:    bzhiq %r11, %r14, %rdi
-; AVX-NEXT:    shlq $53, %r15
-; AVX-NEXT:    orq %rdi, %r15
-; AVX-NEXT:    movq %r15, 8(%rax)
-; AVX-NEXT:    shlq $9, %rsi
-; AVX-NEXT:    andl $511, %r10d # imm = 0x1FF
-; AVX-NEXT:    orq %rsi, %r10
-; AVX-NEXT:    movq %r10, 72(%rax)
+; AVX-NEXT:    subb %r9b, %r11b
+; AVX-NEXT:    movsbq %r11b, %r9
+; AVX-NEXT:    movq %r9, %r11
+; AVX-NEXT:    sarq $63, %r11
+; AVX-NEXT:    movq %r11, %rbx
+; AVX-NEXT:    shldq $20, %r9, %rbx
+; AVX-NEXT:    movq %rbx, 64(%rax)
+; AVX-NEXT:    movq %r8, %rbx
+; AVX-NEXT:    shldq $31, %rdx, %rbx
+; AVX-NEXT:    movq %rbx, 48(%rax)
+; AVX-NEXT:    movq %rcx, %rbx
+; AVX-NEXT:    shldq $42, %r13, %rbx
+; AVX-NEXT:    movq %rbx, 32(%rax)
+; AVX-NEXT:    movb $42, %bl
+; AVX-NEXT:    bzhiq %rbx, %rbp, %rbx
+; AVX-NEXT:    shldq $53, %r14, %rbp
+; AVX-NEXT:    movq %rbp, 16(%rax)
+; AVX-NEXT:    movl %r15d, 96(%rax)
+; AVX-NEXT:    movb $51, %bpl
+; AVX-NEXT:    bzhiq %rbp, %r15, %rbp
+; AVX-NEXT:    shldq $62, %rsi, %r15
+; AVX-NEXT:    movq %r15, 88(%rax)
+; AVX-NEXT:    shlq $42, %r13
+; AVX-NEXT:    orq %rbx, %r13
+; AVX-NEXT:    movq %r13, 24(%rax)
+; AVX-NEXT:    movb $53, %bl
+; AVX-NEXT:    bzhiq %rbx, %r12, %r15
+; AVX-NEXT:    shlq $53, %r14
+; AVX-NEXT:    orq %r15, %r14
+; AVX-NEXT:    movq %r14, 8(%rax)
+; AVX-NEXT:    movq %rbp, %r14
+; AVX-NEXT:    shrq $48, %r14
+; AVX-NEXT:    movb %r14b, 102(%rax)
+; AVX-NEXT:    shrq $32, %rbp
+; AVX-NEXT:    movw %bp, 100(%rax)
+; AVX-NEXT:    bzhiq %rbx, %r10, %r10
+; AVX-NEXT:    shldq $9, %rdi, %r10
+; AVX-NEXT:    shlq $62, %rsi
+; AVX-NEXT:    orq %r10, %rsi
+; AVX-NEXT:    movq %rsi, 80(%rax)
+; AVX-NEXT:    shlq $9, %rdi
+; AVX-NEXT:    andl $511, %r11d # imm = 0x1FF
+; AVX-NEXT:    orq %rdi, %r11
+; AVX-NEXT:    movq %r11, 72(%rax)
 ; AVX-NEXT:    shlq $20, %r9
 ; AVX-NEXT:    andl $1048575, %r8d # imm = 0xFFFFF
 ; AVX-NEXT:    orq %r9, %r8
@@ -2671,12 +2717,12 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %al, %dl
 ; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    cmpb %cl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
 ; X86-NEXT:    subb %cl, %dl
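
For readers skimming the regenerated checks: the underlying change is that the generic expansion of the cmp intrinsics now freezes both inputs before emitting the two compares, because each input feeds both setcc nodes. A rough IR-level sketch of what the expanded sequence computes for ucmp follows; the function and value names are illustrative and not taken from the patch:

  define i8 @ucmp_expansion_sketch(i8 %x, i8 %y) {
    ; Freeze first, so both compares observe the same value even when an
    ; input is undef/poison.
    %fx = freeze i8 %x
    %fy = freeze i8 %y
    %gt = icmp ugt i8 %fx, %fy
    %lt = icmp ult i8 %fx, %fy
    %gt.ext = zext i1 %gt to i8
    %lt.ext = zext i1 %lt to i8
    ; 1 if %x > %y, -1 (0xff) if %x < %y, 0 if equal.
    %res = sub i8 %gt.ext, %lt.ext
    ret i8 %res
  }

Without the freezes, an undef input could take a different value in each icmp, allowing %gt and %lt to be true at once and producing a result outside {-1, 0, 1}. That is presumably also why the regenerated checks above and in the ucmp.ll diff below differ mostly in register allocation and compare operand order rather than in structure.
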
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 7f17299b39e33..af2275eda305f 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: ucmp.8.8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
@@ -35,7 +35,7 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
 ; X86-LABEL: ucmp.8.16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    cmpw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
@@ -54,7 +54,7 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: ucmp.8.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
@@ -155,7 +155,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: ucmp.32.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %eax
@@ -245,7 +245,7 @@ define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: ucmp_narrow_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
@@ -278,18 +278,18 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $1073741823, %ecx # imm = 0x3FFFFFFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1073741823, %edx # imm = 0x3FFFFFFF
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -313,9 +313,9 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind {
 ;
 ; X86-LABEL: ucmp_wide_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
@@ -366,26 +366,26 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $8191, %ecx # imm = 0x1FFF
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $8191, %ecx # imm = 0x1FFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $8191, %esi # imm = 0x1FFF
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -470,24 +470,24 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %dl
 ; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bl
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movsbl %bl, %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bl
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movsbl %bl, %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
@@ -611,27 +611,27 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %ch
-; X86-NEXT:    sbbb $0, %ch
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %dl
 ; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movb %dl, 3(%eax)
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    seta %dh
+; X86-NEXT:    sbbb $0, %dh
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    seta %bl
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 3(%eax)
 ; X86-NEXT:    movb %bl, 2(%eax)
-; X86-NEXT:    movb %ch, 1(%eax)
-; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    movb %dh, 1(%eax)
+; X86-NEXT:    movb %dl, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -691,24 +691,24 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb %dl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %dl
 ; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    cmpb %bl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bl
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movsbl %bl, %esi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    cmpb %ch, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %ch
 ; X86-NEXT:    sbbb $0, %ch
 ; X86-NEXT:    movsbl %ch, %edi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    cmpb %cl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
@@ -767,38 +767,44 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ;
 ; SSE2-LABEL: ucmp_wide_vec_result:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    psubd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
 ; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT:    psubd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    pmaxud %xmm1, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT:    pminud %xmm2, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    psubd %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
-; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pmaxud %xmm2, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT:    pminud %xmm5, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm2
 ; SSE2-NEXT:    psubd %xmm6, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    psubd %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; SSE2-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pmaxud %xmm3, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pminud %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    psubd %xmm5, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: ucmp_wide_vec_result:
@@ -812,8 +818,10 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm3
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm0, %ymm1
 ; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-NEXT:    retq
@@ -842,74 +850,74 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    cmpb %cl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    cmpb %bh, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    cmpb %bl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    cmpb %dh, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    cmpb %ch, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    cmpb %ah, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    cmpb %dl, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bl
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bh
 ; X86-NEXT:    sbbb $0, %bh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movsbl %al, %ecx
@@ -1368,72 +1376,72 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %bh
 ; X86-NEXT:    sbbb $0, %bh
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %bl
 ; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %dh
 ; X86-NEXT:    sbbb $0, %dh
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %ch
 ; X86-NEXT:    sbbb $0, %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %dl
 ; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1523,10 +1531,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1554,240 +1562,240 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT:    andl $127, %r10d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE4-NEXT:    andl $127, %ebp
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    andl $127, %ecx
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; SSE4-NEXT:    andl $127, %r8d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT:    andl $127, %ebx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    andl $127, %edx
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; SSE4-NEXT:    andl $127, %r13d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    andl $127, %r11d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    andl $127, %r10d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    andl $127, %edx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT:    andl $127, %r15d
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE4-NEXT:    andl $127, %r14d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    andl $127, %r12d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT:    cmpq %rax, %rbp
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    sbbq %r14, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rbp, %rax
-; SSE4-NEXT:    sbbq %r12, %r14
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq %r11, %r15
-; SSE4-NEXT:    sbbq %r13, %r15
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r11, %r13
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    cmpq %rax, %r11
-; SSE4-NEXT:    movq %rdx, %r14
-; SSE4-NEXT:    sbbq %rbx, %r14
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r11, %rax
-; SSE4-NEXT:    sbbq %rdx, %rbx
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    andl $127, %r11d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT:    andl $127, %ecx
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    cmpq %rax, %rdx
-; SSE4-NEXT:    movq %r8, %r11
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    cmpq %rax, %r12
+; SSE4-NEXT:    movq %rcx, %rbx
+; SSE4-NEXT:    sbbq %r11, %rbx
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    cmpq %r12, %rax
 ; SSE4-NEXT:    sbbq %rcx, %r11
+; SSE4-NEXT:    sbbb $0, %bl
+; SSE4-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT:    cmpq %rax, %rcx
+; SSE4-NEXT:    movq %r14, %r11
+; SSE4-NEXT:    sbbq %r15, %r11
 ; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rdx, %rax
-; SSE4-NEXT:    sbbq %r8, %rcx
+; SSE4-NEXT:    cmpq %rcx, %rax
+; SSE4-NEXT:    sbbq %r14, %r15
 ; SSE4-NEXT:    sbbb $0, %r11b
 ; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdx
-; SSE4-NEXT:    sbbq %r10, %rdx
-; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    movq %rdx, %r11
+; SSE4-NEXT:    sbbq %r10, %r11
+; SSE4-NEXT:    setb %r11b
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r8, %r10
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    sbbq %rdx, %r10
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    movq %r13, %rdx
 ; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    sbbq %r13, %r8
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
+; SSE4-NEXT:    movq %r8, %rdx
+; SSE4-NEXT:    sbbq %rbp, %rdx
 ; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    sbbq %r8, %rbp
 ; SSE4-NEXT:    sbbb $0, %dl
 ; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %bpl
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
 ; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    sbbq %r10, %r8
 ; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %r8
-; SSE4-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT:    sbbq %rbx, %r8
-; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %rbx
-; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rcx, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r8, %rax
+; SSE4-NEXT:    setb %r12b
+; SSE4-NEXT:    cmpq %rdx, %rcx
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    sbbb $0, %r12b
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %rbx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    cmpq %rcx, %r10
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rbx
+; SSE4-NEXT:    movq %r8, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %rdx
+; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    cmpq %r10, %rcx
+; SSE4-NEXT:    sbbq %r8, %rax
+; SSE4-NEXT:    sbbb $0, %bpl
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE4-NEXT:    cmpq %r10, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT:    movq %rdx, %rcx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %rcx
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %r11, %r10
+; SSE4-NEXT:    sbbq %rdx, %rax
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    cmpq %r11, %rbx
+; SSE4-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; SSE4-NEXT:    movq %rcx, %r10
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %r10
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %rbx, %r11
+; SSE4-NEXT:    sbbq %rcx, %rax
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    cmpq %rbx, %r14
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    movq %rcx, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %r11
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    cmpq %r14, %rbx
+; SSE4-NEXT:    sbbq %rcx, %rax
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT:    cmpq %r14, %r15
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    movq %rcx, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %rbx
 ; SSE4-NEXT:    setb %bl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %r8
+; SSE4-NEXT:    cmpq %r15, %r14
+; SSE4-NEXT:    sbbq %rcx, %rax
 ; SSE4-NEXT:    sbbb $0, %bl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE4-NEXT:    movq %r15, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    setb %cl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r15, %r8
-; SSE4-NEXT:    sbbb $0, %cl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT:    cmpq %rax, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r14
+; SSE4-NEXT:    cmpq %r9, %r15
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    movq %rcx, %r14
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %r14
 ; SSE4-NEXT:    setb %r14b
-; SSE4-NEXT:    cmpq %r15, %rax
-; SSE4-NEXT:    sbbq %r12, %r8
+; SSE4-NEXT:    cmpq %r15, %r9
+; SSE4-NEXT:    sbbq %rcx, %rax
 ; SSE4-NEXT:    sbbb $0, %r14b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    cmpq %r9, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rax, %r9
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    cmpq %r12, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT:    movq %r13, %r9
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r9
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    cmpq %rcx, %r15
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT:    movq %rdx, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    sbbq %rax, %r9
 ; SSE4-NEXT:    setb %r9b
-; SSE4-NEXT:    cmpq %rax, %r12
-; SSE4-NEXT:    sbbq %r13, %r8
-; SSE4-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    cmpq %r15, %rcx
+; SSE4-NEXT:    sbbq %rdx, %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; SSE4-NEXT:    sbbb $0, %r9b
-; SSE4-NEXT:    cmpq %rsi, %r12
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdi
+; SSE4-NEXT:    cmpq %rsi, %r13
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    movq %rcx, %r15
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rdi
-; SSE4-NEXT:    setb %dil
-; SSE4-NEXT:    cmpq %r12, %rsi
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    sbbq %rax, %r15
+; SSE4-NEXT:    setb %r15b
+; SSE4-NEXT:    cmpq %r13, %rsi
+; SSE4-NEXT:    sbbq %rcx, %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT:    sbbb $0, %dil
-; SSE4-NEXT:    cmpq %r12, %r13
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rsi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rsi
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    sbbb $0, %r15b
+; SSE4-NEXT:    cmpq %r13, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT:    movq %rdx, %rsi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    sbbq %rcx, %rsi
 ; SSE4-NEXT:    setb %sil
-; SSE4-NEXT:    cmpq %r13, %r12
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm1
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm2
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm3
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm4
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm5
-; SSE4-NEXT:    movzbl %r10b, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm6
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r10d, %xmm7
-; SSE4-NEXT:    movzbl %bpl, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm0
-; SSE4-NEXT:    movzbl %dl, %edx
-; SSE4-NEXT:    movd %edx, %xmm8
-; SSE4-NEXT:    movzbl %r11b, %edx
-; SSE4-NEXT:    movd %edx, %xmm9
-; SSE4-NEXT:    movzbl %bl, %edx
-; SSE4-NEXT:    movd %edx, %xmm10
-; SSE4-NEXT:    movzbl %cl, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm11
-; SSE4-NEXT:    movzbl %r14b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm12
-; SSE4-NEXT:    movzbl %r15b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm13
-; SSE4-NEXT:    movzbl %r9b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm14
-; SSE4-NEXT:    movzbl %dil, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm15
+; SSE4-NEXT:    cmpq %rax, %r13
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm1
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm2
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm3
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm4
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm5
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm6
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm7
+; SSE4-NEXT:    movzbl %r12b, %eax
+; SSE4-NEXT:    movd %eax, %xmm0
+; SSE4-NEXT:    movzbl %bpl, %eax
+; SSE4-NEXT:    movd %eax, %xmm8
+; SSE4-NEXT:    movzbl %r8b, %eax
+; SSE4-NEXT:    movd %eax, %xmm9
+; SSE4-NEXT:    movzbl %r10b, %eax
+; SSE4-NEXT:    movd %eax, %xmm10
+; SSE4-NEXT:    movzbl %r11b, %eax
+; SSE4-NEXT:    movd %eax, %xmm11
+; SSE4-NEXT:    movzbl %bl, %eax
+; SSE4-NEXT:    movd %eax, %xmm12
+; SSE4-NEXT:    movzbl %r14b, %eax
+; SSE4-NEXT:    movd %eax, %xmm13
+; SSE4-NEXT:    movzbl %r9b, %eax
+; SSE4-NEXT:    movd %eax, %xmm14
+; SSE4-NEXT:    movzbl %r15b, %eax
+; SSE4-NEXT:    movd %eax, %xmm15
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1802,76 +1810,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
 ; SSE4-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT:    sbbq %r8, %rax
+; SSE4-NEXT:    sbbq %rdx, %rcx
 ; SSE4-NEXT:    sbbb $0, %sil
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE4-NEXT:    movzbl %sil, %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    movb %cl, 4(%rax)
+; SSE4-NEXT:    movzbl %sil, %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    movb %al, 4(%rdi)
 ; SSE4-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    leaq (%rcx,%rax,4), %rax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $4, %ecx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $6, %eax
+; SSE4-NEXT:    orq %rcx, %rax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $8, %ecx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $10, %eax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    leaq (%rdx,%rcx,4), %rcx
+; SSE4-NEXT:    shll $12, %edx
+; SSE4-NEXT:    orq %rax, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shll $14, %esi
+; SSE4-NEXT:    orq %rdx, %rsi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $16, %eax
+; SSE4-NEXT:    orq %rsi, %rax
+; SSE4-NEXT:    orq %rcx, %rax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $18, %ecx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $4, %edx
+; SSE4-NEXT:    shll $20, %edx
 ; SSE4-NEXT:    orq %rcx, %rdx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $6, %ecx
+; SSE4-NEXT:    shll $22, %ecx
 ; SSE4-NEXT:    orq %rdx, %rcx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $8, %edx
+; SSE4-NEXT:    shll $24, %edx
 ; SSE4-NEXT:    orq %rcx, %rdx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $10, %ecx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $12, %esi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT:    andl $3, %edi
-; SSE4-NEXT:    shll $14, %edi
-; SSE4-NEXT:    orq %rsi, %rdi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $16, %ecx
-; SSE4-NEXT:    orq %rdi, %rcx
+; SSE4-NEXT:    shlq $26, %rcx
 ; SSE4-NEXT:    orq %rdx, %rcx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shlq $28, %rax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $18, %edx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $20, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $22, %edx
-; SSE4-NEXT:    orq %rsi, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $24, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shlq $26, %rdx
-; SSE4-NEXT:    orq %rsi, %rdx
+; SSE4-NEXT:    shlq $30, %rdx
+; SSE4-NEXT:    orq %rax, %rdx
 ; SSE4-NEXT:    orq %rcx, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shlq $28, %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shlq $30, %rsi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movl %esi, (%rax)
+; SSE4-NEXT:    movl %edx, (%rdi)
+; SSE4-NEXT:    movq %rdi, %rax
 ; SSE4-NEXT:    addq $120, %rsp
 ; SSE4-NEXT:    popq %rbx
 ; SSE4-NEXT:    popq %r12
@@ -1961,88 +1969,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    andl $127, %eax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    andl $127, %ecx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    andl $127, %r11d
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    andl $127, %eax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    andl $127, %ebx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT:    andl $127, %ebp
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; SSE2-NEXT:    andl $127, %edx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; SSE2-NEXT:    andl $127, %r10d
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE2-NEXT:    andl $127, %r14d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    andl $127, %ebp
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT:    andl $127, %r13d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT:    andl $127, %r11d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT:    andl $127, %r8d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT:    andl $127, %ebx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; SSE2-NEXT:    andl $127, %r15d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    cmpq %r13, %rcx
+; SSE2-NEXT:    movq %r15, %r12
+; SSE2-NEXT:    sbbq %rbx, %r12
+; SSE2-NEXT:    setb %r12b
+; SSE2-NEXT:    cmpq %rcx, %r13
+; SSE2-NEXT:    sbbq %r15, %rbx
+; SSE2-NEXT:    sbbb $0, %r12b
+; SSE2-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT:    cmpq %rcx, %rbx
+; SSE2-NEXT:    movq %rax, %r15
+; SSE2-NEXT:    sbbq %r8, %r15
+; SSE2-NEXT:    setb %r15b
+; SSE2-NEXT:    cmpq %rbx, %rcx
+; SSE2-NEXT:    sbbq %rax, %r8
+; SSE2-NEXT:    sbbb $0, %r15b
+; SSE2-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT:    cmpq %rax, %r12
-; SSE2-NEXT:    movq %r15, %r8
-; SSE2-NEXT:    sbbq %r11, %r8
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    movq %r14, %r8
+; SSE2-NEXT:    sbbq %r10, %r8
 ; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %r12, %rax
-; SSE2-NEXT:    sbbq %r15, %r11
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    sbbq %r14, %r10
 ; SSE2-NEXT:    sbbb $0, %r8b
 ; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %r13, %r11
-; SSE2-NEXT:    sbbq %rbp, %r11
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r13, %rbp
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %r14, %r11
-; SSE2-NEXT:    sbbq %r10, %r11
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r14, %r10
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %rdx, %r10
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %rdx, %rbx
-; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    cmpq %rax, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %r8
-; SSE2-NEXT:    sbbq %rcx, %r8
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    movq %rdx, %r8
+; SSE2-NEXT:    sbbq %rbp, %r8
 ; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %rdx, %rax
-; SSE2-NEXT:    sbbq %r10, %rcx
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    sbbq %rdx, %rbp
 ; SSE2-NEXT:    sbbb $0, %r8b
 ; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r8, %rdx
+; SSE2-NEXT:    movq %r8, %rdx
+; SSE2-NEXT:    sbbq %r11, %rdx
 ; SSE2-NEXT:    setb %dl
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    sbbq %r8, %r11
 ; SSE2-NEXT:    sbbb $0, %dl
 ; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -2058,117 +2054,129 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    sbbb $0, %dl
 ; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT:    movq %r11, %rdx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r11, %r10
-; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movq %r10, %rcx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r8, %rcx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    sbbb $0, %cl
+; SSE2-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    movq %r10, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rcx, %r8
+; SSE2-NEXT:    setb %r12b
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r10, %rcx
+; SSE2-NEXT:    sbbb $0, %r12b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %cl
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT:    cmpq %rax, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; SSE2-NEXT:    sbbq %r10, %rdx
 ; SSE2-NEXT:    setb %dl
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
+; SSE2-NEXT:    cmpq %r8, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
 ; SSE2-NEXT:    sbbb $0, %dl
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %rax, %r10
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    movq %rbx, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r11, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %r10, %rax
+; SSE2-NEXT:    sbbq %rbx, %r11
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    cmpq %rax, %r11
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; SSE2-NEXT:    movq %r14, %r10
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; SSE2-NEXT:    sbbq %rbx, %r10
 ; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    cmpq %r11, %rax
 ; SSE2-NEXT:    sbbq %r14, %rbx
 ; SSE2-NEXT:    sbbb $0, %r10b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    cmpq %rax, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %rcx
-; SSE2-NEXT:    setb %cl
-; SSE2-NEXT:    cmpq %rbx, %rax
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    sbbb $0, %cl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE2-NEXT:    cmpq %rax, %r14
-; SSE2-NEXT:    movq (%rsp), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %rbx
-; SSE2-NEXT:    setb %bl
+; SSE2-NEXT:    movq (%rsp), %r15 # 8-byte Reload
+; SSE2-NEXT:    movq %r15, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rbx, %r11
+; SSE2-NEXT:    setb %r11b
 ; SSE2-NEXT:    cmpq %r14, %rax
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %bl
+; SSE2-NEXT:    sbbq %r15, %rbx
+; SSE2-NEXT:    sbbb $0, %r11b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    cmpq %r9, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %r14
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %r14
+; SSE2-NEXT:    movq %r15, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rbx, %r14
 ; SSE2-NEXT:    setb %bpl
 ; SSE2-NEXT:    cmpq %rax, %r9
-; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    sbbq %r15, %rbx
 ; SSE2-NEXT:    sbbb $0, %bpl
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    cmpq %rsi, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %r9
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %r9
+; SSE2-NEXT:    movq %r14, %r9
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rbx, %r9
 ; SSE2-NEXT:    setb %r9b
 ; SSE2-NEXT:    cmpq %rax, %rsi
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    sbbq %r14, %rbx
+; SSE2-NEXT:    movq %rdi, %rbx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE2-NEXT:    sbbb $0, %r9b
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; SSE2-NEXT:    cmpq %r15, %rsi
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rdi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    movq %rax, %rdi
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; SSE2-NEXT:    sbbq %r14, %rdi
 ; SSE2-NEXT:    setb %dil
 ; SSE2-NEXT:    cmpq %rsi, %r15
-; SSE2-NEXT:    sbbq %r12, %r14
+; SSE2-NEXT:    sbbq %rax, %r14
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE2-NEXT:    sbbb $0, %dil
 ; SSE2-NEXT:    cmpq %rsi, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    movq %rax, %r15
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT:    movq %r13, %r15
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    sbbq %r13, %r15
 ; SSE2-NEXT:    setb %r15b
 ; SSE2-NEXT:    cmpq %r14, %rsi
-; SSE2-NEXT:    sbbq %r13, %r12
+; SSE2-NEXT:    sbbq %rax, %r13
 ; SSE2-NEXT:    sbbb $0, %r15b
 ; SSE2-NEXT:    movzbl %r15b, %esi
 ; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    movb %sil, 4(%rax)
+; SSE2-NEXT:    movb %sil, 4(%rbx)
 ; SSE2-NEXT:    movzbl %dil, %esi
 ; SSE2-NEXT:    movzbl %r9b, %edi
 ; SSE2-NEXT:    andl $3, %esi
@@ -2178,59 +2186,60 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    andl $3, %edi
 ; SSE2-NEXT:    shll $4, %edi
 ; SSE2-NEXT:    orq %rsi, %rdi
-; SSE2-NEXT:    movzbl %bl, %r9d
+; SSE2-NEXT:    movzbl %r11b, %r9d
 ; SSE2-NEXT:    andl $3, %r9d
 ; SSE2-NEXT:    shll $6, %r9d
 ; SSE2-NEXT:    orq %rdi, %r9
-; SSE2-NEXT:    movzbl %cl, %esi
+; SSE2-NEXT:    movzbl %r10b, %esi
 ; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    shll $8, %esi
 ; SSE2-NEXT:    orq %r9, %rsi
-; SSE2-NEXT:    movzbl %dl, %ecx
-; SSE2-NEXT:    movzbl %r10b, %edx
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    movzbl %r11b, %edx
+; SSE2-NEXT:    movzbl %dl, %edx
+; SSE2-NEXT:    movzbl %r8b, %edi
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $10, %edi
 ; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orq %rcx, %rdx
-; SSE2-NEXT:    movzbl %r8b, %ecx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    orq %rsi, %rcx
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $18, %esi
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    movzbl %cl, %edi
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $14, %edi
+; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    movzbl %r12b, %edx
 ; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $20, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    orq %rdi, %rdx
 ; SSE2-NEXT:    orq %rsi, %rdx
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    shll $18, %ecx
 ; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $22, %esi
-; SSE2-NEXT:    orq %rdx, %rsi
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $24, %edx
-; SSE2-NEXT:    orq %rsi, %rdx
+; SSE2-NEXT:    shll $20, %esi
+; SSE2-NEXT:    orq %rcx, %rsi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    shll $22, %ecx
+; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
 ; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shlq $26, %rsi
-; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    shll $24, %esi
 ; SSE2-NEXT:    orq %rcx, %rsi
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shlq $28, %rdx
 ; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shlq $30, %rcx
-; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    shlq $26, %rcx
 ; SSE2-NEXT:    orq %rsi, %rcx
-; SSE2-NEXT:    movl %ecx, (%rax)
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shlq $28, %rsi
+; SSE2-NEXT:    andl $3, %edx
+; SSE2-NEXT:    shlq $30, %rdx
+; SSE2-NEXT:    orq %rsi, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    movl %edx, (%rbx)
+; SSE2-NEXT:    movq %rbx, %rax
 ; SSE2-NEXT:    addq $88, %rsp
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
@@ -2333,34 +2342,34 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    andl $127, %r14d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX2-NEXT:    andl $127, %edx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT:    andl $127, %ebp
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    andl $127, %ebx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    andl $127, %r11d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX2-NEXT:    andl $127, %r8d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    andl $127, %r12d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX2-NEXT:    andl $127, %r13d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rbx, %r11
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT:    cmpq %r12, %rbp
 ; AVX2-NEXT:    movq %r13, %r10
-; AVX2-NEXT:    sbbq %r12, %r10
+; AVX2-NEXT:    sbbq %r8, %r10
 ; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rbx
-; AVX2-NEXT:    sbbq %r13, %r12
+; AVX2-NEXT:    cmpq %rbp, %r12
+; AVX2-NEXT:    sbbq %r13, %r8
 ; AVX2-NEXT:    sbbb $0, %r10b
 ; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %r10, %r11
-; AVX2-NEXT:    movq %r8, %rbx
-; AVX2-NEXT:    sbbq %rbp, %rbx
-; AVX2-NEXT:    setb %bl
-; AVX2-NEXT:    cmpq %r11, %r10
-; AVX2-NEXT:    sbbq %r8, %rbp
-; AVX2-NEXT:    sbbb $0, %bl
-; AVX2-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    cmpq %r8, %r10
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    sbbq %rbx, %r12
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    cmpq %r10, %r8
+; AVX2-NEXT:    sbbq %r11, %rbx
+; AVX2-NEXT:    sbbb $0, %bpl
+; AVX2-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT:    cmpq %r8, %r10
@@ -2431,13 +2440,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    movq %rbx, %r10
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    sbbq %r11, %r10
 ; AVX2-NEXT:    setb %r8b
 ; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbq %rbx, %r11
 ; AVX2-NEXT:    sbbb $0, %r8b
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
@@ -2451,81 +2460,81 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    sbbq %rbx, %r11
 ; AVX2-NEXT:    sbbb $0, %dl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rax, %r11
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmpq %rax, %rbx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; AVX2-NEXT:    movq %r14, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    sbbq %rbx, %r10
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %r10
 ; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rax
-; AVX2-NEXT:    sbbq %r14, %rbx
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    cmpq %rax, %rbx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %r11
-; AVX2-NEXT:    movq (%rsp), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %r11
-; AVX2-NEXT:    setb %r11b
 ; AVX2-NEXT:    cmpq %rbx, %rax
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    sbbb $0, %r11b
+; AVX2-NEXT:    sbbq %r14, %r11
+; AVX2-NEXT:    sbbb $0, %r10b
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX2-NEXT:    cmpq %rax, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %rbx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %rbx
+; AVX2-NEXT:    movq %r15, %rbx
+; AVX2-NEXT:    movq (%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %rbx
 ; AVX2-NEXT:    setb %bl
 ; AVX2-NEXT:    cmpq %r14, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    sbbq %r15, %r11
 ; AVX2-NEXT:    sbbb $0, %bl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %r9, %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    cmpq %rax, %r14
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    movq %r13, %r15
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %r15
 ; AVX2-NEXT:    setb %bpl
-; AVX2-NEXT:    cmpq %rax, %r9
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    cmpq %r14, %rax
+; AVX2-NEXT:    sbbq %r13, %r11
 ; AVX2-NEXT:    sbbb $0, %bpl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    cmpq %r9, %rax
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    movq %r15, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %r14
+; AVX2-NEXT:    setb %r14b
+; AVX2-NEXT:    cmpq %rax, %r9
+; AVX2-NEXT:    sbbq %r15, %r11
+; AVX2-NEXT:    sbbb $0, %r14b
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    cmpq %rsi, %rax
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; AVX2-NEXT:    movq %r15, %r9
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %r9
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %r9
 ; AVX2-NEXT:    setb %r9b
 ; AVX2-NEXT:    cmpq %rax, %rsi
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    sbbq %r15, %r11
 ; AVX2-NEXT:    sbbb $0, %r9b
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    cmpq %rcx, %rax
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %rsi
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %rsi
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %rsi
+; AVX2-NEXT:    sbbq %r15, %rsi
 ; AVX2-NEXT:    setb %sil
 ; AVX2-NEXT:    cmpq %rax, %rcx
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    sbbq %r11, %r15
 ; AVX2-NEXT:    sbbb $0, %sil
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX2-NEXT:    cmpq %rax, %rcx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %r15
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    setb %r14b
-; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    setb %r15b
+; AVX2-NEXT:    cmpq %rcx, %rax
+; AVX2-NEXT:    sbbq %r11, %r13
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    sbbb $0, %r14b
-; AVX2-NEXT:    movzbl %r14b, %ecx
+; AVX2-NEXT:    sbbb $0, %r15b
+; AVX2-NEXT:    movzbl %r15b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    movb %cl, 4(%rdi)
 ; AVX2-NEXT:    movzbl %sil, %ecx
@@ -2533,15 +2542,15 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movzbl %r9b, %esi
 ; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    leaq (%rsi,%rcx,4), %rcx
-; AVX2-NEXT:    movzbl %bpl, %esi
+; AVX2-NEXT:    movzbl %r14b, %esi
 ; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    shll $4, %esi
 ; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    movzbl %bl, %ecx
+; AVX2-NEXT:    movzbl %bpl, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    shll $6, %ecx
 ; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    movzbl %r11b, %esi
+; AVX2-NEXT:    movzbl %bl, %esi
 ; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    shll $8, %esi
 ; AVX2-NEXT:    orq %rcx, %rsi
@@ -2676,18 +2685,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    andl $127, %eax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT:    andl $127, %ebp
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT:    andl $127, %r12d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX512-NEXT:    andl $127, %r13d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512-NEXT:    andl $127, %ebp
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; AVX512-NEXT:    andl $127, %r15d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT:    andl $127, %r12d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    andl $127, %r10d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT:    andl $127, %ebx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT:    andl $127, %r14d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX512-NEXT:    andl $127, %r8d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
@@ -2700,13 +2709,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    andl $127, %eax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    andl $127, %edx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    cmpq %r14, %r11
+; AVX512-NEXT:    cmpq %rbx, %r11
 ; AVX512-NEXT:    movq %rdx, %rcx
 ; AVX512-NEXT:    sbbq %rax, %rcx
 ; AVX512-NEXT:    setb %cl
-; AVX512-NEXT:    cmpq %r11, %r14
+; AVX512-NEXT:    cmpq %r11, %rbx
 ; AVX512-NEXT:    sbbq %rdx, %rax
 ; AVX512-NEXT:    sbbb $0, %cl
 ; AVX512-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -2733,31 +2742,31 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %rbx, %rdx
+; AVX512-NEXT:    movq %r14, %rdx
 ; AVX512-NEXT:    sbbq %r10, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rbx, %r10
+; AVX512-NEXT:    sbbq %r14, %r10
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r15, %rdx
-; AVX512-NEXT:    sbbq %r13, %rdx
+; AVX512-NEXT:    movq %r12, %rdx
+; AVX512-NEXT:    sbbq %r15, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r15, %r13
+; AVX512-NEXT:    sbbq %r12, %r15
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r12, %rdx
-; AVX512-NEXT:    sbbq %rbp, %rdx
+; AVX512-NEXT:    movq %rbp, %rdx
+; AVX512-NEXT:    sbbq %r13, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r12, %rbp
+; AVX512-NEXT:    sbbq %rbp, %r13
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -2767,21 +2776,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rdx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %r13b
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r13b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    setb %r15b
 ; AVX512-NEXT:    cmpq %rcx, %rax
 ; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %bpl
+; AVX512-NEXT:    sbbb $0, %r15b
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    cmpq %rcx, %rdx
@@ -2789,10 +2787,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rax
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rsi, %rax
-; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    setb %bl
 ; AVX512-NEXT:    cmpq %rdx, %rcx
 ; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    sbbb $0, %bl
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX512-NEXT:    cmpq %rdx, %rsi
@@ -2818,107 +2816,118 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX512-NEXT:    cmpq %rdi, %r8
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT:    movq %r10, %rsi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT:    movq %r9, %rsi
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rax, %rsi
 ; AVX512-NEXT:    setb %sil
 ; AVX512-NEXT:    cmpq %r8, %rdi
-; AVX512-NEXT:    sbbq %r10, %rax
+; AVX512-NEXT:    sbbq %r9, %rax
 ; AVX512-NEXT:    sbbb $0, %sil
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    cmpq %r8, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT:    movq %r11, %rdi
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT:    cmpq %r8, %r9
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    movq %r10, %rdi
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rax, %rdi
 ; AVX512-NEXT:    setb %dil
-; AVX512-NEXT:    cmpq %r10, %r8
-; AVX512-NEXT:    sbbq %r11, %rax
+; AVX512-NEXT:    cmpq %r9, %r8
+; AVX512-NEXT:    sbbq %r10, %rax
 ; AVX512-NEXT:    sbbb $0, %dil
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    cmpq %r9, %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT:    movq %r11, %r8
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %r8
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    cmpq %r10, %r9
+; AVX512-NEXT:    sbbq %r11, %rax
+; AVX512-NEXT:    sbbb $0, %r8b
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    cmpq %rax, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    movq %rbx, %r8
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    movq %r14, %r9
 ; AVX512-NEXT:    movq (%rsp), %r11 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r11, %r8
-; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    sbbq %r11, %r9
+; AVX512-NEXT:    setb %r9b
 ; AVX512-NEXT:    cmpq %r10, %rax
-; AVX512-NEXT:    sbbq %rbx, %r11
-; AVX512-NEXT:    sbbb $0, %r8b
+; AVX512-NEXT:    sbbq %r14, %r11
+; AVX512-NEXT:    sbbb $0, %r9b
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    cmpq %rbx, %r11
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movq %r14, %r10
+; AVX512-NEXT:    cmpq %r14, %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT:    movq %r12, %r10
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rax, %r10
 ; AVX512-NEXT:    setb %r10b
-; AVX512-NEXT:    cmpq %r11, %rbx
-; AVX512-NEXT:    sbbq %r14, %rax
+; AVX512-NEXT:    cmpq %r11, %r14
+; AVX512-NEXT:    sbbq %r12, %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512-NEXT:    sbbb $0, %r10b
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT:    cmpq %r15, %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT:    cmpq %r13, %r11
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r14, %rbx
-; AVX512-NEXT:    setb %bl
-; AVX512-NEXT:    cmpq %r11, %r15
+; AVX512-NEXT:    movq %rax, %r14
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r12, %r14
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    cmpq %r11, %r13
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbq %rax, %r14
+; AVX512-NEXT:    sbbq %rax, %r12
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    sbbb $0, %bl
+; AVX512-NEXT:    sbbb $0, %bpl
 ; AVX512-NEXT:    cmpq %r11, %r14
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %r15
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r12, %r15
-; AVX512-NEXT:    setb %r15b
+; AVX512-NEXT:    movq %rax, %r12
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r13, %r12
+; AVX512-NEXT:    setb %r12b
 ; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    sbbq %rax, %r12
-; AVX512-NEXT:    sbbb $0, %r15b
-; AVX512-NEXT:    movzbl %r15b, %r11d
+; AVX512-NEXT:    sbbq %rax, %r13
+; AVX512-NEXT:    sbbb $0, %r12b
+; AVX512-NEXT:    movzbl %r12b, %r11d
 ; AVX512-NEXT:    andl $3, %r11d
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; AVX512-NEXT:    movb %r11b, 4(%r14)
-; AVX512-NEXT:    movzbl %bl, %r11d
+; AVX512-NEXT:    movzbl %bpl, %r11d
 ; AVX512-NEXT:    andl $3, %r11d
 ; AVX512-NEXT:    movzbl %r10b, %r10d
 ; AVX512-NEXT:    andl $3, %r10d
 ; AVX512-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512-NEXT:    movzbl %r9b, %r9d
+; AVX512-NEXT:    andl $3, %r9d
+; AVX512-NEXT:    shll $4, %r9d
+; AVX512-NEXT:    orq %r10, %r9
 ; AVX512-NEXT:    movzbl %r8b, %r8d
 ; AVX512-NEXT:    andl $3, %r8d
-; AVX512-NEXT:    shll $4, %r8d
-; AVX512-NEXT:    orq %r10, %r8
+; AVX512-NEXT:    shll $6, %r8d
+; AVX512-NEXT:    orq %r9, %r8
 ; AVX512-NEXT:    movzbl %dil, %edi
 ; AVX512-NEXT:    andl $3, %edi
-; AVX512-NEXT:    shll $6, %edi
+; AVX512-NEXT:    shll $8, %edi
 ; AVX512-NEXT:    orq %r8, %rdi
 ; AVX512-NEXT:    movzbl %sil, %esi
 ; AVX512-NEXT:    andl $3, %esi
-; AVX512-NEXT:    shll $8, %esi
-; AVX512-NEXT:    orq %rdi, %rsi
+; AVX512-NEXT:    shll $10, %esi
 ; AVX512-NEXT:    movzbl %dl, %edx
 ; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $10, %edx
+; AVX512-NEXT:    shll $12, %edx
+; AVX512-NEXT:    orq %rsi, %rdx
 ; AVX512-NEXT:    movzbl %cl, %ecx
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shll $12, %ecx
+; AVX512-NEXT:    shll $14, %ecx
 ; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    movzbl %r9b, %edx
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $14, %edx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl %bpl, %eax
+; AVX512-NEXT:    movzbl %bl, %eax
 ; AVX512-NEXT:    andl $3, %eax
 ; AVX512-NEXT:    shll $16, %eax
-; AVX512-NEXT:    orq %rdx, %rax
-; AVX512-NEXT:    orq %rsi, %rax
-; AVX512-NEXT:    movzbl %r13b, %ecx
+; AVX512-NEXT:    orq %rcx, %rax
+; AVX512-NEXT:    orq %rdi, %rax
+; AVX512-NEXT:    movzbl %r15b, %ecx
 ; AVX512-NEXT:    andl $3, %ecx
 ; AVX512-NEXT:    shll $18, %ecx
 ; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
@@ -2963,7 +2972,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
+; X86-NEXT:    subl $128, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $127, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3054,31 +3063,47 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $127, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl $127, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $127, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
@@ -3091,6 +3116,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
@@ -3243,26 +3269,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -3387,8 +3393,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3398,7 +3404,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %eax
@@ -3469,7 +3475,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl %edx, (%edi)
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    addl $128, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

>From ff2c357952c7f7824773bb3ecf37b59a630063d8 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Mon, 22 Sep 2025 11:13:03 -0400
Subject: [PATCH 2/2] Fix mitigations on PowerPC by adding custom UCMP
 lowering anyway

---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 33 +++++++++
 llvm/lib/Target/PowerPC/PPCISelLowering.h   |  1 +
 llvm/test/CodeGen/PowerPC/ucmp.ll           | 78 ++++++++-------------
 3 files changed, 62 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2907303874de5..932c224561033 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -585,6 +585,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // We cannot sextinreg(i1).  Expand to shifts.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
+  // Custom-lower unsigned 3-way compare (ISD::UCMP); i64 only on PPC64.
+  setOperationAction(ISD::UCMP, MVT::i32, Custom);
+  setOperationAction(ISD::UCMP, MVT::i64, isPPC64 ? Custom : Expand);
+
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
   // support continuation, user-level threading, and etc.. As a result, no
@@ -12618,6 +12622,33 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues({Sub, OverflowTrunc}, dl);
 }
 
+// Lower unsigned 3-way compare producing -1/0/1.
+SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue A = DAG.getFreeze(Op.getOperand(0));
+  SDValue B = DAG.getFreeze(Op.getOperand(1));
+  EVT OpVT = A.getValueType();   // operand type
+  EVT ResVT = Op.getValueType(); // result type
+
+  // First compute diff = A - B (will become subf).
+  SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, A, B);
+
+  // Generate B - A using SUBC to capture the carry (CA0 = 1 iff B >= A,
+  // i.e. no borrow).
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  SDValue SubC = DAG.getNode(PPCISD::SUBC, DL, VTs, B, A);
+  SDValue CA0 = SubC.getValue(1);
+
+  // t2 = A - B - 1 + CA0 using SUBE.
+  SDValue SubE1 = DAG.getNode(PPCISD::SUBE, DL, VTs, A, B, CA0);
+  SDValue CA1 = SubE1.getValue(1);
+
+  // res = diff - t2 - 1 + CA1 using SUBE (produces the desired -1/0/1).
+  SDValue ResPair = DAG.getNode(PPCISD::SUBE, DL, VTs, Diff, SubE1, CA1);
+
+  // Extract the first result and sign-extend or truncate it to the result
+  // type.
+  return DAG.getSExtOrTrunc(ResPair.getValue(0), DL, ResVT);
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -12722,6 +12753,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UADDO_CARRY:
   case ISD::USUBO_CARRY:
     return LowerADDSUBO_CARRY(Op, DAG);
+  case ISD::UCMP:
+    return LowerUCMP(Op, DAG);
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 669430550f4e6..b82533fac2eb8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1343,6 +1343,7 @@ namespace llvm {
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUCMP(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll
index 22faf9cbd9c24..4d393dd00e3db 100644
--- a/llvm/test/CodeGen/PowerPC/ucmp.ll
+++ b/llvm/test/CodeGen/PowerPC/ucmp.ll
@@ -4,14 +4,10 @@
 define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp_8_8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    clrldi 5, 4, 32
-; CHECK-NEXT:    clrldi 6, 3, 32
-; CHECK-NEXT:    sub 5, 5, 6
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldic 3, 3, 0, 32
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
   ret i8 %1
@@ -20,14 +16,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; CHECK-LABEL: ucmp_8_16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    clrldi 5, 4, 32
-; CHECK-NEXT:    clrldi 6, 3, 32
-; CHECK-NEXT:    sub 5, 5, 6
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldic 3, 3, 0, 32
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
   ret i8 %1
@@ -36,14 +28,10 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
 define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ucmp_8_32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    clrldi 5, 4, 32
-; CHECK-NEXT:    clrldi 6, 3, 32
-; CHECK-NEXT:    sub 5, 5, 6
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldic 3, 3, 0, 32
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
   ret i8 %1
@@ -52,12 +40,10 @@ define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
 define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: ucmp_8_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpld 3, 4
-; CHECK-NEXT:    subc 3, 4, 3
-; CHECK-NEXT:    subfe 3, 4, 4
-; CHECK-NEXT:    li 4, -1
-; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
   ret i8 %1
@@ -86,14 +72,10 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
 define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ucmp_32_32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    clrldi 5, 4, 32
-; CHECK-NEXT:    clrldi 6, 3, 32
-; CHECK-NEXT:    sub 5, 5, 6
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    rldic 3, 3, 0, 32
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -102,12 +84,10 @@ define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
 define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: ucmp_32_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpld 3, 4
-; CHECK-NEXT:    subc 3, 4, 3
-; CHECK-NEXT:    subfe 3, 4, 4
-; CHECK-NEXT:    li 4, -1
-; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
   ret i32 %1
@@ -116,12 +96,10 @@ define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
 define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: ucmp_64_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subc 5, 4, 3
-; CHECK-NEXT:    cmpld 3, 4
-; CHECK-NEXT:    li 3, -1
-; CHECK-NEXT:    subfe 5, 4, 4
-; CHECK-NEXT:    neg 5, 5
-; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    subc 6, 4, 3
+; CHECK-NEXT:    sub 5, 3, 4
+; CHECK-NEXT:    subfe 3, 4, 3
+; CHECK-NEXT:    subfe 3, 3, 5
 ; CHECK-NEXT:    blr
   %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
   ret i64 %1

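For reference, the CA-chain identity that LowerUCMP relies on can be checked
with a small C model (illustrative only; the helper name ucmp64_model and the
explicit carry modelling are mine, not part of the patch -- on the real target
the carries live in XER[CA]):

  #include <assert.h>
  #include <stdint.h>

  /* Branch-free sign(a - b) for unsigned a, b, mirroring the
     subf/subc/subfe sequence. PowerPC computes x - y as x + ~y + 1
     and sets CA = 1 exactly when there is no borrow (x >= y). */
  static int64_t ucmp64_model(uint64_t a, uint64_t b) {
    uint64_t diff = a - b;        /* subf : plain a - b              */
    unsigned ca0 = (b >= a);      /* subc b - a : CA0 = no borrow    */

    /* subfe(a, b, CA0) : t = a + ~b + CA0 == a - b - 1 + CA0 */
    uint64_t s = a + ~b;
    unsigned ca1 = (s < a);       /* carry out of a + ~b             */
    uint64_t t = s + ca0;
    ca1 |= (t < s);               /* carry out of adding CA0         */

    /* subfe(diff, t, CA1) : diff - t - 1 + CA1 == -1, 0 or 1 */
    return (int64_t)(diff + ~t + ca1);
  }

  int main(void) {
    assert(ucmp64_model(1, 2) == -1);
    assert(ucmp64_model(2, 2) == 0);
    assert(ucmp64_model(3, 2) == 1);
    assert(ucmp64_model(0, UINT64_MAX) == -1);
    assert(ucmp64_model(UINT64_MAX, 0) == 1);
    return 0;
  }

The three-way result falls out of two dependent subfe's with no compare, no
isel and no materialized -1, which is what the updated CHECK lines above show.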

