[llvm] e094abd - [SelectionDAG] Expand [US]CMP using arithmetic on boolean values instead of selects (#98774)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 16 12:56:23 PDT 2024


Author: Volodymyr Vasylkun
Date: 2024-07-16T20:56:18+01:00
New Revision: e094abde42634e38cda85a6024792f681fc58f32

URL: https://github.com/llvm/llvm-project/commit/e094abde42634e38cda85a6024792f681fc58f32
DIFF: https://github.com/llvm/llvm-project/commit/e094abde42634e38cda85a6024792f681fc58f32.diff

LOG: [SelectionDAG] Expand [US]CMP using arithmetic on boolean values instead of selects (#98774)

The previous expansion of [US]CMP was done using two selects and two
compares. It produced decent code, but on many platforms it is better to
implement [US]CMP nodes by performing the following operation:

```
[us]cmp(x, y) = (x [us]> y) - (x [us]< y)
```

This patch adds this new expansion, as well as a hook in TargetLowering that allows targets to keep using the select-based approach. AArch64 and SystemZ are currently the only targets that prefer the select-based expansion, but other targets may also opt into it if it produces better codegen for them.
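
For a concrete picture, here is a minimal C++ sketch of the two scalar expansions being compared. This is illustrative only, not the SelectionDAG implementation, and the function names are made up:

```cpp
// Standalone sketch of the two ways an [US]CMP of two i32 values can be
// expanded to an i8 result; names are illustrative only.
#include <cstdint>

// Select-based expansion: two compares feeding two selects.
static int8_t scmp_selects(int32_t X, int32_t Y) {
  int8_t ZeroOrOne = (X > Y) ? 1 : 0; // first select
  return (X < Y) ? -1 : ZeroOrOne;    // second select
}

// Arithmetic expansion added by this patch: subtract the two boolean results.
static int8_t scmp_arith(int32_t X, int32_t Y) {
  return static_cast<int8_t>((X > Y) - (X < Y)); // (x s> y) - (x s< y)
}
```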

Added: 
    llvm/test/CodeGen/ARM/scmp.ll
    llvm/test/CodeGen/ARM/ucmp.ll
    llvm/test/CodeGen/LoongArch/scmp.ll
    llvm/test/CodeGen/LoongArch/ucmp.ll
    llvm/test/CodeGen/PowerPC/scmp.ll
    llvm/test/CodeGen/PowerPC/ucmp.ll
    llvm/test/CodeGen/RISCV/scmp.ll
    llvm/test/CodeGen/RISCV/ucmp.ll
    llvm/test/CodeGen/SystemZ/scmp.ll
    llvm/test/CodeGen/SystemZ/ucmp.ll
    llvm/test/CodeGen/Thumb/scmp.ll
    llvm/test/CodeGen/Thumb/ucmp.ll
    llvm/test/CodeGen/WebAssembly/scmp.ll
    llvm/test/CodeGen/WebAssembly/ucmp.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/SystemZ/SystemZISelLowering.h
    llvm/test/CodeGen/X86/scmp.ll
    llvm/test/CodeGen/X86/ucmp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 06e802314d97c..ef66b82d6f414 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3391,6 +3391,10 @@ class TargetLoweringBase {
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Should we expand [US]CMP nodes using two selects and two compares, or by
+  /// doing arithmetic on boolean types
+  virtual bool shouldExpandCmpUsingSelects() const { return false; }
+
   /// Does this target support complex deinterleaving
   virtual bool isComplexDeinterleavingSupported() const { return false; }
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 92e18a4b630e9..1433c8821248d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10391,14 +10391,28 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
 
   auto LTPredicate = (Opcode == ISD::UCMP ? ISD::SETULT : ISD::SETLT);
   auto GTPredicate = (Opcode == ISD::UCMP ? ISD::SETUGT : ISD::SETGT);
-
   SDValue IsLT = DAG.getSetCC(dl, BoolVT, LHS, RHS, LTPredicate);
   SDValue IsGT = DAG.getSetCC(dl, BoolVT, LHS, RHS, GTPredicate);
-  SDValue SelectZeroOrOne =
-      DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
-                    DAG.getConstant(0, dl, ResVT));
-  return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
-                       SelectZeroOrOne);
+
+  // We can't perform arithmetic on i1 values. Extending them would
+  // probably result in worse codegen, so let's just use two selects instead.
+  // Some targets are also just better off using selects rather than subtraction
+  // because one of the conditions can be merged with one of the selects.
+  // And finally, if we don't know the contents of high bits of a boolean value
+  // we can't perform any arithmetic either.
+  if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
+      getBooleanContents(BoolVT) == UndefinedBooleanContent) {
+    SDValue SelectZeroOrOne =
+        DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
+                      DAG.getConstant(0, dl, ResVT));
+    return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
+                         SelectZeroOrOne);
+  }
+
+  if (getBooleanContents(BoolVT) == ZeroOrNegativeOneBooleanContent)
+    std::swap(IsGT, IsLT);
+  return DAG.getSExtOrTrunc(DAG.getNode(ISD::SUB, dl, BoolVT, IsGT, IsLT), dl,
+                            ResVT);
 }
 
 SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 047c852bb01d2..fcdd47541be82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -907,6 +907,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+  bool shouldExpandCmpUsingSelects() const override { return true; }
+
   bool isComplexDeinterleavingSupported() const override;
   bool isComplexDeinterleavingOperationSupported(
       ComplexDeinterleavingOperation Operation, Type *Ty) const override;

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 2290a7d62e89f..1e7285e3e0fc5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -507,6 +507,8 @@ class SystemZTargetLowering : public TargetLowering {
 
   bool shouldConsiderGEPOffsetSplit() const override { return true; }
 
+  bool shouldExpandCmpUsingSelects() const override { return true; }
+
   const char *getTargetNodeName(unsigned Opcode) const override;
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

diff  --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll
new file mode 100644
index 0000000000000..6e493c993751c
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/scmp.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
+
+define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    movwgt r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    movwgt r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_8_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    movwgt r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_8_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlt lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlt r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp_8_128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    ldr r4, [sp, #24]
+; CHECK-NEXT:    mov r5, #0
+; CHECK-NEXT:    ldr r6, [sp, #28]
+; CHECK-NEXT:    subs r7, r0, r4
+; CHECK-NEXT:    ldr r12, [sp, #32]
+; CHECK-NEXT:    sbcs r7, r1, r6
+; CHECK-NEXT:    ldr lr, [sp, #36]
+; CHECK-NEXT:    sbcs r7, r2, r12
+; CHECK-NEXT:    sbcs r7, r3, lr
+; CHECK-NEXT:    mov r7, #0
+; CHECK-NEXT:    movwlt r7, #1
+; CHECK-NEXT:    subs r0, r4, r0
+; CHECK-NEXT:    sbcs r0, r6, r1
+; CHECK-NEXT:    sbcs r0, r12, r2
+; CHECK-NEXT:    sbcs r0, lr, r3
+; CHECK-NEXT:    movwlt r5, #1
+; CHECK-NEXT:    sub r0, r5, r7
+; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_32_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlt r0, #1
+; CHECK-NEXT:    movwgt r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_32_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlt lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlt r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_64_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlt lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlt r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    asr r1, r0, #31
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll
new file mode 100644
index 0000000000000..ad4af534ee8fe
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ucmp.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
+
+define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlo r0, #1
+; CHECK-NEXT:    movwhi r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlo r0, #1
+; CHECK-NEXT:    movwhi r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_8_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlo r0, #1
+; CHECK-NEXT:    movwhi r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_8_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlo lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlo r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp_8_128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    ldr r4, [sp, #24]
+; CHECK-NEXT:    mov r5, #0
+; CHECK-NEXT:    ldr r6, [sp, #28]
+; CHECK-NEXT:    subs r7, r0, r4
+; CHECK-NEXT:    ldr r12, [sp, #32]
+; CHECK-NEXT:    sbcs r7, r1, r6
+; CHECK-NEXT:    ldr lr, [sp, #36]
+; CHECK-NEXT:    sbcs r7, r2, r12
+; CHECK-NEXT:    sbcs r7, r3, lr
+; CHECK-NEXT:    mov r7, #0
+; CHECK-NEXT:    movwlo r7, #1
+; CHECK-NEXT:    subs r0, r4, r0
+; CHECK-NEXT:    sbcs r0, r6, r1
+; CHECK-NEXT:    sbcs r0, r12, r2
+; CHECK-NEXT:    sbcs r0, lr, r3
+; CHECK-NEXT:    movwlo r5, #1
+; CHECK-NEXT:    sub r0, r5, r7
+; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_32_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    movwlo r0, #1
+; CHECK-NEXT:    movwhi r2, #1
+; CHECK-NEXT:    sub r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_32_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlo lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlo r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_64_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    subs lr, r0, r2
+; CHECK-NEXT:    mov r12, #0
+; CHECK-NEXT:    sbcs lr, r1, r3
+; CHECK-NEXT:    mov lr, #0
+; CHECK-NEXT:    movwlo lr, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs r0, r3, r1
+; CHECK-NEXT:    movwlo r12, #1
+; CHECK-NEXT:    sub r0, r12, lr
+; CHECK-NEXT:    asr r1, r0, #31
+; CHECK-NEXT:    pop {r11, pc}
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/LoongArch/scmp.ll b/llvm/test/CodeGen/LoongArch/scmp.ll
new file mode 100644
index 0000000000000..69a92968173d2
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/scmp.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d --verify-machineinstrs < %s | FileCheck %s
+
+define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.8.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.w $a1, $a1, 0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.8.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp.8.128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a4, $a1, $a3
+; CHECK-NEXT:    xor $a5, $a1, $a3
+; CHECK-NEXT:    sltui $a5, $a5, 1
+; CHECK-NEXT:    masknez $a4, $a4, $a5
+; CHECK-NEXT:    sltu $a6, $a0, $a2
+; CHECK-NEXT:    maskeqz $a6, $a6, $a5
+; CHECK-NEXT:    or $a4, $a6, $a4
+; CHECK-NEXT:    slt $a1, $a3, $a1
+; CHECK-NEXT:    masknez $a1, $a1, $a5
+; CHECK-NEXT:    sltu $a0, $a2, $a0
+; CHECK-NEXT:    maskeqz $a0, $a0, $a5
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    sub.d $a0, $a0, $a4
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.32.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.w $a1, $a1, 0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.32.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.64.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slt $a2, $a0, $a1
+; CHECK-NEXT:    slt $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/LoongArch/ucmp.ll b/llvm/test/CodeGen/LoongArch/ucmp.ll
new file mode 100644
index 0000000000000..548c5bd0db72b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/ucmp.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d --verify-machineinstrs < %s | FileCheck %s
+
+define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.8.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrpick.d $a1, $a1, 31, 0
+; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.8.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp.8.128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a4, $a1, $a3
+; CHECK-NEXT:    xor $a5, $a1, $a3
+; CHECK-NEXT:    sltui $a5, $a5, 1
+; CHECK-NEXT:    masknez $a4, $a4, $a5
+; CHECK-NEXT:    sltu $a6, $a0, $a2
+; CHECK-NEXT:    maskeqz $a6, $a6, $a5
+; CHECK-NEXT:    or $a4, $a6, $a4
+; CHECK-NEXT:    sltu $a1, $a3, $a1
+; CHECK-NEXT:    masknez $a1, $a1, $a5
+; CHECK-NEXT:    sltu $a0, $a2, $a0
+; CHECK-NEXT:    maskeqz $a0, $a0, $a5
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    sub.d $a0, $a0, $a4
+; CHECK-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.32.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bstrpick.d $a1, $a1, 31, 0
+; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.32.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.64.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sltu $a2, $a0, $a1
+; CHECK-NEXT:    sltu $a0, $a1, $a0
+; CHECK-NEXT:    sub.d $a0, $a0, $a2
+; CHECK-NEXT:    ret
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/PowerPC/scmp.ll b/llvm/test/CodeGen/PowerPC/scmp.ll
new file mode 100644
index 0000000000000..107137c0bea7c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/scmp.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=ppc64le-unknown-unknown %s -o - | FileCheck %s
+
+define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw 3, 4
+; CHECK-NEXT:    sub 5, 4, 3
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw 3, 4
+; CHECK-NEXT:    sub 5, 4, 3
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_8_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    extsw 4, 4
+; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    cmpw 3, 4
+; CHECK-NEXT:    sub 3, 4, 3
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_8_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sradi 5, 4, 63
+; CHECK-NEXT:    rldicl 6, 3, 1, 63
+; CHECK-NEXT:    subc 7, 4, 3
+; CHECK-NEXT:    adde 5, 6, 5
+; CHECK-NEXT:    cmpd 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    xori 5, 5, 1
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp_8_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpld 4, 6
+; CHECK-NEXT:    cmpd 1, 4, 6
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    cmpld 5, 3, 5
+; CHECK-NEXT:    li 3, 1
+; CHECK-NEXT:    crandc 22, 5, 2
+; CHECK-NEXT:    crand 21, 2, 21
+; CHECK-NEXT:    crand 20, 2, 20
+; CHECK-NEXT:    crnor 21, 21, 22
+; CHECK-NEXT:    isel 3, 0, 3, 21
+; CHECK-NEXT:    crandc 21, 4, 2
+; CHECK-NEXT:    cror 20, 20, 21
+; CHECK-NEXT:    isel 3, 4, 3, 20
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_32_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    extsw 4, 4
+; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    cmpw 3, 4
+; CHECK-NEXT:    sub 3, 4, 3
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    blr
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_32_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sradi 5, 4, 63
+; CHECK-NEXT:    rldicl 6, 3, 1, 63
+; CHECK-NEXT:    subc 7, 4, 3
+; CHECK-NEXT:    adde 5, 6, 5
+; CHECK-NEXT:    cmpd 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    xori 5, 5, 1
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_64_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sradi 5, 4, 63
+; CHECK-NEXT:    rldicl 6, 3, 1, 63
+; CHECK-NEXT:    subc 7, 4, 3
+; CHECK-NEXT:    adde 5, 6, 5
+; CHECK-NEXT:    cmpd 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    xori 5, 5, 1
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll
new file mode 100644
index 0000000000000..d2dff6e7e05c8
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ucmp.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=ppc64le-unknown-unknown %s -o - | FileCheck %s
+
+define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmplw 3, 4
+; CHECK-NEXT:    sub 5, 4, 3
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmplw 3, 4
+; CHECK-NEXT:    sub 5, 4, 3
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_8_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrldi 5, 4, 32
+; CHECK-NEXT:    clrldi 6, 3, 32
+; CHECK-NEXT:    sub 5, 5, 6
+; CHECK-NEXT:    cmplw 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_8_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpld 3, 4
+; CHECK-NEXT:    subc 3, 4, 3
+; CHECK-NEXT:    subfe 3, 4, 4
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    neg 3, 3
+; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp_8_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpld 4, 6
+; CHECK-NEXT:    cmpld 1, 3, 5
+; CHECK-NEXT:    li 3, 1
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    crandc 20, 1, 2
+; CHECK-NEXT:    crand 21, 2, 5
+; CHECK-NEXT:    crnor 20, 21, 20
+; CHECK-NEXT:    crand 21, 2, 4
+; CHECK-NEXT:    isel 3, 0, 3, 20
+; CHECK-NEXT:    crandc 20, 0, 2
+; CHECK-NEXT:    cror 20, 21, 20
+; CHECK-NEXT:    isel 3, 4, 3, 20
+; CHECK-NEXT:    blr
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_32_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrldi 5, 4, 32
+; CHECK-NEXT:    clrldi 6, 3, 32
+; CHECK-NEXT:    sub 5, 5, 6
+; CHECK-NEXT:    cmplw 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    rldic 3, 3, 0, 32
+; CHECK-NEXT:    rldicl 5, 5, 1, 63
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_32_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpld 3, 4
+; CHECK-NEXT:    subc 3, 4, 3
+; CHECK-NEXT:    subfe 3, 4, 4
+; CHECK-NEXT:    li 4, -1
+; CHECK-NEXT:    neg 3, 3
+; CHECK-NEXT:    isellt 3, 4, 3
+; CHECK-NEXT:    blr
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_64_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subc 5, 4, 3
+; CHECK-NEXT:    cmpld 3, 4
+; CHECK-NEXT:    li 3, -1
+; CHECK-NEXT:    subfe 5, 4, 4
+; CHECK-NEXT:    neg 5, 5
+; CHECK-NEXT:    isellt 3, 3, 5
+; CHECK-NEXT:    blr
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/RISCV/scmp.ll b/llvm/test/CodeGen/RISCV/scmp.ll
new file mode 100644
index 0000000000000..e79b6989410a6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/scmp.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32I
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64I
+
+define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind {
+; RV32I-LABEL: scmp.8.8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slt a2, a0, a1
+; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.8.8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind {
+; RV32I-LABEL: scmp.8.16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slt a2, a0, a1
+; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.8.16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: scmp.8.32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slt a2, a0, a1
+; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.8.32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: scmp.8.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a3
+; RV32I-NEXT:    slt a0, a3, a1
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.8.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
+; RV32I-LABEL: scmp.8.128:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a0)
+; RV32I-NEXT:    lw a7, 8(a0)
+; RV32I-NEXT:    beq a6, a5, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt t2, a6, a5
+; RV32I-NEXT:    j .LBB4_3
+; RV32I-NEXT:  .LBB4_2:
+; RV32I-NEXT:    sltu t2, a7, a4
+; RV32I-NEXT:  .LBB4_3:
+; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw t0, 0(a0)
+; RV32I-NEXT:    beq a3, a2, .LBB4_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    sltu a0, a3, a2
+; RV32I-NEXT:    j .LBB4_6
+; RV32I-NEXT:  .LBB4_5:
+; RV32I-NEXT:    sltu a0, t0, a1
+; RV32I-NEXT:  .LBB4_6:
+; RV32I-NEXT:    xor t1, a6, a5
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    beqz t1, .LBB4_8
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    mv a0, t2
+; RV32I-NEXT:  .LBB4_8:
+; RV32I-NEXT:    beq a6, a5, .LBB4_11
+; RV32I-NEXT:  # %bb.9:
+; RV32I-NEXT:    slt a4, a5, a6
+; RV32I-NEXT:    bne a3, a2, .LBB4_12
+; RV32I-NEXT:  .LBB4_10:
+; RV32I-NEXT:    sltu a1, a1, t0
+; RV32I-NEXT:    bnez t1, .LBB4_13
+; RV32I-NEXT:    j .LBB4_14
+; RV32I-NEXT:  .LBB4_11:
+; RV32I-NEXT:    sltu a4, a4, a7
+; RV32I-NEXT:    beq a3, a2, .LBB4_10
+; RV32I-NEXT:  .LBB4_12:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    beqz t1, .LBB4_14
+; RV32I-NEXT:  .LBB4_13:
+; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:  .LBB4_14:
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.8.128:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    beq a1, a3, .LBB4_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    slt a4, a1, a3
+; RV64I-NEXT:    slt a0, a3, a1
+; RV64I-NEXT:    sub a0, a0, a4
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB4_2:
+; RV64I-NEXT:    sltu a4, a0, a2
+; RV64I-NEXT:    sltu a0, a2, a0
+; RV64I-NEXT:    sub a0, a0, a4
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: scmp.32.32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    slt a2, a0, a1
+; RV32I-NEXT:    slt a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.32.32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    sext.w a0, a0
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: scmp.32.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a3
+; RV32I-NEXT:    slt a0, a3, a1
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.32.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: scmp.64.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    slt a4, a1, a3
+; RV32I-NEXT:    slt a0, a3, a1
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: scmp.64.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slt a2, a0, a1
+; RV64I-NEXT:    slt a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll
new file mode 100644
index 0000000000000..026340ede1f90
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ucmp.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32I
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64I
+
+define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; RV32I-LABEL: ucmp.8.8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.8.8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; RV32I-LABEL: ucmp.8.16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.8.16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: ucmp.8.32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.8.32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a1, a1, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: ucmp.8.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a3
+; RV32I-NEXT:    sltu a0, a3, a1
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.8.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
+; RV32I-LABEL: ucmp.8.128:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a5, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a0)
+; RV32I-NEXT:    lw a7, 8(a0)
+; RV32I-NEXT:    beq a6, a5, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu t2, a6, a5
+; RV32I-NEXT:    j .LBB4_3
+; RV32I-NEXT:  .LBB4_2:
+; RV32I-NEXT:    sltu t2, a7, a4
+; RV32I-NEXT:  .LBB4_3:
+; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw t0, 0(a0)
+; RV32I-NEXT:    beq a3, a2, .LBB4_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    sltu a0, a3, a2
+; RV32I-NEXT:    j .LBB4_6
+; RV32I-NEXT:  .LBB4_5:
+; RV32I-NEXT:    sltu a0, t0, a1
+; RV32I-NEXT:  .LBB4_6:
+; RV32I-NEXT:    xor t1, a6, a5
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    beqz t1, .LBB4_8
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    mv a0, t2
+; RV32I-NEXT:  .LBB4_8:
+; RV32I-NEXT:    beq a6, a5, .LBB4_11
+; RV32I-NEXT:  # %bb.9:
+; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    bne a3, a2, .LBB4_12
+; RV32I-NEXT:  .LBB4_10:
+; RV32I-NEXT:    sltu a1, a1, t0
+; RV32I-NEXT:    bnez t1, .LBB4_13
+; RV32I-NEXT:    j .LBB4_14
+; RV32I-NEXT:  .LBB4_11:
+; RV32I-NEXT:    sltu a4, a4, a7
+; RV32I-NEXT:    beq a3, a2, .LBB4_10
+; RV32I-NEXT:  .LBB4_12:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    beqz t1, .LBB4_14
+; RV32I-NEXT:  .LBB4_13:
+; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:  .LBB4_14:
+; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.8.128:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    beq a1, a3, .LBB4_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sltu a4, a1, a3
+; RV64I-NEXT:    sltu a0, a3, a1
+; RV64I-NEXT:    sub a0, a0, a4
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB4_2:
+; RV64I-NEXT:    sltu a4, a0, a2
+; RV64I-NEXT:    sltu a0, a2, a0
+; RV64I-NEXT:    sub a0, a0, a4
+; RV64I-NEXT:    ret
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
+; RV32I-LABEL: ucmp.32.32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.32.32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a1, a1, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: ucmp.32.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a3
+; RV32I-NEXT:    sltu a0, a3, a1
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.32.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
+; RV32I-LABEL: ucmp.64.64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    beq a1, a3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a4, a1, a3
+; RV32I-NEXT:    sltu a0, a3, a1
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    sltu a4, a0, a2
+; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    sub a0, a0, a4
+; RV32I-NEXT:    srai a1, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ucmp.64.64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sltu a2, a0, a1
+; RV64I-NEXT:    sltu a0, a1, a0
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    ret
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/SystemZ/scmp.ll b/llvm/test/CodeGen/SystemZ/scmp.ll
new file mode 100644
index 0000000000000..3ecaa60a58d24
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/scmp.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.8.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.8.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cgr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp.8.128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    vecg %v0, %v1
+; CHECK-NEXT:    jlh .LBB4_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vchlgs %v2, %v1, %v0
+; CHECK-NEXT:  .LBB4_2:
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochil %r2, 1
+; CHECK-NEXT:    vecg %v1, %v0
+; CHECK-NEXT:    jlh .LBB4_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    vchlgs %v0, %v0, %v1
+; CHECK-NEXT:  .LBB4_4:
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.32.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.32.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cgr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.64.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cgr %r2, %r3
+; CHECK-NEXT:    lghi %r2, 0
+; CHECK-NEXT:    locghih %r2, 1
+; CHECK-NEXT:    locghil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/SystemZ/ucmp.ll b/llvm/test/CodeGen/SystemZ/ucmp.ll
new file mode 100644
index 0000000000000..4175cd7850a98
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/ucmp.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.8.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.8.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clgr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp.8.128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 0(%r2), 3
+; CHECK-NEXT:    veclg %v0, %v1
+; CHECK-NEXT:    jlh .LBB4_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vchlgs %v2, %v1, %v0
+; CHECK-NEXT:  .LBB4_2:
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochil %r2, 1
+; CHECK-NEXT:    veclg %v1, %v0
+; CHECK-NEXT:    jlh .LBB4_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    vchlgs %v0, %v0, %v1
+; CHECK-NEXT:  .LBB4_4:
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.32.32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.32.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clgr %r2, %r3
+; CHECK-NEXT:    lhi %r2, 0
+; CHECK-NEXT:    lochih %r2, 1
+; CHECK-NEXT:    lochil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.64.64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clgr %r2, %r3
+; CHECK-NEXT:    lghi %r2, 0
+; CHECK-NEXT:    locghih %r2, 1
+; CHECK-NEXT:    locghil %r2, -1
+; CHECK-NEXT:    br %r14
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/Thumb/scmp.ll b/llvm/test/CodeGen/Thumb/scmp.ll
new file mode 100644
index 0000000000000..661dbe97cdb3c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/scmp.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=thumbv7-apple-darwin %s -o - | FileCheck %s
+
+define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp_8_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_8_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_8_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp_8_128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    add.w lr, sp, #16
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    ldm.w lr, {r9, r12, lr}
+; CHECK-NEXT:    subs.w r6, r0, r9
+; CHECK-NEXT:    sbcs.w r6, r1, r12
+; CHECK-NEXT:    sbcs.w r6, r2, lr
+; CHECK-NEXT:    sbcs.w r6, r3, r4
+; CHECK-NEXT:    mov.w r6, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r6, #1
+; CHECK-NEXT:    subs.w r0, r9, r0
+; CHECK-NEXT:    sbcs.w r0, r12, r1
+; CHECK-NEXT:    sbcs.w r0, lr, r2
+; CHECK-NEXT:    sbcs.w r0, r4, r3
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r5, #1
+; CHECK-NEXT:    subs r0, r5, r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_32_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt r0, #1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    movgt r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_32_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp_64_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    movlt.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    bx lr
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/Thumb/ucmp.ll b/llvm/test/CodeGen/Thumb/ucmp.ll
new file mode 100644
index 0000000000000..7e6d0a323b11c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/ucmp.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=thumbv7-apple-darwin %s -o - | FileCheck %s
+
+define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp_8_16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_8_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_8_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    bx lr
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp_8_128:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    add.w lr, sp, #16
+; CHECK-NEXT:    ldr r4, [sp, #28]
+; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    ldm.w lr, {r9, r12, lr}
+; CHECK-NEXT:    subs.w r6, r0, r9
+; CHECK-NEXT:    sbcs.w r6, r1, r12
+; CHECK-NEXT:    sbcs.w r6, r2, lr
+; CHECK-NEXT:    sbcs.w r6, r3, r4
+; CHECK-NEXT:    mov.w r6, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r6, #1
+; CHECK-NEXT:    subs.w r0, r9, r0
+; CHECK-NEXT:    sbcs.w r0, r12, r1
+; CHECK-NEXT:    sbcs.w r0, lr, r2
+; CHECK-NEXT:    sbcs.w r0, r4, r3
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r5, #1
+; CHECK-NEXT:    subs r0, r5, r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp_32_32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    mov.w r0, #0
+; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo r0, #1
+; CHECK-NEXT:    it hi
+; CHECK-NEXT:    movhi r2, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_32_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    bx lr
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp_64_64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    subs.w r12, r0, r2
+; CHECK-NEXT:    mov.w r9, #0
+; CHECK-NEXT:    sbcs.w r12, r1, r3
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r12, #1
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    sbcs.w r0, r3, r1
+; CHECK-NEXT:    it lo
+; CHECK-NEXT:    movlo.w r9, #1
+; CHECK-NEXT:    sub.w r0, r9, r12
+; CHECK-NEXT:    asrs r1, r0, #31
+; CHECK-NEXT:    bx lr
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/WebAssembly/scmp.ll b/llvm/test/CodeGen/WebAssembly/scmp.ll
new file mode 100644
index 0000000000000..60ab6ef2f527a
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/scmp.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -wasm-keep-registers | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define i8 @scmp.8.8(i8 signext %x, i8 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.8:
+; CHECK:         .functype scmp.8.8 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.16(i16 signext %x, i16 signext %y) nounwind {
+; CHECK-LABEL: scmp.8.16:
+; CHECK:         .functype scmp.8.16 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.8.32:
+; CHECK:         .functype scmp.8.32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.8.64:
+; CHECK:         .functype scmp.8.64 (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i64.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i64.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: scmp.8.128:
+; CHECK:         .functype scmp.8.128 (i64, i64, i64, i64) -> (i32)
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push10=, 0
+; CHECK-NEXT:    local.get $push9=, 2
+; CHECK-NEXT:    i64.gt_u $push4=, $pop10, $pop9
+; CHECK-NEXT:    local.get $push12=, 1
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    i64.gt_s $push3=, $pop12, $pop11
+; CHECK-NEXT:    local.get $push14=, 1
+; CHECK-NEXT:    local.get $push13=, 3
+; CHECK-NEXT:    i64.eq $push8=, $pop14, $pop13
+; CHECK-NEXT:    local.tee $push7=, 4, $pop8
+; CHECK-NEXT:    i32.select $push5=, $pop4, $pop3, $pop7
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 2
+; CHECK-NEXT:    i64.lt_u $push1=, $pop16, $pop15
+; CHECK-NEXT:    local.get $push18=, 1
+; CHECK-NEXT:    local.get $push17=, 3
+; CHECK-NEXT:    i64.lt_s $push0=, $pop18, $pop17
+; CHECK-NEXT:    local.get $push19=, 4
+; CHECK-NEXT:    i32.select $push2=, $pop1, $pop0, $pop19
+; CHECK-NEXT:    i32.sub $push6=, $pop5, $pop2
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.scmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp.32.32:
+; CHECK:         .functype scmp.32.32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.32.64:
+; CHECK:         .functype scmp.32.64 (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i64.gt_s $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i64.lt_s $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: scmp.64.64:
+; CHECK:         .functype scmp.64.64 (i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push5=, 0
+; CHECK-NEXT:    local.get $push4=, 1
+; CHECK-NEXT:    i64.gt_s $push1=, $pop5, $pop4
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    i64.lt_s $push0=, $pop7, $pop6
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    i64.extend_i32_s $push3=, $pop2
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/WebAssembly/ucmp.ll b/llvm/test/CodeGen/WebAssembly/ucmp.ll
new file mode 100644
index 0000000000000..ab7f9b2bab1da
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/ucmp.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -wasm-keep-registers | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.8:
+; CHECK:         .functype ucmp.8.8 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
+; CHECK-LABEL: ucmp.8.16:
+; CHECK:         .functype ucmp.8.16 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.8.32:
+; CHECK:         .functype ucmp.8.32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.8.64:
+; CHECK:         .functype ucmp.8.64 (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i64.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i64.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
+  ret i8 %1
+}
+
+define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: ucmp.8.128:
+; CHECK:         .functype ucmp.8.128 (i64, i64, i64, i64) -> (i32)
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push10=, 0
+; CHECK-NEXT:    local.get $push9=, 2
+; CHECK-NEXT:    i64.gt_u $push4=, $pop10, $pop9
+; CHECK-NEXT:    local.get $push12=, 1
+; CHECK-NEXT:    local.get $push11=, 3
+; CHECK-NEXT:    i64.gt_u $push3=, $pop12, $pop11
+; CHECK-NEXT:    local.get $push14=, 1
+; CHECK-NEXT:    local.get $push13=, 3
+; CHECK-NEXT:    i64.eq $push8=, $pop14, $pop13
+; CHECK-NEXT:    local.tee $push7=, 4, $pop8
+; CHECK-NEXT:    i32.select $push5=, $pop4, $pop3, $pop7
+; CHECK-NEXT:    local.get $push16=, 0
+; CHECK-NEXT:    local.get $push15=, 2
+; CHECK-NEXT:    i64.lt_u $push1=, $pop16, $pop15
+; CHECK-NEXT:    local.get $push18=, 1
+; CHECK-NEXT:    local.get $push17=, 3
+; CHECK-NEXT:    i64.lt_u $push0=, $pop18, $pop17
+; CHECK-NEXT:    local.get $push19=, 4
+; CHECK-NEXT:    i32.select $push2=, $pop1, $pop0, $pop19
+; CHECK-NEXT:    i32.sub $push6=, $pop5, $pop2
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
+  ret i8 %1
+}
+
+define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: ucmp.32.32:
+; CHECK:         .functype ucmp.32.32 (i32, i32) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i32.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i32.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+  ret i32 %1
+}
+
+define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.32.64:
+; CHECK:         .functype ucmp.32.64 (i64, i64) -> (i32)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push4=, 0
+; CHECK-NEXT:    local.get $push3=, 1
+; CHECK-NEXT:    i64.gt_u $push1=, $pop4, $pop3
+; CHECK-NEXT:    local.get $push6=, 0
+; CHECK-NEXT:    local.get $push5=, 1
+; CHECK-NEXT:    i64.lt_u $push0=, $pop6, $pop5
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
+  ret i32 %1
+}
+
+define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: ucmp.64.64:
+; CHECK:         .functype ucmp.64.64 (i64, i64) -> (i64)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get $push5=, 0
+; CHECK-NEXT:    local.get $push4=, 1
+; CHECK-NEXT:    i64.gt_u $push1=, $pop5, $pop4
+; CHECK-NEXT:    local.get $push7=, 0
+; CHECK-NEXT:    local.get $push6=, 1
+; CHECK-NEXT:    i64.lt_u $push0=, $pop7, $pop6
+; CHECK-NEXT:    i32.sub $push2=, $pop1, $pop0
+; CHECK-NEXT:    i64.extend_i32_s $push3=, $pop2
+; CHECK-NEXT:    # fallthrough-return
+  %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
+  ret i64 %1
+}

diff  --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 55dc0d6059e05..7d4bbb06534e6 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -5,24 +5,19 @@
 define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
 ; X64-LABEL: scmp.8.8:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpb %sil, %dil
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.8.8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.scmp(i8 %x, i8 %y)
   ret i8 %1
@@ -31,24 +26,19 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
 define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
 ; X64-LABEL: scmp.8.16:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpw %si, %di
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.8.16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB1_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB1_2:
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.scmp(i16 %x, i16 %y)
   ret i8 %1
@@ -57,24 +47,19 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
 define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: scmp.8.32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.8.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.scmp(i32 %x, i32 %y)
   ret i8 %1
@@ -83,35 +68,32 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
 define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: scmp.8.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.8.64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB3_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB3_2:
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.scmp(i64 %x, i64 %y)
   ret i8 %1
@@ -120,16 +102,14 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind {
 define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; X64-LABEL: scmp.8.128:
 ; X64:       # %bb.0:
+; X64-NEXT:    cmpq %rdx, %rdi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setl %r8b
 ; X64-NEXT:    cmpq %rdi, %rdx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    sbbq %rsi, %rax
+; X64-NEXT:    sbbq %rsi, %rcx
 ; X64-NEXT:    setl %al
-; X64-NEXT:    movzbl %al, %r8d
-; X64-NEXT:    cmpq %rdx, %rdi
-; X64-NEXT:    sbbq %rcx, %rsi
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %r8d, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    subb %r8b, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.8.128:
@@ -142,26 +122,23 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB4_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB4_2:
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -174,25 +151,21 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: scmp.32.32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.32.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %dl
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    jl .LBB5_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB5_2:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %eax
 ; X86-NEXT:    retl
   %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
   ret i32 %1
@@ -201,34 +174,34 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: scmp.32.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.32.64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    jl .LBB6_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:  .LBB6_2:
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
   ret i32 %1
@@ -237,36 +210,36 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
 define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: scmp.64.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovgeq %rcx, %rax
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbq %cl, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp.64.64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jl .LBB7_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:  .LBB7_2:
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i64 @llvm.scmp(i64 %x, i64 %y)
   ret i64 %1
@@ -275,24 +248,19 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
 define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: scmp_narrow_result:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_narrow_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB8_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB8_2:
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    retl
   %1 = call i4 @llvm.scmp(i32 %x, i32 %y)
   ret i4 %1
@@ -305,39 +273,36 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind {
 ; X64-NEXT:    sarq $2, %rsi
 ; X64-NEXT:    shlq $2, %rdi
 ; X64-NEXT:    sarq $2, %rdi
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_narrow_op:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $2, %eax
 ; X86-NEXT:    sarl $2, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shll $2, %edi
-; X86-NEXT:    sarl $2, %edi
-; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB9_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:  .LBB9_2:
+; X86-NEXT:    shll $2, %esi
+; X86-NEXT:    sarl $2, %esi
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.scmp(i62 %x, i62 %y)
   ret i8 %1
@@ -346,39 +311,33 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind {
 define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: scmp_wide_result:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovgeq %rcx, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbq %cl, %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    andl $8191, %ecx # imm = 0x1FFF
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_wide_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB10_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    movb %bl, %cl
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:  .LBB10_2:
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl $0, 12(%eax)
-; X86-NEXT:    movl $0, 8(%eax)
-; X86-NEXT:    movw $0, 16(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    andl $8191, %ecx # imm = 0x1FFF
+; X86-NEXT:    movw %cx, 16(%eax)
 ; X86-NEXT:    retl $4
   %1 = call i141 @llvm.scmp(i32 %x, i32 %y)
   ret i141 %1
@@ -387,20 +346,18 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind {
 define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X64-LABEL: scmp_wide_op:
 ; X64:       # %bb.0:
-; X64-NEXT:    shlq $19, %rsi
-; X64-NEXT:    sarq $19, %rsi
 ; X64-NEXT:    shlq $19, %rcx
 ; X64-NEXT:    sarq $19, %rcx
+; X64-NEXT:    shlq $19, %rsi
+; X64-NEXT:    sarq $19, %rsi
+; X64-NEXT:    cmpq %rdx, %rdi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    setl %r8b
 ; X64-NEXT:    cmpq %rdi, %rdx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    sbbq %rsi, %rax
+; X64-NEXT:    sbbq %rsi, %rcx
 ; X64-NEXT:    setl %al
-; X64-NEXT:    movzbl %al, %r8d
-; X64-NEXT:    cmpq %rdx, %rdi
-; X64-NEXT:    sbbq %rcx, %rsi
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %r8d, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    subb %r8b, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_wide_op:
@@ -409,35 +366,31 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $19, %eax
 ; X86-NEXT:    sarl $19, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $19, %ecx
 ; X86-NEXT:    sarl $19, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebp, %esi
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    setl {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    setl %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    sbbl %edi, %ebp
 ; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB11_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:  .LBB11_2:
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -454,34 +407,28 @@ define i41 @scmp_uncommon_types(i7 %x, i7 %y) nounwind {
 ; X64-NEXT:    sarb %sil
 ; X64-NEXT:    addb %dil, %dil
 ; X64-NEXT:    sarb %dil
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpb %sil, %dil
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovgeq %rcx, %rax
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbq %cl, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_uncommon_types:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    sarb %al
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    sarb %dl
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpb %al, %dl
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jl .LBB12_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movb %bl, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB12_2:
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    cmpb %al, %cl
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    retl
   %1 = call i41 @llvm.scmp(i7 %x, i7 %y)
   ret i41 %1
@@ -494,38 +441,41 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movd %xmm2, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
 ; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; X64-NEXT:    movdqa %xmm2, %xmm0
@@ -533,59 +483,41 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: scmp_normal_vectors:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jl .LBB13_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB13_2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB13_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:  .LBB13_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %dl, %dh
+; X86-NEXT:    movsbl %dh, %edx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jl .LBB13_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB13_6:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    setg %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movsbl %bh, %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    setg %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movsbl %bh, %esi
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setg %cl
-; X86-NEXT:    jl .LBB13_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB13_8:
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %ch
+; X86-NEXT:    subb %cl, %ch
+; X86-NEXT:    movsbl %ch, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.scmp(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %1
@@ -596,45 +528,45 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movzbl %dl, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    movd %xmm2, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm2, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %edx, %esi
-; X64-NEXT:    setg %dil
-; X64-NEXT:    cmovll %eax, %edi
-; X64-NEXT:    movzbl %dil, %edx
-; X64-NEXT:    shll $8, %edx
-; X64-NEXT:    orl %ecx, %edx
+; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    cmpl %ecx, %edx
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %cl, %dl
+; X64-NEXT:    movzbl %dl, %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    movd %xmm2, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %ecx, %esi
-; X64-NEXT:    setg %dil
-; X64-NEXT:    cmovll %eax, %edi
-; X64-NEXT:    movzbl %dil, %ecx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    cmpl %eax, %edx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %al, %dl
+; X64-NEXT:    movzbl %dl, %eax
+; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    orl %ecx, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    movd %xmm1, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-NEXT:    movd %xmm0, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %edx, %esi
-; X64-NEXT:    setg %dil
-; X64-NEXT:    cmovll %eax, %edi
-; X64-NEXT:    shll $24, %edi
-; X64-NEXT:    orl %ecx, %edi
-; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    cmpl %ecx, %edx
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %cl, %dl
+; X64-NEXT:    movzbl %dl, %ecx
+; X64-NEXT:    shll $24, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_narrow_vec_result:
@@ -643,41 +575,29 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setg %ch
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jl .LBB14_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:  .LBB14_2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jl .LBB14_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:  .LBB14_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movb $-1, %dh
-; X86-NEXT:    jl .LBB14_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %bl, %dh
-; X86-NEXT:  .LBB14_6:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %ch
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %ch, %cl
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    setl %ch
 ; X86-NEXT:    setg %bl
-; X86-NEXT:    jl .LBB14_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %bl, %dl
-; X86-NEXT:  .LBB14_8:
-; X86-NEXT:    movb %dl, 3(%eax)
-; X86-NEXT:    movb %dh, 2(%eax)
-; X86-NEXT:    movb %ch, 1(%eax)
+; X86-NEXT:    subb %ch, %bl
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setl %ch
+; X86-NEXT:    setg %bh
+; X86-NEXT:    subb %ch, %bh
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    setg %ch
+; X86-NEXT:    subb %dl, %ch
+; X86-NEXT:    movb %ch, 3(%eax)
+; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb %bl, 1(%eax)
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -700,97 +620,82 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ; X64-NEXT:    psrad $24, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
 ; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movd %xmm2, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_narrow_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jl .LBB15_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB15_2:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB15_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %bl, %al
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:  .LBB15_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    setl %dl
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %dl, %dh
+; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    setl %bl
+; X86-NEXT:    setg %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movsbl %bh, %esi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jl .LBB15_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB15_6:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %ch
+; X86-NEXT:    setg %bl
+; X86-NEXT:    subb %ch, %bl
+; X86-NEXT:    movsbl %bl, %edi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    jl .LBB15_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB15_8:
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %ch
+; X86-NEXT:    subb %cl, %ch
+; X86-NEXT:    movsbl %ch, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.scmp(<4 x i8> %x, <4 x i8> %y)
   ret <4 x i32> %1
@@ -811,163 +716,178 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X64-NEXT:    psrad $24, %xmm6
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
 ; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    movd %xmm7, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm7
+; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm7
 ; X64-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; X64-NEXT:    movd %xmm5, %ecx
-; X64-NEXT:    movd %xmm6, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    movd %xmm5, %eax
+; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm5
+; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm5
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
 ; X64-NEXT:    psrad $24, %xmm5
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
 ; X64-NEXT:    psrad $24, %xmm4
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    movd %xmm6, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm6
+; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm6
 ; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; X64-NEXT:    movd %xmm5, %ecx
-; X64-NEXT:    movd %xmm4, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm5, %eax
+; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; X64-NEXT:    movd %xmm4, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm4
+; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm4
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; X64-NEXT:    psrad $24, %xmm5
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3]
-; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    movd %xmm4, %eax
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
 ; X64-NEXT:    psrad $24, %xmm6
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    movd %xmm7, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm7
+; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm7
 ; X64-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; X64-NEXT:    movd %xmm5, %ecx
-; X64-NEXT:    movd %xmm6, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm5, %eax
+; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm5
+; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm5
 ; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
 ; X64-NEXT:    psrad $24, %xmm5
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
 ; X64-NEXT:    psrad $24, %xmm4
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    movd %xmm6, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm6
+; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm6
 ; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; X64-NEXT:    movd %xmm5, %ecx
-; X64-NEXT:    movd %xmm4, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm5, %eax
+; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; X64-NEXT:    movd %xmm4, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %eax, %esi
-; X64-NEXT:    movd %esi, %xmm4
+; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movsbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm4
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
 ; X64-NEXT:    retq
@@ -978,202 +898,132 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    setg %dl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB16_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:  .LBB16_2:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jl .LBB16_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:  .LBB16_4:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %bh
+; X86-NEXT:    subb %al, %bh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    jl .LBB16_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:  .LBB16_6:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jl .LBB16_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:  .LBB16_8:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %bl
+; X86-NEXT:    subb %al, %bl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_10:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_12:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %al, %dh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_14:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_16
-; X86-NEXT:  # %bb.15:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_16:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_18
-; X86-NEXT:  # %bb.17:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_18:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB16_20
-; X86-NEXT:  # %bb.19:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:  .LBB16_20:
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    jl .LBB16_22
-; X86-NEXT:  # %bb.21:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:  .LBB16_22:
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jl .LBB16_24
-; X86-NEXT:  # %bb.23:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB16_24:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %ah
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jl .LBB16_26
-; X86-NEXT:  # %bb.25:
-; X86-NEXT:    movb %ah, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB16_26:
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    subb %al, %ah
+; X86-NEXT:    movsbl %ah, %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB16_28
-; X86-NEXT:  # %bb.27:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:  .LBB16_28:
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jl .LBB16_30
-; X86-NEXT:  # %bb.29:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB16_30:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    setg %cl
-; X86-NEXT:    jl .LBB16_32
-; X86-NEXT:  # %bb.31:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_32:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edi, 56(%eax)
-; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %edx, 48(%eax)
-; X86-NEXT:    movl %ebp, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl %esi, 56(%eax)
+; X86-NEXT:    movl %edi, 52(%eax)
+; X86-NEXT:    movl %ebp, 48(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %ecx, 44(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    movl %edx, 36(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    movsbl %bl, %esi
+; X86-NEXT:    movl %esi, 32(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movsbl %bh, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $48, %esp
+; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1187,121 +1037,136 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X64-LABEL: scmp_wide_vec_op:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %xmm7, %rax
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
 ; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovll %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm8
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm8
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
-; X64-NEXT:    movq %xmm7, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm7
+; X64-NEXT:    movq %xmm7, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm7
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; X64-NEXT:    movq %xmm6, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm7
+; X64-NEXT:    movq %xmm6, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm7
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; X64-NEXT:    movq %xmm6, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm6
+; X64-NEXT:    movq %xmm6, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm6
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; X64-NEXT:    movq %xmm5, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm6
+; X64-NEXT:    movq %xmm5, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm6
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; X64-NEXT:    movq %xmm5, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm5
+; X64-NEXT:    movq %xmm5, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm5
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X64-NEXT:    movq %xmm4, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm5
+; X64-NEXT:    movq %xmm4, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm5
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X64-NEXT:    movq %xmm4, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm4
+; X64-NEXT:    movq %xmm4, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm4
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; X64-NEXT:    movq %xmm3, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm4
+; X64-NEXT:    movq %xmm3, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm4
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X64-NEXT:    movq %xmm3, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm3
+; X64-NEXT:    movq %xmm3, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; X64-NEXT:    movq %xmm2, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm3
+; X64-NEXT:    movq %xmm2, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X64-NEXT:    movq %xmm2, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    movq %xmm2, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-NEXT:    movq %xmm1, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    movq %xmm1, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-NEXT:    movq %xmm1, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm1
+; X64-NEXT:    movq %xmm1, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm1
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovll %eax, %edx
-; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    cmpq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
@@ -1315,41 +1180,76 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    setl %al
+; X86-NEXT:    cmpl %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    setl %ah
+; X86-NEXT:    subb %al, %ah
+; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl %ecx, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    setl %ah
+; X86-NEXT:    subb %al, %ah
+; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    cmpl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %ebp, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    cmpl %edi, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, %bh
-; X86-NEXT:    jl .LBB17_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %bl, %bh
-; X86-NEXT:  .LBB17_2:
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_4:
+; X86-NEXT:    setl %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl %edx, %eax
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -1359,215 +1259,138 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_6:
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    setl %bl
 ; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_8:
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %bh
+; X86-NEXT:    subb %bl, %bh
+; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_10:
-; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_12:
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %bl, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_14:
-; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_16
-; X86-NEXT:  # %bb.15:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_16:
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %bl, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_18
-; X86-NEXT:  # %bb.17:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_18:
-; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    setl %al
+; X86-NEXT:    cmpl %ecx, %ebp
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_20
-; X86-NEXT:  # %bb.19:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_20:
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    setl %al
+; X86-NEXT:    cmpl %ecx, %ebp
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, (%esp) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jl .LBB17_22
-; X86-NEXT:  # %bb.21:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_22:
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jl .LBB17_24
-; X86-NEXT:  # %bb.23:
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:  .LBB17_24:
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    setl %ch
-; X86-NEXT:    cmpl %eax, %edi
+; X86-NEXT:    subb %dl, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jl .LBB17_26
-; X86-NEXT:  # %bb.25:
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:  .LBB17_26:
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    setl %dh
-; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    cmpl %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB17_28
-; X86-NEXT:  # %bb.27:
-; X86-NEXT:    movb %dh, %al
-; X86-NEXT:  .LBB17_28:
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %ebx
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    setl %dh
+; X86-NEXT:    cmpl %esi, %ebx
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %ebx, %edi
+; X86-NEXT:    subb %dh, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movb $-1, %dh
-; X86-NEXT:    jl .LBB17_30
-; X86-NEXT:  # %bb.29:
-; X86-NEXT:    movb %cl, %dh
-; X86-NEXT:  .LBB17_30:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    setl %dl
-; X86-NEXT:    cmpl %ecx, %ebx
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movb $-1, %bl
-; X86-NEXT:    jl .LBB17_32
-; X86-NEXT:  # %bb.31:
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:  .LBB17_32:
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    setl %dh
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    subb %dh, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb %bl, 15(%eax)
-; X86-NEXT:    movb %dh, 14(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 13(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movb %cl, 14(%eax)
+; X86-NEXT:    movb %dl, 13(%eax)
+; X86-NEXT:    movb %ch, 12(%eax)
+; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, 11(%eax)
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, 10(%eax)
@@ -1591,7 +1414,7 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X86-NEXT:    movb %cl, 1(%eax)
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1607,111 +1430,158 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X64-NEXT:    pushq %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
-; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; X64-NEXT:    addb %r11b, %r11b
-; X64-NEXT:    sarb %r11b
-; X64-NEXT:    addb %dl, %dl
-; X64-NEXT:    sarb %dl
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpb %r11b, %dl
-; X64-NEXT:    setg %dil
-; X64-NEXT:    movq $-1, %r11
-; X64-NEXT:    cmovlq %r11, %rdi
-; X64-NEXT:    addb %r12b, %r12b
-; X64-NEXT:    sarb %r12b
-; X64-NEXT:    addb %cl, %cl
-; X64-NEXT:    sarb %cl
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpb %r12b, %cl
-; X64-NEXT:    setg %dl
-; X64-NEXT:    cmovlq %r11, %rdx
 ; X64-NEXT:    addb %r15b, %r15b
 ; X64-NEXT:    sarb %r15b
-; X64-NEXT:    addb %r8b, %r8b
-; X64-NEXT:    sarb %r8b
-; X64-NEXT:    xorl %ecx, %ecx
-; X64-NEXT:    cmpb %r15b, %r8b
-; X64-NEXT:    setg %cl
-; X64-NEXT:    cmovlq %r11, %rcx
+; X64-NEXT:    addb %sil, %sil
+; X64-NEXT:    sarb %sil
+; X64-NEXT:    cmpb %r15b, %sil
+; X64-NEXT:    setl %sil
+; X64-NEXT:    setg %r15b
+; X64-NEXT:    subb %sil, %r15b
+; X64-NEXT:    movsbq %r15b, %rsi
+; X64-NEXT:    movq %rsi, (%rax)
+; X64-NEXT:    movq %rsi, %xmm0
+; X64-NEXT:    sarq $63, %rsi
 ; X64-NEXT:    addb %r14b, %r14b
 ; X64-NEXT:    sarb %r14b
-; X64-NEXT:    addb %r9b, %r9b
-; X64-NEXT:    sarb %r9b
-; X64-NEXT:    xorl %r8d, %r8d
-; X64-NEXT:    cmpb %r14b, %r9b
-; X64-NEXT:    setg %r8b
-; X64-NEXT:    cmovlq %r11, %r8
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; X64-NEXT:    addb %r15b, %r15b
+; X64-NEXT:    sarb %r15b
+; X64-NEXT:    cmpb %r14b, %r15b
+; X64-NEXT:    setl %r14b
+; X64-NEXT:    setg %r15b
+; X64-NEXT:    subb %r14b, %r15b
+; X64-NEXT:    movsbq %r15b, %r14
+; X64-NEXT:    movq %r14, %r15
+; X64-NEXT:    sarq $63, %r15
 ; X64-NEXT:    addb %bpl, %bpl
 ; X64-NEXT:    sarb %bpl
-; X64-NEXT:    addb %sil, %sil
-; X64-NEXT:    sarb %sil
-; X64-NEXT:    xorl %r9d, %r9d
-; X64-NEXT:    cmpb %bpl, %sil
-; X64-NEXT:    setg %r9b
-; X64-NEXT:    cmovlq %r11, %r9
+; X64-NEXT:    addb %dl, %dl
+; X64-NEXT:    sarb %dl
+; X64-NEXT:    cmpb %bpl, %dl
+; X64-NEXT:    setl %dl
+; X64-NEXT:    setg %bpl
+; X64-NEXT:    subb %dl, %bpl
+; X64-NEXT:    movsbq %bpl, %rdx
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    addb %bl, %bl
 ; X64-NEXT:    sarb %bl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    sarb %cl
+; X64-NEXT:    cmpb %bl, %cl
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %bl
+; X64-NEXT:    subb %cl, %bl
+; X64-NEXT:    movsbq %bl, %rbx
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    addb %r11b, %r11b
+; X64-NEXT:    sarb %r11b
+; X64-NEXT:    addb %r8b, %r8b
+; X64-NEXT:    sarb %r8b
+; X64-NEXT:    cmpb %r11b, %r8b
+; X64-NEXT:    setl %r8b
+; X64-NEXT:    setg %r11b
+; X64-NEXT:    subb %r8b, %r11b
+; X64-NEXT:    movsbq %r11b, %r8
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    sarq $63, %r11
+; X64-NEXT:    addb %r10b, %r10b
+; X64-NEXT:    sarb %r10b
+; X64-NEXT:    addb %r9b, %r9b
+; X64-NEXT:    sarb %r9b
+; X64-NEXT:    cmpb %r10b, %r9b
+; X64-NEXT:    setl %r9b
+; X64-NEXT:    setg %r10b
+; X64-NEXT:    subb %r9b, %r10b
+; X64-NEXT:    movsbq %r10b, %r9
+; X64-NEXT:    movq %r9, %r10
+; X64-NEXT:    sarq $63, %r10
+; X64-NEXT:    addb %dil, %dil
+; X64-NEXT:    sarb %dil
 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; X64-NEXT:    addb %bpl, %bpl
 ; X64-NEXT:    sarb %bpl
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpb %bl, %bpl
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovlq %r11, %rsi
-; X64-NEXT:    addb %r10b, %r10b
-; X64-NEXT:    sarb %r10b
-; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; X64-NEXT:    addb %bl, %bl
-; X64-NEXT:    sarb %bl
-; X64-NEXT:    xorl %r14d, %r14d
-; X64-NEXT:    cmpb %r10b, %bl
-; X64-NEXT:    setg %r14b
-; X64-NEXT:    cmovlq %r11, %r14
-; X64-NEXT:    movq %r14, %r10
-; X64-NEXT:    shrq $2, %r10
-; X64-NEXT:    movq %r10, 88(%rax)
-; X64-NEXT:    movq %rsi, %r10
-; X64-NEXT:    shlq $9, %r10
+; X64-NEXT:    cmpb %dil, %bpl
+; X64-NEXT:    setl %dil
+; X64-NEXT:    setg %bpl
+; X64-NEXT:    subb %dil, %bpl
+; X64-NEXT:    movsbq %bpl, %r13
+; X64-NEXT:    movq %r13, %rbp
+; X64-NEXT:    sarq $63, %rbp
+; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    shldq $62, %r13, %rdi
+; X64-NEXT:    movq %rdi, 88(%rax)
+; X64-NEXT:    shrq $2, %rbp
+; X64-NEXT:    movl %ebp, 96(%rax)
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    shldq $20, %r9, %rdi
+; X64-NEXT:    movq %rdi, 64(%rax)
+; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    shldq $31, %r8, %rdi
+; X64-NEXT:    movq %rdi, 48(%rax)
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    shldq $42, %rbx, %rdi
+; X64-NEXT:    movq %rdi, 32(%rax)
+; X64-NEXT:    movabsq $9007199254738944, %rdi # imm = 0x1FFFFFFFFFF800
+; X64-NEXT:    andq %r12, %rdi
+; X64-NEXT:    shldq $53, %rdx, %r12
+; X64-NEXT:    movq %r12, 16(%rax)
+; X64-NEXT:    movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
+; X64-NEXT:    andq %r12, %r15
+; X64-NEXT:    shldq $9, %r14, %r15
+; X64-NEXT:    shlq $62, %r13
+; X64-NEXT:    orq %r15, %r13
+; X64-NEXT:    movq %r13, 80(%rax)
+; X64-NEXT:    movabsq $2251799813685247, %r15 # imm = 0x7FFFFFFFFFFFF
+; X64-NEXT:    andq %rbp, %r15
+; X64-NEXT:    movq %r15, %r13
+; X64-NEXT:    shrq $48, %r13
+; X64-NEXT:    movb %r13b, 102(%rax)
+; X64-NEXT:    shrq $32, %r15
+; X64-NEXT:    movw %r15w, 100(%rax)
+; X64-NEXT:    shlq $42, %rbx
+; X64-NEXT:    shrq $11, %rdi
+; X64-NEXT:    orq %rbx, %rdi
+; X64-NEXT:    movq %rdi, 24(%rax)
+; X64-NEXT:    shlq $9, %r14
+; X64-NEXT:    shrq $44, %r10
+; X64-NEXT:    andl $511, %r10d # imm = 0x1FF
+; X64-NEXT:    orq %r14, %r10
 ; X64-NEXT:    movq %r10, 72(%rax)
-; X64-NEXT:    movq %r9, (%rax)
-; X64-NEXT:    shlq $62, %r14
-; X64-NEXT:    shrq $55, %rsi
-; X64-NEXT:    orq %r14, %rsi
-; X64-NEXT:    movq %rsi, 80(%rax)
-; X64-NEXT:    movq %r8, %rsi
-; X64-NEXT:    shrq $44, %rsi
-; X64-NEXT:    movq %rsi, 64(%rax)
-; X64-NEXT:    shlq $20, %r8
-; X64-NEXT:    movq %r8, 56(%rax)
-; X64-NEXT:    movq %rcx, %rsi
-; X64-NEXT:    shrq $33, %rsi
-; X64-NEXT:    movq %rsi, 48(%rax)
-; X64-NEXT:    shlq $31, %rcx
-; X64-NEXT:    movq %rcx, 40(%rax)
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    shlq $20, %r9
+; X64-NEXT:    shrq $33, %r11
+; X64-NEXT:    andl $1048575, %r11d # imm = 0xFFFFF
+; X64-NEXT:    orq %r9, %r11
+; X64-NEXT:    movq %r11, 56(%rax)
+; X64-NEXT:    shlq $31, %r8
 ; X64-NEXT:    shrq $22, %rcx
-; X64-NEXT:    movq %rcx, 32(%rax)
-; X64-NEXT:    shlq $42, %rdx
-; X64-NEXT:    movq %rdx, 24(%rax)
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    shrq $11, %rcx
-; X64-NEXT:    movq %rcx, 16(%rax)
-; X64-NEXT:    shlq $53, %rdi
-; X64-NEXT:    movq %rdi, 8(%rax)
-; X64-NEXT:    movb $0, 102(%rax)
-; X64-NEXT:    movw $0, 100(%rax)
-; X64-NEXT:    movl $0, 96(%rax)
+; X64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT:    orq %r8, %rcx
+; X64-NEXT:    movq %rcx, 40(%rax)
+; X64-NEXT:    movq %rsi, %xmm1
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NEXT:    movq %xmm0, %rcx
+; X64-NEXT:    andq %r12, %rcx
+; X64-NEXT:    shlq $53, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    movq %rdx, 8(%rax)
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
 ; X64-NEXT:    popq %rbp
@@ -1723,203 +1593,200 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $44, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    sarb %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    addb %dh, %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    sarb %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    addb %ch, %ch
-; X86-NEXT:    sarb %ch
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    sarb %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    sarb %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    addb %ah, %ah
-; X86-NEXT:    sarb %ah
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb %al, %ah
-; X86-NEXT:    setg %al
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jl .LBB18_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:  .LBB18_2:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    sarb %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
+; X86-NEXT:    sarb %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    sarb %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    addb %ah, %ah
+; X86-NEXT:    sarb %ah
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
 ; X86-NEXT:    sarb %cl
-; X86-NEXT:    sarb %dh
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb %dl, %ch
-; X86-NEXT:    setg %dl
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB18_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %dl, %al
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:  .LBB18_4:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    addb %bl, %bl
-; X86-NEXT:    addb %bh, %bh
-; X86-NEXT:    sarb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    sarb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb %cl, %dh
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB18_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %al
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:  .LBB18_6:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-NEXT:    addb %ch, %ch
-; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %ch
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    addb %bl, %bl
 ; X86-NEXT:    sarb %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    addb %bh, %bh
 ; X86-NEXT:    sarb %bh
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movb (%esp), %bl # 1-byte Reload
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jl .LBB18_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %bl, %al
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:  .LBB18_8:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    sarb %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-NEXT:    addb %dh, %dh
-; X86-NEXT:    sarb %ch
-; X86-NEXT:    sarb %dl
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; X86-NEXT:    setg %bl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jl .LBB18_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movb %bl, %al
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:  .LBB18_10:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sarb %cl
 ; X86-NEXT:    sarb %dh
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmpb %al, %dh
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %al, %dh
+; X86-NEXT:    movsbl %dh, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $2097151, %esi # imm = 0x1FFFFF
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpb %bl, %bh
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %al, %dh
+; X86-NEXT:    movsbl %dh, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $2097151, %esi # imm = 0x1FFFFF
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpb %cl, %ch
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ecx, (%ebp)
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $2097151, %esi # imm = 0x1FFFFF
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpb %dl, %ah
+; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    movl $-1, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jl .LBB18_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movb %dl, %bl
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:  .LBB18_12:
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb %cl, %dh
-; X86-NEXT:    setg %cl
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jl .LBB18_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:  .LBB18_14:
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X86-NEXT:    setl %dl
+; X86-NEXT:    setg %dh
+; X86-NEXT:    subb %dl, %dh
+; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 96(%ebp)
+; X86-NEXT:    movl %edx, 92(%ebp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 80(%ebp)
+; X86-NEXT:    movl %eax, 68(%ebp)
+; X86-NEXT:    movl %eax, 64(%ebp)
+; X86-NEXT:    movl %esi, 52(%ebp)
+; X86-NEXT:    movl %esi, 48(%ebp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 36(%ebp)
+; X86-NEXT:    movl %edi, 24(%ebp)
+; X86-NEXT:    movl %edi, 20(%ebp)
+; X86-NEXT:    movl %ecx, 8(%ebp)
+; X86-NEXT:    movl %ecx, 4(%ebp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $30, %edx, %ecx
+; X86-NEXT:    movl %ecx, 88(%ebp)
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $9, %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl %ecx, 76(%ebx)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $20, %ebx, %ecx
+; X86-NEXT:    movl %ecx, 60(%ebp)
 ; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    shrl $2, %ecx
-; X86-NEXT:    movl %ecx, 92(%eax)
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    shrl $23, %ecx
-; X86-NEXT:    movl %ecx, 80(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    shrl $12, %ecx
-; X86-NEXT:    movl %ecx, 64(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    shrl %ecx
-; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    shldl $31, %ebx, %ecx
+; X86-NEXT:    movl %ecx, 44(%ebp)
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrl $22, %ecx
-; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $10, %ebp, %ecx
+; X86-NEXT:    movl %ecx, 32(%ebx)
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $21, %ebp, %ecx
+; X86-NEXT:    movl %ecx, 16(%ebx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shrl $11, %ecx
-; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    movw %cx, 100(%ebx)
+; X86-NEXT:    shll $21, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, 12(%ebx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $30, %ecx, %esi
-; X86-NEXT:    movl %esi, 88(%eax)
 ; X86-NEXT:    shll $30, %ecx
-; X86-NEXT:    movl %ecx, 84(%eax)
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 84(%ebx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $9, %ecx, %ebp
-; X86-NEXT:    movl %ebp, 76(%eax)
 ; X86-NEXT:    shll $9, %ecx
-; X86-NEXT:    movl %ecx, 72(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $20, %ecx, %edi
-; X86-NEXT:    movl %edi, 60(%eax)
-; X86-NEXT:    shll $20, %ecx
-; X86-NEXT:    movl %ecx, 56(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $31, %ecx, %ebx
-; X86-NEXT:    movl %ebx, 44(%eax)
-; X86-NEXT:    shll $31, %ecx
-; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $10, %ecx, %esi
-; X86-NEXT:    movl %esi, 32(%eax)
-; X86-NEXT:    shll $10, %ecx
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $21, %ecx, %edx
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    shll $21, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movb $0, 102(%eax)
-; X86-NEXT:    movw $0, 100(%eax)
-; X86-NEXT:    movl $0, 96(%eax)
-; X86-NEXT:    movl $0, 68(%eax)
-; X86-NEXT:    movl $0, 52(%eax)
-; X86-NEXT:    movl $0, 24(%eax)
-; X86-NEXT:    movl $0, 8(%eax)
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    shrl $12, %eax
+; X86-NEXT:    andl $511, %eax # imm = 0x1FF
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, 72(%ebx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $20, %eax
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    andl $1048575, %esi # imm = 0xFFFFF
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %esi, 56(%ebx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, 40(%ebx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $10, %eax
+; X86-NEXT:    shrl $11, %edi
+; X86-NEXT:    andl $1023, %edi # imm = 0x3FF
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, 28(%ebx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $18, %eax
+; X86-NEXT:    andl $7, %eax
+; X86-NEXT:    movb %al, 102(%ebx)
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1936,39 +1803,36 @@ define <1 x i3> @scmp_scalarize(<1 x i33> %x, <1 x i33> %y) nounwind {
 ; X64-NEXT:    sarq $31, %rsi
 ; X64-NEXT:    shlq $31, %rdi
 ; X64-NEXT:    sarq $31, %rdi
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    setg %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovgel %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %al
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_scalarize:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB19_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:  .LBB19_2:
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call <1 x i3> @llvm.scmp(<1 x i33> %x, <1 x i33> %y)
   ret <1 x i3> %1
@@ -1981,29 +1845,29 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind {
 ; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andb $1, %cl
-; X64-NEXT:    negb %cl
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    andb $1, %sil
-; X64-NEXT:    negb %sil
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpb %cl, %sil
-; X64-NEXT:    setg %dil
-; X64-NEXT:    movl $255, %ecx
-; X64-NEXT:    cmovll %ecx, %edi
-; X64-NEXT:    shll $8, %edi
 ; X64-NEXT:    andb $1, %al
 ; X64-NEXT:    negb %al
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
 ; X64-NEXT:    andb $1, %dl
 ; X64-NEXT:    negb %dl
-; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    cmpb %al, %dl
-; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %ecx, %esi
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    orl %edi, %eax
-; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %al, %dl
+; X64-NEXT:    movzbl %dl, %eax
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb $1, %sil
+; X64-NEXT:    negb %sil
+; X64-NEXT:    cmpb %cl, %sil
+; X64-NEXT:    setl %cl
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %cl, %dl
+; X64-NEXT:    movzbl %dl, %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_bool_operands:
@@ -2011,29 +1875,23 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    andb $1, %ah
-; X86-NEXT:    negb %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    negb %al
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andb $1, %dl
 ; X86-NEXT:    negb %dl
-; X86-NEXT:    cmpb %al, %dl
-; X86-NEXT:    setg %ch
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jl .LBB20_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %ch, %al
-; X86-NEXT:  .LBB20_2:
-; X86-NEXT:    cmpb %cl, %ah
-; X86-NEXT:    setg %cl
-; X86-NEXT:    jl .LBB20_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:  .LBB20_4:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    andb $1, %ah
+; X86-NEXT:    negb %ah
+; X86-NEXT:    cmpb %al, %ah
+; X86-NEXT:    setl %ah
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %ah, %al
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    retl
   %1 = call <2 x i8> @llvm.scmp(<2 x i1> %x, <2 x i1> %y)
   ret <2 x i8> %1
@@ -2048,45 +1906,36 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin
 ; X64-NEXT:    movd %xmm0, %edx
 ; X64-NEXT:    movl %edx, %esi
 ; X64-NEXT:    shrl $8, %esi
-; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    cmpb %cl, %sil
-; X64-NEXT:    setg %dil
-; X64-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; X64-NEXT:    cmovll %ecx, %edi
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpb %al, %dl
+; X64-NEXT:    setl %cl
 ; X64-NEXT:    setg %sil
-; X64-NEXT:    cmovll %ecx, %esi
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    pinsrw $1, %edi, %xmm0
+; X64-NEXT:    subb %cl, %sil
+; X64-NEXT:    movsbl %sil, %ecx
+; X64-NEXT:    cmpb %al, %dl
+; X64-NEXT:    setl %al
+; X64-NEXT:    setg %dl
+; X64-NEXT:    subb %al, %dl
+; X64-NEXT:    movsbl %dl, %eax
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    pinsrw $1, %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: scmp_ret_wider_than_operands:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setg %ch
-; X86-NEXT:    movl $65535, %edx # imm = 0xFFFF
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    jl .LBB21_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %ch, %bl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:  .LBB21_2:
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movsbl %dl, %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    setg %cl
-; X86-NEXT:    jl .LBB21_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB21_4:
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %edx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    # kill: def $dx killed $dx killed $edx
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call <2 x i16> @llvm.scmp(<2 x i8> %x, <2 x i8> %y)
   ret <2 x i16> %1

diff  --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 344404749d7ef..ac35605be4d52 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -5,24 +5,17 @@
 define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
 ; X64-LABEL: ucmp.8.8:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpb %sil, %dil
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.8.8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
   ret i8 %1
@@ -31,24 +24,17 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
 define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
 ; X64-LABEL: ucmp.8.16:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpw %si, %di
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.8.16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB1_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB1_2:
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
   ret i8 %1
@@ -57,24 +43,17 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
 define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: ucmp.8.32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.8.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
   %1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
   ret i8 %1
@@ -83,33 +62,26 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
 define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: ucmp.8.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.8.64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %esi, %ecx
 ; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB3_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB3_2:
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -124,12 +96,9 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    sbbq %rsi, %rax
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r8d
 ; X64-NEXT:    cmpq %rdx, %rdi
 ; X64-NEXT:    sbbq %rcx, %rsi
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %r8d, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.8.128:
@@ -138,30 +107,26 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB4_2
-; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB4_2:
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ebp, %esi
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -174,25 +139,19 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: ucmp.32.32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.32.32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %dl
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    jb .LBB5_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB5_2:
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -201,34 +160,32 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: ucmp.32.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovael %ecx, %eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.32.64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    jb .LBB6_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:  .LBB6_2:
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movsbl %bl, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
   ret i32 %1
@@ -237,36 +194,34 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
 ; X64-LABEL: ucmp.64.64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbq %al, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp.64.64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jb .LBB7_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:  .LBB7_2:
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movsbl %bl, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
   ret i64 %1
@@ -275,24 +230,17 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
 define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: ucmp_narrow_result:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_narrow_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB8_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB8_2:
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    retl
   %1 = call i4 @llvm.ucmp(i32 %x, i32 %y)
   ret i4 %1
@@ -304,35 +252,28 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind {
 ; X64-NEXT:    movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF
 ; X64-NEXT:    andq %rax, %rsi
 ; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %ecx, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_narrow_op:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $1073741823, %eax # imm = 0x3FFFFFFF
+; X86-NEXT:    movl $1073741823, %ecx # imm = 0x3FFFFFFF
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB9_2
-; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB9_2:
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -343,39 +284,31 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind {
 define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: ucmp_wide_result:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovaeq %rcx, %rax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbq %al, %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    andl $8191, %ecx # imm = 0x1FFF
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_wide_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jb .LBB10_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %bl, %dl
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:  .LBB10_2:
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl $0, 12(%eax)
-; X86-NEXT:    movl $0, 8(%eax)
-; X86-NEXT:    movw $0, 16(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    andl $8191, %ecx # imm = 0x1FFF
+; X86-NEXT:    movw %cx, 16(%eax)
 ; X86-NEXT:    retl $4
   %1 = call i141 @llvm.ucmp(i32 %x, i32 %y)
   ret i141 %1
@@ -391,12 +324,9 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    sbbq %rsi, %rax
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r8d
 ; X64-NEXT:    cmpq %rdx, %rdi
 ; X64-NEXT:    sbbq %rcx, %rsi
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovael %r8d, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    sbbb $0, %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_wide_op:
@@ -405,33 +335,27 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl $8191, %eax # imm = 0x1FFF
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $8191, %ecx # imm = 0x1FFF
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    setb %al
 ; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB11_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:  .LBB11_2:
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -446,32 +370,24 @@ define i41 @ucmp_uncommon_types(i7 %x, i7 %y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    andb $127, %sil
 ; X64-NEXT:    andb $127, %dil
-; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpb %sil, %dil
-; X64-NEXT:    seta %cl
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    cmovaeq %rcx, %rax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbq %al, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_uncommon_types:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andb $127, %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    andb $127, %ah
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpb %al, %ah
-; X86-NEXT:    seta %bl
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    jb .LBB12_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %bl, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB12_2:
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $127, %cl
+; X86-NEXT:    cmpb %al, %cl
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    retl
   %1 = call i41 @llvm.ucmp(i7 %x, i7 %y)
   ret i41 %1
@@ -484,38 +400,37 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movd %xmm2, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
 ; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %dl
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovbl %eax, %edx
-; X64-NEXT:    movd %edx, %xmm2
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm1, %ecx
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; X64-NEXT:    movdqa %xmm2, %xmm0
@@ -523,59 +438,37 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_normal_vectors:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jb .LBB13_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB13_2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jb .LBB13_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:  .LBB13_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    seta %dl
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movsbl %dl, %edx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jb .LBB13_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB13_6:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %bl
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movsbl %bl, %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    seta %bl
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movsbl %bl, %esi
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %cl
-; X86-NEXT:    jb .LBB13_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB13_8:
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %1
@@ -586,45 +479,41 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %dl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovbl %eax, %edx
-; X64-NEXT:    movzbl %dl, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    movd %xmm2, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm2, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %edx, %esi
-; X64-NEXT:    seta %dil
-; X64-NEXT:    cmovbl %eax, %edi
-; X64-NEXT:    movzbl %dil, %edx
-; X64-NEXT:    shll $8, %edx
-; X64-NEXT:    orl %ecx, %edx
+; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    cmpl %ecx, %edx
+; X64-NEXT:    seta %cl
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    movd %xmm2, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %ecx, %esi
-; X64-NEXT:    seta %dil
-; X64-NEXT:    cmovbl %eax, %edi
-; X64-NEXT:    movzbl %dil, %ecx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    movd %xmm2, %edx
+; X64-NEXT:    cmpl %eax, %edx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    orl %ecx, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %edx
+; X64-NEXT:    movd %xmm1, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-NEXT:    movd %xmm0, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %edx, %esi
-; X64-NEXT:    seta %dil
-; X64-NEXT:    cmovbl %eax, %edi
-; X64-NEXT:    shll $24, %edi
-; X64-NEXT:    orl %ecx, %edi
-; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    cmpl %ecx, %edx
+; X64-NEXT:    seta %cl
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    shll $24, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_narrow_vec_result:
@@ -633,40 +522,24 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    seta %ch
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jb .LBB14_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:  .LBB14_2:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB14_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:  .LBB14_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbb $0, %ch
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    seta %bl
-; X86-NEXT:    movb $-1, %dh
-; X86-NEXT:    jb .LBB14_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %bl, %dh
-; X86-NEXT:  .LBB14_6:
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    jb .LBB14_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %bl, %dl
-; X86-NEXT:  .LBB14_8:
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    seta %dl
+; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movb %dl, 3(%eax)
-; X86-NEXT:    movb %dh, 2(%eax)
+; X86-NEXT:    movb %bl, 2(%eax)
 ; X86-NEXT:    movb %ch, 1(%eax)
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    popl %esi
@@ -682,105 +555,82 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    pxor %xmm2, %xmm2
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT:    pextrw $0, %xmm1, %ecx
+; X64-NEXT:    pextrw $0, %xmm1, %eax
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %eax
+; X64-NEXT:    movd %xmm3, %ecx
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; X64-NEXT:    pextrw $0, %xmm0, %edx
 ; X64-NEXT:    movdqa %xmm0, %xmm3
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
 ; X64-NEXT:    movd %xmm0, %esi
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %eax, %esi
-; X64-NEXT:    seta %dil
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    cmovbl %eax, %edi
+; X64-NEXT:    cmpl %ecx, %esi
+; X64-NEXT:    seta %cl
+; X64-NEXT:    sbbb $0, %cl
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; X64-NEXT:    movd %xmm0, %esi
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
-; X64-NEXT:    movd %xmm0, %r8d
-; X64-NEXT:    xorl %r9d, %r9d
-; X64-NEXT:    cmpl %esi, %r8d
-; X64-NEXT:    movd %edi, %xmm0
-; X64-NEXT:    seta %r9b
-; X64-NEXT:    cmovbl %eax, %r9d
-; X64-NEXT:    movd %r9d, %xmm2
+; X64-NEXT:    movd %xmm0, %edi
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    movsbl %cl, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    seta %cl
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movsbl %cl, %ecx
+; X64-NEXT:    movd %ecx, %xmm2
 ; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    cmpl %eax, %edx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_narrow_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jb .LBB15_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB15_2:
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    seta %dl
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
 ; X86-NEXT:    seta %bl
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jb .LBB15_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %bl, %al
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:  .LBB15_4:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movsbl %bl, %esi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jb .LBB15_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB15_6:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %ch
+; X86-NEXT:    sbbb $0, %ch
+; X86-NEXT:    movsbl %ch, %edi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    seta %cl
-; X86-NEXT:    jb .LBB15_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB15_8:
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.ucmp(<4 x i8> %x, <4 x i8> %y)
   ret <4 x i32> %1
@@ -798,178 +648,175 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X64-NEXT:    pxor %xmm2, %xmm2
 ; X64-NEXT:    movdqa %xmm1, %xmm4
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT:    pextrw $0, %xmm4, %edi
+; X64-NEXT:    pextrw $0, %xmm4, %edx
 ; X64-NEXT:    movdqa %xmm4, %xmm3
-; X64-NEXT:    pextrw $4, %xmm4, %r11d
+; X64-NEXT:    pextrw $4, %xmm4, %r9d
 ; X64-NEXT:    movdqa %xmm4, %xmm5
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3]
 ; X64-NEXT:    movd %xmm4, %eax
 ; X64-NEXT:    movdqa %xmm0, %xmm6
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; X64-NEXT:    pextrw $0, %xmm6, %r8d
+; X64-NEXT:    pextrw $0, %xmm6, %esi
 ; X64-NEXT:    movdqa %xmm6, %xmm4
-; X64-NEXT:    pextrw $4, %xmm6, %ebx
+; X64-NEXT:    pextrw $4, %xmm6, %r10d
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[3,3,3,3]
 ; X64-NEXT:    movd %xmm7, %ecx
-; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    movl $-1, %edx
-; X64-NEXT:    cmovbl %edx, %esi
-; X64-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %esi
+; X64-NEXT:    movd %xmm7, %ecx
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %r9d
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %esi, %r9d
-; X64-NEXT:    seta %al
-; X64-NEXT:    cmovbl %edx, %eax
+; X64-NEXT:    movd %xmm7, %edi
+; X64-NEXT:    cmpl %ecx, %edi
+; X64-NEXT:    seta %cl
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movsbl %cl, %eax
 ; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %edi, %r8d
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %edx, %esi
+; X64-NEXT:    cmpl %edx, %esi
+; X64-NEXT:    seta %dl
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movsbl %dl, %edx
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %r8d
+; X64-NEXT:    movd %xmm5, %esi
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %r9d
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    cmpl %r8d, %r9d
-; X64-NEXT:    seta %dil
-; X64-NEXT:    cmovbl %edx, %edi
+; X64-NEXT:    movd %xmm5, %edi
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    seta %sil
+; X64-NEXT:    sbbb $0, %sil
+; X64-NEXT:    movsbl %sil, %esi
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; X64-NEXT:    movd %xmm5, %r9d
+; X64-NEXT:    movd %xmm5, %edi
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[3,3,3,3]
-; X64-NEXT:    movd %xmm5, %r10d
-; X64-NEXT:    xorl %r8d, %r8d
-; X64-NEXT:    cmpl %r9d, %r10d
-; X64-NEXT:    seta %r8b
-; X64-NEXT:    cmovbl %edx, %r8d
+; X64-NEXT:    movd %xmm5, %r8d
+; X64-NEXT:    cmpl %edi, %r8d
+; X64-NEXT:    seta %dil
+; X64-NEXT:    sbbb $0, %dil
+; X64-NEXT:    movsbl %dil, %edi
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
-; X64-NEXT:    movd %xmm5, %r10d
+; X64-NEXT:    movd %xmm5, %r8d
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; X64-NEXT:    movd %xmm5, %ebp
-; X64-NEXT:    xorl %r9d, %r9d
-; X64-NEXT:    cmpl %r10d, %ebp
+; X64-NEXT:    movd %xmm5, %r11d
+; X64-NEXT:    cmpl %r8d, %r11d
+; X64-NEXT:    seta %r8b
+; X64-NEXT:    sbbb $0, %r8b
+; X64-NEXT:    movsbl %r8b, %r8d
+; X64-NEXT:    cmpl %r9d, %r10d
 ; X64-NEXT:    seta %r9b
-; X64-NEXT:    cmovbl %edx, %r9d
-; X64-NEXT:    xorl %r10d, %r10d
-; X64-NEXT:    cmpl %r11d, %ebx
-; X64-NEXT:    seta %r10b
-; X64-NEXT:    cmovbl %edx, %r10d
+; X64-NEXT:    sbbb $0, %r9b
+; X64-NEXT:    movsbl %r9b, %r9d
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X64-NEXT:    movd %xmm3, %ebx
+; X64-NEXT:    movd %xmm3, %r10d
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
-; X64-NEXT:    movd %xmm3, %ebp
-; X64-NEXT:    xorl %r11d, %r11d
-; X64-NEXT:    cmpl %ebx, %ebp
-; X64-NEXT:    seta %r11b
-; X64-NEXT:    cmovbl %edx, %r11d
+; X64-NEXT:    movd %xmm3, %r11d
+; X64-NEXT:    cmpl %r10d, %r11d
+; X64-NEXT:    seta %r10b
+; X64-NEXT:    sbbb $0, %r10b
+; X64-NEXT:    movsbl %r10b, %r10d
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X64-NEXT:    pextrw $0, %xmm1, %r15d
+; X64-NEXT:    pextrw $0, %xmm1, %ebx
 ; X64-NEXT:    movdqa %xmm1, %xmm4
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; X64-NEXT:    movd %xmm5, %ebp
+; X64-NEXT:    pextrw $4, %xmm1, %r11d
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X64-NEXT:    movd %xmm3, %r14d
 ; X64-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X64-NEXT:    pextrw $0, %xmm0, %r12d
+; X64-NEXT:    pextrw $0, %xmm0, %r15d
 ; X64-NEXT:    movdqa %xmm0, %xmm5
-; X64-NEXT:    movdqa %xmm0, %xmm6
-; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[3,3,3,3]
-; X64-NEXT:    movd %xmm7, %r14d
-; X64-NEXT:    xorl %ebx, %ebx
-; X64-NEXT:    cmpl %ebp, %r14d
-; X64-NEXT:    seta %bl
-; X64-NEXT:    cmovbl %edx, %ebx
-; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %r14d
-; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %r13d
-; X64-NEXT:    xorl %ebp, %ebp
-; X64-NEXT:    cmpl %r14d, %r13d
-; X64-NEXT:    seta %bpl
-; X64-NEXT:    cmovbl %edx, %ebp
-; X64-NEXT:    xorl %r14d, %r14d
-; X64-NEXT:    cmpl %r15d, %r12d
+; X64-NEXT:    pextrw $4, %xmm0, %ebp
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X64-NEXT:    movd %xmm3, %r12d
+; X64-NEXT:    cmpl %r14d, %r12d
 ; X64-NEXT:    seta %r14b
-; X64-NEXT:    cmovbl %edx, %r14d
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
+; X64-NEXT:    sbbb $0, %r14b
+; X64-NEXT:    movsbl %r14b, %r14d
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; X64-NEXT:    movd %xmm3, %r12d
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X64-NEXT:    movd %xmm3, %r13d
-; X64-NEXT:    xorl %r15d, %r15d
 ; X64-NEXT:    cmpl %r12d, %r13d
+; X64-NEXT:    seta %r12b
+; X64-NEXT:    sbbb $0, %r12b
+; X64-NEXT:    cmpl %ebx, %r15d
+; X64-NEXT:    seta %bl
+; X64-NEXT:    sbbb $0, %bl
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X64-NEXT:    movd %xmm1, %r15d
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT:    movd %xmm0, %r13d
+; X64-NEXT:    cmpl %r15d, %r13d
 ; X64-NEXT:    seta %r15b
-; X64-NEXT:    cmovbl %edx, %r15d
+; X64-NEXT:    sbbb $0, %r15b
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %r13d
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
+; X64-NEXT:    movd %xmm0, %r13d
 ; X64-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    xorl %r12d, %r12d
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
+; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    cmpl %r13d, %eax
-; X64-NEXT:    seta %r12b
-; X64-NEXT:    cmovbl %edx, %r12d
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    xorl %r13d, %r13d
-; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; X64-NEXT:    movd %xmm0, %r13d
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %r13d, %ecx
+; X64-NEXT:    movsbl %r12b, %ecx
+; X64-NEXT:    movsbl %bl, %ebx
+; X64-NEXT:    movsbl %r15b, %r15d
 ; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
 ; X64-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    pextrw $4, %xmm1, %eax
 ; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
 ; X64-NEXT:    # xmm3 = mem[0],zero,zero,zero
-; X64-NEXT:    pextrw $4, %xmm0, %ecx
-; X64-NEXT:    movd %esi, %xmm0
-; X64-NEXT:    movd %edi, %xmm6
-; X64-NEXT:    movd %r8d, %xmm7
-; X64-NEXT:    movd %r9d, %xmm8
-; X64-NEXT:    movd %r10d, %xmm1
-; X64-NEXT:    movd %r11d, %xmm9
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    movd %esi, %xmm6
+; X64-NEXT:    movd %edi, %xmm7
+; X64-NEXT:    movd %r8d, %xmm8
+; X64-NEXT:    movd %r9d, %xmm1
+; X64-NEXT:    movd %r10d, %xmm9
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %ebx, %xmm10
+; X64-NEXT:    movd %r14d, %xmm10
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; X64-NEXT:    movd %ebp, %xmm6
+; X64-NEXT:    movd %ecx, %xmm6
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; X64-NEXT:    movd %r14d, %xmm2
+; X64-NEXT:    movd %ebx, %xmm2
 ; X64-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
 ; X64-NEXT:    movd %r15d, %xmm3
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
-; X64-NEXT:    movd %r12d, %xmm7
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0]
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm7
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm8
 ; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; X64-NEXT:    seta %r13b
-; X64-NEXT:    cmovbl %edx, %r13d
-; X64-NEXT:    movd %r13d, %xmm6
-; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %edx, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; X64-NEXT:    cmpl %r11d, %ebp
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
 ; X64-NEXT:    movd %xmm4, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
 ; X64-NEXT:    movd %xmm4, %ecx
-; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %edx, %esi
-; X64-NEXT:    movd %esi, %xmm4
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movsbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm4
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
@@ -984,202 +831,115 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    seta %dl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jb .LBB16_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:  .LBB16_2:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    seta %bl
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jb .LBB16_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:  .LBB16_4:
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    seta %bh
+; X86-NEXT:    sbbb $0, %bh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    jb .LBB16_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:  .LBB16_6:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jb .LBB16_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:  .LBB16_8:
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_10:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_12:
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_14:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_16
-; X86-NEXT:  # %bb.15:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_16:
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_18
-; X86-NEXT:  # %bb.17:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_18:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB16_20
-; X86-NEXT:  # %bb.19:
-; X86-NEXT:    movb %al, %dl
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:  .LBB16_20:
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    jb .LBB16_22
-; X86-NEXT:  # %bb.21:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:  .LBB16_22:
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %ebp
-; X86-NEXT:    jb .LBB16_24
-; X86-NEXT:  # %bb.23:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:  .LBB16_24:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    seta %ah
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    jb .LBB16_26
-; X86-NEXT:  # %bb.25:
-; X86-NEXT:    movb %ah, %bl
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:  .LBB16_26:
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    jb .LBB16_28
-; X86-NEXT:  # %bb.27:
-; X86-NEXT:    movb %al, %bl
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:  .LBB16_28:
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    jb .LBB16_30
-; X86-NEXT:  # %bb.29:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:  .LBB16_30:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    seta %cl
-; X86-NEXT:    jb .LBB16_32
-; X86-NEXT:  # %bb.31:
-; X86-NEXT:    movb %cl, %bl
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:  .LBB16_32:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edi, 56(%eax)
+; X86-NEXT:    movl %edx, 56(%eax)
 ; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %edx, 48(%eax)
-; X86-NEXT:    movl %ebp, 44(%eax)
+; X86-NEXT:    movl %ebp, 48(%eax)
+; X86-NEXT:    movl %edi, 44(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movsbl %bh, %ecx
 ; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movsbl (%esp), %edx # 1-byte Folded Reload
+; X86-NEXT:    movl %edx, 32(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    movsbl %bl, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edx, 16(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $48, %esp
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1196,150 +956,149 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; X64-NEXT:    movd %xmm8, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3]
 ; X64-NEXT:    movd %xmm8, %ecx
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    cmpl %eax, %ecx
-; X64-NEXT:    seta %dl
-; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    cmovbl %eax, %edx
-; X64-NEXT:    movd %edx, %xmm8
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm8
 ; X64-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3]
-; X64-NEXT:    movd %xmm9, %ecx
+; X64-NEXT:    movd %xmm9, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
-; X64-NEXT:    movd %xmm9, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm9
+; X64-NEXT:    movd %xmm9, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm9
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; X64-NEXT:    movd %xmm7, %ecx
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm8
+; X64-NEXT:    movd %xmm7, %eax
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm8
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
-; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    movd %xmm7, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; X64-NEXT:    movd %xmm3, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    movd %xmm7, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; X64-NEXT:    movd %xmm7, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm7
+; X64-NEXT:    movd %xmm7, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm7
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; X64-NEXT:    movd %xmm6, %ecx
-; X64-NEXT:    movd %xmm2, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm3
+; X64-NEXT:    movd %xmm6, %eax
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
-; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    movd %xmm6, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X64-NEXT:    movd %xmm2, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    movd %xmm2, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X64-NEXT:    movd %xmm2, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    movd %xmm6, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X64-NEXT:    movd %xmm6, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm6
+; X64-NEXT:    movd %xmm6, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm6
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; X64-NEXT:    movd %xmm5, %ecx
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm2
+; X64-NEXT:    movd %xmm5, %eax
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm2
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    movd %xmm1, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X64-NEXT:    movd %xmm1, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    movd %xmm5, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; X64-NEXT:    movd %xmm5, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm5
+; X64-NEXT:    movd %xmm5, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm5
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; X64-NEXT:    movd %xmm4, %ecx
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm1
+; X64-NEXT:    movd %xmm4, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
-; X64-NEXT:    movd %xmm4, %ecx
+; X64-NEXT:    movd %xmm4, %eax
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpl %ecx, %edx
-; X64-NEXT:    seta %sil
-; X64-NEXT:    cmovbl %eax, %esi
-; X64-NEXT:    movd %esi, %xmm0
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbb $0, %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    movd %eax, %xmm0
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -1349,155 +1108,91 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_wide_vec_op:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    jb .LBB17_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:  .LBB17_2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB17_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB17_4:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB17_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:  .LBB17_6:
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB17_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB17_8:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB17_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:  .LBB17_10:
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB17_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB17_12:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB17_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:  .LBB17_14:
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB17_16
-; X86-NEXT:  # %bb.15:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB17_16:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB17_18
-; X86-NEXT:  # %bb.17:
-; X86-NEXT:    movb %cl, %ch
-; X86-NEXT:  .LBB17_18:
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %bh
+; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB17_20
-; X86-NEXT:  # %bb.19:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB17_20:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    movb $-1, %bh
-; X86-NEXT:    jb .LBB17_22
-; X86-NEXT:  # %bb.21:
-; X86-NEXT:    movb %cl, %bh
-; X86-NEXT:  .LBB17_22:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    seta %bl
+; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jb .LBB17_24
-; X86-NEXT:  # %bb.23:
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:  .LBB17_24:
+; X86-NEXT:    seta %dh
+; X86-NEXT:    sbbb $0, %dh
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %ch
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    jb .LBB17_26
-; X86-NEXT:  # %bb.25:
-; X86-NEXT:    movb %ch, %dl
-; X86-NEXT:  .LBB17_26:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbb $0, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %al
-; X86-NEXT:    movb $-1, %ch
-; X86-NEXT:    jb .LBB17_28
-; X86-NEXT:  # %bb.27:
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:  .LBB17_28:
+; X86-NEXT:    seta %dl
+; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    movb $-1, %dh
-; X86-NEXT:    jb .LBB17_30
-; X86-NEXT:  # %bb.29:
-; X86-NEXT:    movb %bl, %dh
-; X86-NEXT:  .LBB17_30:
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    jb .LBB17_32
-; X86-NEXT:  # %bb.31:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB17_32:
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Reload
-; X86-NEXT:    movb %bl, 15(%eax)
-; X86-NEXT:    movb %dh, 14(%eax)
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 15(%eax)
+; X86-NEXT:    movb %dl, 14(%eax)
 ; X86-NEXT:    movb %ch, 13(%eax)
-; X86-NEXT:    movb %dl, 12(%eax)
-; X86-NEXT:    movb %cl, 11(%eax)
+; X86-NEXT:    movb %dh, 12(%eax)
+; X86-NEXT:    movb %bl, 11(%eax)
 ; X86-NEXT:    movb %bh, 10(%eax)
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, 9(%eax)
@@ -1523,6 +1218,7 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <16 x i8> @llvm.ucmp(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i8> %1
@@ -1600,70 +1296,66 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    andl $127, %eax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; X64-NEXT:    andl $127, %r13d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    andl $127, %r12d
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    andl $127, %eax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT:    andl $127, %r15d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    andl $127, %r14d
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    andl $127, %eax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; X64-NEXT:    andl $127, %ebx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    andl $127, %r12d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    andl $127, %r15d
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
 ; X64-NEXT:    andl $127, %ebp
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; X64-NEXT:    andl $127, %r11d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    andl $127, %r8d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    andl $127, %r13d
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; X64-NEXT:    andl $127, %r10d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    andl $127, %edx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    andl $127, %esi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; X64-NEXT:    andl $127, %r14d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    andl $127, %ecx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    cmpq %r9, %rdi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    sbbq %r14, %rax
-; X64-NEXT:    setb %al
-; X64-NEXT:    cmpq %rdi, %r9
-; X64-NEXT:    sbbq %rcx, %r14
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    movl $255, %r14d
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    andl $127, %edi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    andl $127, %edx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    cmpq %r9, %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    setb %cl
+; X64-NEXT:    cmpq %r8, %r9
+; X64-NEXT:    sbbq %rdx, %rax
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    sbbq %rdx, %rdi
-; X64-NEXT:    setb %dil
-; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    movq %rdi, %rdx
 ; X64-NEXT:    sbbq %rsi, %rdx
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %rdi, %rsi
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
 ; X64-NEXT:    movq %r10, %rdx
-; X64-NEXT:    sbbq %r8, %rdx
+; X64-NEXT:    sbbq %r13, %rdx
 ; X64-NEXT:    setb %dl
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %r10, %r8
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    sbbq %r10, %r13
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
@@ -1672,184 +1364,179 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    setb %dl
 ; X64-NEXT:    cmpq %rcx, %rax
 ; X64-NEXT:    sbbq %r11, %rbp
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    movq %r15, %rdx
 ; X64-NEXT:    sbbq %rbx, %rdx
 ; X64-NEXT:    setb %dl
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %r12, %rbx
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, %rdx
-; X64-NEXT:    sbbq %r15, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %rsi, %r15
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    sbbq %r15, %rbx
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    cmpq %rax, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq %rsi, %rdx
-; X64-NEXT:    sbbq %r13, %rdx
+; X64-NEXT:    sbbq %r14, %rdx
 ; X64-NEXT:    setb %dl
 ; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %rsi, %r13
-; X64-NEXT:    movzbl %dl, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
-; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    sbbq %rsi, %r14
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rsi
-; X64-NEXT:    setb %sil
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sbbq %r12, %rax
+; X64-NEXT:    setb %r13b
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbq %rdi, %rax
-; X64-NEXT:    movzbl %sil, %ebp
-; X64-NEXT:    cmovbl %r14d, %ebp
+; X64-NEXT:    sbbq %rsi, %r12
+; X64-NEXT:    sbbb $0, %r13b
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    cmpq %rdx, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rdi
-; X64-NEXT:    setb %dil
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    cmpq %rsi, %rdx
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movzbl %dil, %ebx
-; X64-NEXT:    cmovbl %r14d, %ebx
+; X64-NEXT:    sbbq %rdi, %rax
+; X64-NEXT:    sbbb $0, %bpl
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    cmpq %rsi, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r8
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    sbbq %rax, %rdx
+; X64-NEXT:    setb %r11b
 ; X64-NEXT:    cmpq %rdi, %rsi
 ; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movzbl %r8b, %r10d
-; X64-NEXT:    cmovbl %r14d, %r10d
+; X64-NEXT:    sbbb $0, %r11b
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; X64-NEXT:    cmpq %rdi, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq %rcx, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r9
-; X64-NEXT:    setb %r9b
+; X64-NEXT:    sbbq %rax, %rsi
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    cmpq %r8, %rdi
 ; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movzbl %r9b, %r8d
-; X64-NEXT:    cmovbl %r14d, %r8d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    sbbb $0, %sil
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    cmpq %r8, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rdi
+; X64-NEXT:    setb %dil
+; X64-NEXT:    cmpq %r9, %r8
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %dil
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    cmpq %rdi, %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %r9, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    movq %rcx, %r8
 ; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r11
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    cmpq %r9, %rdi
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    cmpq %r10, %r9
 ; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movzbl %r11b, %r9d
-; X64-NEXT:    cmovbl %r14d, %r9d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; X64-NEXT:    cmpq %rdi, %r11
+; X64-NEXT:    sbbb $0, %r8b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    cmpq %r10, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rcx, %r9
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r15
-; X64-NEXT:    setb %r15b
-; X64-NEXT:    cmpq %r11, %rdi
+; X64-NEXT:    sbbq %rax, %r9
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    cmpq %rbx, %r10
 ; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movzbl %r15b, %r11d
-; X64-NEXT:    cmovbl %r14d, %r11d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    sbbb $0, %r9b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    cmpq %rax, %rdi
+; X64-NEXT:    cmpq %rax, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    sbbq %rcx, %r15
-; X64-NEXT:    setb %r15b
-; X64-NEXT:    cmpq %rdi, %rax
+; X64-NEXT:    sbbq %rcx, %r10
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    cmpq %rbx, %rax
 ; X64-NEXT:    sbbq %rdx, %rcx
-; X64-NEXT:    movzbl %r15b, %edi
-; X64-NEXT:    cmovbl %r14d, %edi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    sbbb $0, %r10b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    cmpq %rcx, %r15
+; X64-NEXT:    cmpq %rcx, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r12
-; X64-NEXT:    setb %r12b
-; X64-NEXT:    cmpq %r15, %rcx
+; X64-NEXT:    sbbq %rax, %rbx
+; X64-NEXT:    setb %bl
+; X64-NEXT:    cmpq %r14, %rcx
 ; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    movzbl %r12b, %r15d
-; X64-NEXT:    cmovbl %r14d, %r15d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    sbbb $0, %bl
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    cmpq %rcx, %r12
+; X64-NEXT:    cmpq %rcx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r13
-; X64-NEXT:    setb %r13b
-; X64-NEXT:    cmpq %r12, %rcx
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    setb %r14b
+; X64-NEXT:    cmpq %r15, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
 ; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    movzbl %r13b, %r12d
-; X64-NEXT:    cmovbl %r14d, %r12d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    cmpq %rsi, %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    sbbb $0, %r14b
+; X64-NEXT:    cmpq %r12, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r13
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r13
-; X64-NEXT:    setb %r13b
-; X64-NEXT:    cmpq %rdx, %rsi
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
-; X64-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload
-; X64-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload
-; X64-NEXT:    # xmm3 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; X64-NEXT:    # xmm4 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload
-; X64-NEXT:    # xmm5 = mem[0],zero,zero,zero
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload
-; X64-NEXT:    # xmm6 = mem[0],zero,zero,zero
-; X64-NEXT:    movd %ebp, %xmm7
-; X64-NEXT:    movd %ebx, %xmm8
-; X64-NEXT:    movd %r10d, %xmm9
-; X64-NEXT:    movd %r8d, %xmm10
-; X64-NEXT:    movd %r9d, %xmm11
-; X64-NEXT:    movd %r11d, %xmm12
-; X64-NEXT:    movd %edi, %xmm13
-; X64-NEXT:    movd %r15d, %xmm14
+; X64-NEXT:    sbbq %rcx, %r15
+; X64-NEXT:    setb %r15b
+; X64-NEXT:    cmpq %rax, %r12
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm3
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm4
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm5
+; X64-NEXT:    movzbl %r13b, %eax
+; X64-NEXT:    movd %eax, %xmm6
+; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    movd %eax, %xmm7
+; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    movd %eax, %xmm8
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movd %eax, %xmm9
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    movd %eax, %xmm10
+; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movd %eax, %xmm11
+; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    movd %eax, %xmm12
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movd %eax, %xmm13
+; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    movd %eax, %xmm14
+; X64-NEXT:    movzbl %r14b, %eax
+; X64-NEXT:    movd %eax, %xmm15
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
@@ -1861,17 +1548,17 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; X64-NEXT:    movd %r12d, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; X64-NEXT:    movzbl %r13b, %eax
-; X64-NEXT:    cmovbl %r14d, %eax
+; X64-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0]
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    sbbb $0, %r15b
+; X64-NEXT:    movzbl %r15b, %eax
 ; X64-NEXT:    andl $3, %eax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movb %al, 4(%rdi)
-; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    andl $3, %eax
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
@@ -1950,502 +1637,471 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $127, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    subl $132, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl $127, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movb $-1, %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    jb .LBB18_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_2:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jb .LBB18_4
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB18_4:
 ; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    andl $127, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jb .LBB18_6
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB18_6:
-; X86-NEXT:    andl $127, %ecx
-; X86-NEXT:    andl $127, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    andl $127, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $127, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    movb $-1, %bl
-; X86-NEXT:    jb .LBB18_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_8:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    andl $127, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jb .LBB18_10
-; X86-NEXT:  # %bb.9:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB18_10:
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    andl $127, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jb .LBB18_12
-; X86-NEXT:  # %bb.11:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB18_12:
-; X86-NEXT:    andl $127, %ecx
-; X86-NEXT:    andl $127, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    movb $-1, %bl
-; X86-NEXT:    jb .LBB18_14
-; X86-NEXT:  # %bb.13:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_14:
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    movb $-1, %bh
-; X86-NEXT:    jb .LBB18_16
-; X86-NEXT:  # %bb.15:
-; X86-NEXT:    movb %bl, %bh
-; X86-NEXT:  .LBB18_16:
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    andl $127, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    jb .LBB18_18
-; X86-NEXT:  # %bb.17:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_18:
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %ecx
-; X86-NEXT:    andl $127, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebp, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    movb $-1, %dh
-; X86-NEXT:    jb .LBB18_20
-; X86-NEXT:  # %bb.19:
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
-; X86-NEXT:  .LBB18_20:
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edi
-; X86-NEXT:    andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %ebp, %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %eax, %ebp
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jb .LBB18_22
-; X86-NEXT:  # %bb.21:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_22:
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movb $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    jb .LBB18_24
-; X86-NEXT:  # %bb.23:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:  .LBB18_24:
-; X86-NEXT:    andl $127, %ebp
-; X86-NEXT:    andl $127, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sbbl %ebp, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB18_26
-; X86-NEXT:  # %bb.25:
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:  .LBB18_26:
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edi
-; X86-NEXT:    andl $127, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sbbl %ebp, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    movb $-1, %ah
-; X86-NEXT:    jb .LBB18_28
-; X86-NEXT:  # %bb.27:
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:  .LBB18_28:
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    andl $127, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jb .LBB18_30
-; X86-NEXT:  # %bb.29:
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:  .LBB18_30:
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andl $127, %ebp
-; X86-NEXT:    andl $127, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb $-1, %cl
-; X86-NEXT:    jb .LBB18_32
-; X86-NEXT:  # %bb.31:
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:  .LBB18_32:
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %bh
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %esi, %edi
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movb $-1, %dl
-; X86-NEXT:    jb .LBB18_34
-; X86-NEXT:  # %bb.33:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:  .LBB18_34:
-; X86-NEXT:    movzbl %dl, %eax
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb %al, 4(%edx)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movb %cl, 4(%edi)
+; X86-NEXT:    movzbl %bh, %ebp
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    andl $3, %ebp
 ; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    leal (%edi,%ecx,4), %eax
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $4, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    leal (%ecx,%ebp,4), %ecx
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    andl $3, %ebx
 ; X86-NEXT:    shll $6, %ebx
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    andl $3, %ebp
-; X86-NEXT:    shll $8, %ebp
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    shll $8, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    shll $10, %edx
-; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    shll $12, %eax
@@ -2457,37 +2113,37 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    shll $18, %edi
-; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    shll $18, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    shll $20, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $20, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    shll $22, %ecx
 ; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $22, %esi
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    shll $24, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    shll $24, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    andl $3, %ebx
 ; X86-NEXT:    shll $26, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $28, %eax
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    shll $28, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; X86-NEXT:    shll $30, %edx
-; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, (%edi)
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl $132, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
