[llvm] [ARM] LHS and RHS should be frozen for LowerCMP (PR #159993)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 09:11:57 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/159993
From 942b079a52510a7910fbbf37322d528d3f2a0499 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sun, 21 Sep 2025 13:38:48 -0400
Subject: [PATCH 1/2] [TargetLowering][ARM] Freeze operands in UCMP
LHS and RHS are used multiple times in the expansion, so freeze them once up front; otherwise each use could independently refine an undef or poison operand and the two comparisons could disagree.
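
For illustration, here is a minimal IR sketch of one legal expansion of ucmp (the function and value names are illustrative only; the actual lowering builds the equivalent SelectionDAG/MIR nodes rather than this exact sequence):

    define i8 @expanded_ucmp(i32 %x, i32 %y) {
      ; Freeze both operands once so every later use observes the same
      ; concrete value, even when %x or %y is undef or poison.
      %fx = freeze i32 %x
      %fy = freeze i32 %y
      %gt = icmp ugt i32 %fx, %fy
      %lt = icmp ult i32 %fx, %fy
      %gt.ext = zext i1 %gt to i8
      %lt.ext = zext i1 %lt to i8
      ; 1 when %x > %y, -1 when %x < %y, 0 when equal.
      %res = sub i8 %gt.ext, %lt.ext
      ret i8 %res
    }

Without the freezes, the ugt and ult comparisons each see the raw operand, and for an undef input they may fold inconsistently (e.g. both to true), producing a result outside {-1, 0, 1}.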
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 8 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 13 +-
.../GlobalISel/legalize-threeway-cmp.mir | 34 +-
llvm/test/CodeGen/AArch64/freeze.ll | 32 +-
llvm/test/CodeGen/AArch64/ucmp.ll | 8 +-
llvm/test/CodeGen/ARM/scmp.ll | 30 +-
llvm/test/CodeGen/ARM/ucmp.ll | 30 +-
llvm/test/CodeGen/PowerPC/ucmp.ll | 12 +-
llvm/test/CodeGen/SystemZ/ucmp.ll | 4 +-
llvm/test/CodeGen/Thumb/scmp.ll | 60 +-
llvm/test/CodeGen/Thumb/ucmp.ll | 60 +-
llvm/test/CodeGen/X86/scmp.ll | 880 ++++++-----
llvm/test/CodeGen/X86/ucmp.ll | 1372 +++++++++--------
14 files changed, 1317 insertions(+), 1230 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index f3e036ed1b947..3be957378286f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8582,6 +8582,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Cmp->getReg(1));
LLT CmpTy = DstTy.changeElementSize(1);
+ auto LHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getLHSReg());
+ auto RHS = MIRBuilder.buildFreeze(SrcTy, Cmp->getRHSReg());
CmpInst::Predicate LTPredicate = Cmp->isSigned()
? CmpInst::Predicate::ICMP_SLT
@@ -8591,10 +8593,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
: CmpInst::Predicate::ICMP_UGT;
auto Zero = MIRBuilder.buildConstant(DstTy, 0);
- auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
- Cmp->getRHSReg());
- auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
- Cmp->getRHSReg());
+ auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, LHS, RHS);
+ auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, LHS, RHS);
auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 80500e48351e4..02f85cfc9262e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10956,8 +10956,8 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
unsigned Opcode = Node->getOpcode();
- SDValue LHS = Node->getOperand(0);
- SDValue RHS = Node->getOperand(1);
+ SDValue LHS = DAG.getFreeze(Node->getOperand(0));
+ SDValue RHS = DAG.getFreeze(Node->getOperand(1));
EVT VT = LHS.getValueType();
EVT ResVT = Node->getValueType(0);
EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9052cbfa89deb..01ab006d288fa 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -10479,6 +10479,9 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
// Special case for Thumb1 UCMP only
if (!IsSigned && Subtarget->isThumb1Only()) {
+ LHS = DAG.getFreeze(LHS);
+ RHS = DAG.getFreeze(RHS);
+
// For Thumb unsigned comparison, use this sequence:
// subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
// sbc r2, r2 ; r2 = r2 - r2 - !carry
@@ -10511,10 +10514,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
// Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
SDValue Result =
DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
- if (Op.getValueType() != MVT::i32)
- Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
-
- return Result;
+ return DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
}
// For the ARM assembly pattern:
@@ -10582,10 +10582,7 @@ SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
LTCondValue, Flags);
- if (Op.getValueType() != MVT::i32)
- Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
-
- return Result2;
+ return DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
index ae16e40671785..e1c63005ee9d2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-threeway-cmp.mir
@@ -7,8 +7,10 @@ body: |
; CHECK-LABEL: name: test_scmp
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+ ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[FREEZE]](s64), [[FREEZE1]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[FREEZE]](s64), [[FREEZE1]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -30,8 +32,10 @@ body: |
; CHECK-LABEL: name: test_ucmp
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+ ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE1]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE1]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[C1]]
@@ -61,8 +65,10 @@ body: |
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $w2
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $w3
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ugt), [[FREEZE]](<4 x s32>), [[FREEZE1]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(ult), [[FREEZE]](<4 x s32>), [[FREEZE1]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s16>) = G_SUB [[TRUNC]], [[TRUNC1]]
@@ -92,13 +98,17 @@ body: |
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[DEF]](s64), [[DEF]]
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY]]
+ ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]]
+ ; CHECK-NEXT: [[FREEZE2:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: [[FREEZE3:%[0-9]+]]:_(s64) = G_FREEZE [[DEF]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE]](s64), [[FREEZE2]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[FREEZE1]](s64), [[FREEZE3]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP]], [[ICMP1]]
- ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[DEF]](s64), [[DEF]]
- ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[DEF]](s64), [[DEF]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE]](s64), [[FREEZE2]]
+ ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[FREEZE1]](s64), [[FREEZE3]]
+ ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[FREEZE1]](s64), [[FREEZE3]]
; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP3]], [[ICMP4]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fae3bbe2dcfba..cdc7fc12404ca 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -522,16 +522,28 @@ define i32 @freeze_scmp(i32 %a0) nounwind {
}
define i32 @freeze_ucmp(i32 %a0) nounwind {
-; CHECK-LABEL: freeze_ucmp:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2 // =0x2
-; CHECK-NEXT: cmp w8, w0
-; CHECK-NEXT: cset w8, hi
-; CHECK-NEXT: csinv w8, w8, wzr, hs
-; CHECK-NEXT: cmp w8, #1
-; CHECK-NEXT: cset w8, hi
-; CHECK-NEXT: csinv w0, w8, wzr, hs
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: freeze_ucmp:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #2 // =0x2
+; CHECK-SD-NEXT: cmp w8, w0
+; CHECK-SD-NEXT: cset w8, hi
+; CHECK-SD-NEXT: csinv w8, w8, wzr, hs
+; CHECK-SD-NEXT: cmp w8, #1
+; CHECK-SD-NEXT: cset w8, hi
+; CHECK-SD-NEXT: csinv w0, w8, wzr, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: freeze_ucmp:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #2 // =0x2
+; CHECK-GI-NEXT: mov w9, #1 // =0x1
+; CHECK-GI-NEXT: cmp w8, w0
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: csinv w8, w8, wzr, hs
+; CHECK-GI-NEXT: cmp w8, w9
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: csinv w0, w8, wzr, hs
+; CHECK-GI-NEXT: ret
%x = call i32 @llvm.ucmp.i32(i32 2, i32 %a0)
%y = freeze i32 %x
%z = call i32 @llvm.ucmp.i32(i32 %y, i32 1)
diff --git a/llvm/test/CodeGen/AArch64/ucmp.ll b/llvm/test/CodeGen/AArch64/ucmp.ll
index af8225307fedd..6b5bcfa400230 100644
--- a/llvm/test/CodeGen/AArch64/ucmp.ll
+++ b/llvm/test/CodeGen/AArch64/ucmp.ll
@@ -13,8 +13,8 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
;
; CHECK-GI-LABEL: ucmp.8.8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: and w8, w0, #0xff
-; CHECK-GI-NEXT: and w9, w1, #0xff
+; CHECK-GI-NEXT: uxtb w8, w0
+; CHECK-GI-NEXT: uxtb w9, w1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, hi
; CHECK-GI-NEXT: csinv w0, w8, wzr, hs
@@ -34,8 +34,8 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
;
; CHECK-GI-LABEL: ucmp.8.16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: and w8, w0, #0xffff
-; CHECK-GI-NEXT: and w9, w1, #0xffff
+; CHECK-GI-NEXT: uxth w8, w0
+; CHECK-GI-NEXT: uxth w9, w1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: cset w8, hi
; CHECK-GI-NEXT: csinv w0, w8, wzr, hs
diff --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll
index 9189aee6aaf43..07a08f46ee1ca 100644
--- a/llvm/test/CodeGen/ARM/scmp.ll
+++ b/llvm/test/CodeGen/ARM/scmp.ll
@@ -58,23 +58,23 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: ldr r4, [sp, #24]
-; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: ldr r6, [sp, #28]
-; CHECK-NEXT: subs r7, r0, r4
-; CHECK-NEXT: ldr r12, [sp, #32]
-; CHECK-NEXT: sbcs r7, r1, r6
-; CHECK-NEXT: ldr lr, [sp, #36]
-; CHECK-NEXT: sbcs r7, r2, r12
-; CHECK-NEXT: sbcs r7, r3, lr
+; CHECK-NEXT: ldr r5, [sp, #24]
+; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: ldr r4, [sp, #28]
+; CHECK-NEXT: subs r7, r0, r5
+; CHECK-NEXT: ldr lr, [sp, #32]
+; CHECK-NEXT: sbcs r7, r1, r4
+; CHECK-NEXT: ldr r12, [sp, #36]
+; CHECK-NEXT: sbcs r7, r2, lr
+; CHECK-NEXT: sbcs r7, r3, r12
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: movwlt r7, #1
-; CHECK-NEXT: subs r0, r4, r0
-; CHECK-NEXT: sbcs r0, r6, r1
-; CHECK-NEXT: sbcs r0, r12, r2
-; CHECK-NEXT: sbcs r0, lr, r3
-; CHECK-NEXT: movwlt r5, #1
-; CHECK-NEXT: sub r0, r5, r7
+; CHECK-NEXT: subs r0, r5, r0
+; CHECK-NEXT: sbcs r0, r4, r1
+; CHECK-NEXT: sbcs r0, lr, r2
+; CHECK-NEXT: sbcs r0, r12, r3
+; CHECK-NEXT: movwlt r6, #1
+; CHECK-NEXT: sub r0, r6, r7
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
%1 = call i8 @llvm.scmp(i128 %x, i128 %y)
ret i8 %1
diff --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll
index bb0201454d1ea..a15cc4cca0d39 100644
--- a/llvm/test/CodeGen/ARM/ucmp.ll
+++ b/llvm/test/CodeGen/ARM/ucmp.ll
@@ -58,23 +58,23 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: ldr r4, [sp, #24]
-; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: ldr r6, [sp, #28]
-; CHECK-NEXT: subs r7, r0, r4
-; CHECK-NEXT: ldr r12, [sp, #32]
-; CHECK-NEXT: sbcs r7, r1, r6
-; CHECK-NEXT: ldr lr, [sp, #36]
-; CHECK-NEXT: sbcs r7, r2, r12
-; CHECK-NEXT: sbcs r7, r3, lr
+; CHECK-NEXT: ldr r5, [sp, #24]
+; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: ldr r4, [sp, #28]
+; CHECK-NEXT: subs r7, r0, r5
+; CHECK-NEXT: ldr lr, [sp, #32]
+; CHECK-NEXT: sbcs r7, r1, r4
+; CHECK-NEXT: ldr r12, [sp, #36]
+; CHECK-NEXT: sbcs r7, r2, lr
+; CHECK-NEXT: sbcs r7, r3, r12
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: movwlo r7, #1
-; CHECK-NEXT: subs r0, r4, r0
-; CHECK-NEXT: sbcs r0, r6, r1
-; CHECK-NEXT: sbcs r0, r12, r2
-; CHECK-NEXT: sbcs r0, lr, r3
-; CHECK-NEXT: movwlo r5, #1
-; CHECK-NEXT: sub r0, r5, r7
+; CHECK-NEXT: subs r0, r5, r0
+; CHECK-NEXT: sbcs r0, r4, r1
+; CHECK-NEXT: sbcs r0, lr, r2
+; CHECK-NEXT: sbcs r0, r12, r3
+; CHECK-NEXT: movwlo r6, #1
+; CHECK-NEXT: sub r0, r6, r7
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
%1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
ret i8 %1
diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll
index d2dff6e7e05c8..22faf9cbd9c24 100644
--- a/llvm/test/CodeGen/PowerPC/ucmp.ll
+++ b/llvm/test/CodeGen/PowerPC/ucmp.ll
@@ -4,11 +4,13 @@
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_8:
; CHECK: # %bb.0:
+; CHECK-NEXT: clrldi 5, 4, 32
+; CHECK-NEXT: clrldi 6, 3, 32
+; CHECK-NEXT: sub 5, 5, 6
; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: sub 5, 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldicl 5, 5, 1, 63
; CHECK-NEXT: rldic 3, 3, 0, 32
+; CHECK-NEXT: rldicl 5, 5, 1, 63
; CHECK-NEXT: isellt 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
@@ -18,11 +20,13 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_16:
; CHECK: # %bb.0:
+; CHECK-NEXT: clrldi 5, 4, 32
+; CHECK-NEXT: clrldi 6, 3, 32
+; CHECK-NEXT: sub 5, 5, 6
; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: sub 5, 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldicl 5, 5, 1, 63
; CHECK-NEXT: rldic 3, 3, 0, 32
+; CHECK-NEXT: rldicl 5, 5, 1, 63
; CHECK-NEXT: isellt 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
diff --git a/llvm/test/CodeGen/SystemZ/ucmp.ll b/llvm/test/CodeGen/SystemZ/ucmp.ll
index 4175cd7850a98..786f5610c2d1f 100644
--- a/llvm/test/CodeGen/SystemZ/ucmp.ll
+++ b/llvm/test/CodeGen/SystemZ/ucmp.ll
@@ -4,7 +4,7 @@
define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
; CHECK-LABEL: ucmp.8.8:
; CHECK: # %bb.0:
-; CHECK-NEXT: cr %r2, %r3
+; CHECK-NEXT: clr %r2, %r3
; CHECK-NEXT: lhi %r2, 0
; CHECK-NEXT: lochih %r2, 1
; CHECK-NEXT: lochil %r2, -1
@@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 zeroext %x, i8 zeroext %y) nounwind {
define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
; CHECK-LABEL: ucmp.8.16:
; CHECK: # %bb.0:
-; CHECK-NEXT: cr %r2, %r3
+; CHECK-NEXT: clr %r2, %r3
; CHECK-NEXT: lhi %r2, 0
; CHECK-NEXT: lochih %r2, 1
; CHECK-NEXT: lochil %r2, -1
diff --git a/llvm/test/CodeGen/Thumb/scmp.ll b/llvm/test/CodeGen/Thumb/scmp.ll
index c0024492b3a6d..cf73771d3c426 100644
--- a/llvm/test/CodeGen/Thumb/scmp.ll
+++ b/llvm/test/CodeGen/Thumb/scmp.ll
@@ -184,44 +184,50 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
; THUMB1: @ %bb.0:
; THUMB1-NEXT: .save {r4, r5, r6, r7, lr}
; THUMB1-NEXT: push {r4, r5, r6, r7, lr}
-; THUMB1-NEXT: .pad #20
-; THUMB1-NEXT: sub sp, #20
-; THUMB1-NEXT: str r3, [sp, #16] @ 4-byte Spill
-; THUMB1-NEXT: movs r3, #1
-; THUMB1-NEXT: str r3, [sp] @ 4-byte Spill
-; THUMB1-NEXT: movs r3, #0
-; THUMB1-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; THUMB1-NEXT: ldr r6, [sp, #52]
-; THUMB1-NEXT: add r7, sp, #40
-; THUMB1-NEXT: ldm r7, {r3, r5, r7}
-; THUMB1-NEXT: subs r4, r0, r3
-; THUMB1-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT: .pad #36
+; THUMB1-NEXT: sub sp, #36
+; THUMB1-NEXT: ldr r4, [sp, #68]
+; THUMB1-NEXT: str r4, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT: add r7, sp, #56
+; THUMB1-NEXT: ldm r7, {r5, r6, r7}
+; THUMB1-NEXT: movs r4, #1
+; THUMB1-NEXT: str r4, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT: movs r4, #0
+; THUMB1-NEXT: str r4, [sp, #24] @ 4-byte Spill
+; THUMB1-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; THUMB1-NEXT: str r5, [sp, #12] @ 4-byte Spill
+; THUMB1-NEXT: subs r4, r0, r5
+; THUMB1-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; THUMB1-NEXT: str r1, [sp, #20] @ 4-byte Spill
; THUMB1-NEXT: mov r4, r1
-; THUMB1-NEXT: ldr r1, [sp] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r4, r5
-; THUMB1-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT: sbcs r4, r6
+; THUMB1-NEXT: str r2, [sp, #28] @ 4-byte Spill
; THUMB1-NEXT: mov r4, r2
+; THUMB1-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; THUMB1-NEXT: sbcs r4, r7
-; THUMB1-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r4, r6
-; THUMB1-NEXT: mov r2, r1
+; THUMB1-NEXT: str r3, [sp, #32] @ 4-byte Spill
+; THUMB1-NEXT: mov r4, r3
+; THUMB1-NEXT: sbcs r4, r5
+; THUMB1-NEXT: mov r1, r2
; THUMB1-NEXT: blt .LBB4_2
; THUMB1-NEXT: @ %bb.1:
-; THUMB1-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; THUMB1-NEXT: .LBB4_2:
-; THUMB1-NEXT: subs r0, r3, r0
-; THUMB1-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r5, r0
-; THUMB1-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r7, r0
; THUMB1-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; THUMB1-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: subs r0, r3, r0
+; THUMB1-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; THUMB1-NEXT: sbcs r6, r0
+; THUMB1-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; THUMB1-NEXT: sbcs r7, r0
+; THUMB1-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; THUMB1-NEXT: sbcs r5, r0
; THUMB1-NEXT: blt .LBB4_4
; THUMB1-NEXT: @ %bb.3:
-; THUMB1-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
; THUMB1-NEXT: .LBB4_4:
-; THUMB1-NEXT: subs r0, r1, r2
-; THUMB1-NEXT: add sp, #20
+; THUMB1-NEXT: subs r0, r2, r1
+; THUMB1-NEXT: add sp, #36
; THUMB1-NEXT: pop {r4, r5, r6, r7, pc}
;
; THUMB2-LABEL: scmp_8_128:
diff --git a/llvm/test/CodeGen/Thumb/ucmp.ll b/llvm/test/CodeGen/Thumb/ucmp.ll
index 5d0f57e2a9d72..e10a162ed0474 100644
--- a/llvm/test/CodeGen/Thumb/ucmp.ll
+++ b/llvm/test/CodeGen/Thumb/ucmp.ll
@@ -151,44 +151,50 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
; THUMB1: @ %bb.0:
; THUMB1-NEXT: .save {r4, r5, r6, r7, lr}
; THUMB1-NEXT: push {r4, r5, r6, r7, lr}
-; THUMB1-NEXT: .pad #20
-; THUMB1-NEXT: sub sp, #20
-; THUMB1-NEXT: str r3, [sp, #16] @ 4-byte Spill
-; THUMB1-NEXT: movs r3, #1
-; THUMB1-NEXT: str r3, [sp] @ 4-byte Spill
-; THUMB1-NEXT: movs r3, #0
-; THUMB1-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; THUMB1-NEXT: ldr r6, [sp, #52]
-; THUMB1-NEXT: add r7, sp, #40
-; THUMB1-NEXT: ldm r7, {r3, r5, r7}
-; THUMB1-NEXT: subs r4, r0, r3
-; THUMB1-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT: .pad #36
+; THUMB1-NEXT: sub sp, #36
+; THUMB1-NEXT: ldr r4, [sp, #68]
+; THUMB1-NEXT: str r4, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT: add r7, sp, #56
+; THUMB1-NEXT: ldm r7, {r5, r6, r7}
+; THUMB1-NEXT: movs r4, #1
+; THUMB1-NEXT: str r4, [sp, #4] @ 4-byte Spill
+; THUMB1-NEXT: movs r4, #0
+; THUMB1-NEXT: str r4, [sp, #24] @ 4-byte Spill
+; THUMB1-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; THUMB1-NEXT: str r5, [sp, #12] @ 4-byte Spill
+; THUMB1-NEXT: subs r4, r0, r5
+; THUMB1-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; THUMB1-NEXT: str r1, [sp, #20] @ 4-byte Spill
; THUMB1-NEXT: mov r4, r1
-; THUMB1-NEXT: ldr r1, [sp] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r4, r5
-; THUMB1-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; THUMB1-NEXT: sbcs r4, r6
+; THUMB1-NEXT: str r2, [sp, #28] @ 4-byte Spill
; THUMB1-NEXT: mov r4, r2
+; THUMB1-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; THUMB1-NEXT: sbcs r4, r7
-; THUMB1-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r4, r6
-; THUMB1-NEXT: mov r2, r1
+; THUMB1-NEXT: str r3, [sp, #32] @ 4-byte Spill
+; THUMB1-NEXT: mov r4, r3
+; THUMB1-NEXT: sbcs r4, r5
+; THUMB1-NEXT: mov r1, r2
; THUMB1-NEXT: blo .LBB4_2
; THUMB1-NEXT: @ %bb.1:
-; THUMB1-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; THUMB1-NEXT: .LBB4_2:
-; THUMB1-NEXT: subs r0, r3, r0
-; THUMB1-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r5, r0
-; THUMB1-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; THUMB1-NEXT: sbcs r7, r0
; THUMB1-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; THUMB1-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: subs r0, r3, r0
+; THUMB1-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; THUMB1-NEXT: sbcs r6, r0
+; THUMB1-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; THUMB1-NEXT: sbcs r7, r0
+; THUMB1-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; THUMB1-NEXT: sbcs r5, r0
; THUMB1-NEXT: blo .LBB4_4
; THUMB1-NEXT: @ %bb.3:
-; THUMB1-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; THUMB1-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
; THUMB1-NEXT: .LBB4_4:
-; THUMB1-NEXT: subs r0, r1, r2
-; THUMB1-NEXT: add sp, #20
+; THUMB1-NEXT: subs r0, r2, r1
+; THUMB1-NEXT: add sp, #36
; THUMB1-NEXT: pop {r4, r5, r6, r7, pc}
;
; THUMB2-LABEL: ucmp_8_128:
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 8a287229a1cb1..5a7a05d09763e 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -17,7 +17,7 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind {
; X86-LABEL: scmp.8.8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %al
; X86-NEXT: subb %cl, %al
@@ -38,7 +38,7 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind {
; X86-LABEL: scmp.8.16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: cmpw %ax, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %al
; X86-NEXT: subb %cl, %al
@@ -59,7 +59,7 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind {
; X86-LABEL: scmp.8.32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %al
; X86-NEXT: subb %cl, %al
@@ -167,7 +167,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
; X86-LABEL: scmp.32.32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
@@ -263,7 +263,7 @@ define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind {
; X86-LABEL: scmp_narrow_result:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %al
; X86-NEXT: subb %cl, %al
@@ -330,9 +330,9 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind {
;
; X86-LABEL: scmp_wide_result:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %dl
; X86-NEXT: subb %cl, %dl
@@ -471,27 +471,27 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: setl %dl
; X86-NEXT: setg %dh
; X86-NEXT: subb %dl, %dh
; X86-NEXT: movsbl %dh, %edx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: setl %bl
; X86-NEXT: setg %bh
; X86-NEXT: subb %bl, %bh
; X86-NEXT: movsbl %bh, %edi
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: setl %bl
; X86-NEXT: setg %bh
; X86-NEXT: subb %bl, %bh
; X86-NEXT: movsbl %bh, %esi
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %ch
; X86-NEXT: subb %cl, %ch
@@ -628,31 +628,31 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: setl %ch
-; X86-NEXT: setg %cl
-; X86-NEXT: subb %ch, %cl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: setl %ch
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: setl %dh
+; X86-NEXT: setg %dl
+; X86-NEXT: subb %dh, %dl
+; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: setl %dh
; X86-NEXT: setg %bl
-; X86-NEXT: subb %ch, %bl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: setl %ch
+; X86-NEXT: subb %dh, %bl
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: setl %dh
; X86-NEXT: setg %bh
-; X86-NEXT: subb %ch, %bh
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: setl %dl
+; X86-NEXT: subb %dh, %bh
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: setl %cl
; X86-NEXT: setg %ch
-; X86-NEXT: subb %dl, %ch
+; X86-NEXT: subb %cl, %ch
; X86-NEXT: movb %ch, 3(%eax)
; X86-NEXT: movb %bh, 2(%eax)
; X86-NEXT: movb %bl, 1(%eax)
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: movb %dl, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -713,27 +713,27 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %dl
; X86-NEXT: setg %dh
; X86-NEXT: subb %dl, %dh
; X86-NEXT: movsbl %dh, %edx
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %bl
; X86-NEXT: setg %bh
; X86-NEXT: subb %bl, %bh
; X86-NEXT: movsbl %bh, %esi
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp)
; X86-NEXT: setl %ch
; X86-NEXT: setg %bl
; X86-NEXT: subb %ch, %bl
; X86-NEXT: movsbl %bl, %edi
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %ch
; X86-NEXT: subb %cl, %ch
@@ -869,90 +869,90 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: cmpb %bh, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: cmpb %dh, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: cmpb %ah, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %bh
; X86-NEXT: subb %al, %bh
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %bl
; X86-NEXT: subb %al, %bl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dh
; X86-NEXT: subb %al, %dh
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
; X86-NEXT: movsbl %dl, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
; X86-NEXT: movsbl %dl, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
; X86-NEXT: movsbl %dl, %ebp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
; X86-NEXT: movsbl %dl, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %ah
; X86-NEXT: subb %al, %ah
; X86-NEXT: movsbl %ah, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
@@ -999,154 +999,179 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
; SSE2-LABEL: scmp_wide_vec_op:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE2-NEXT: movq %xmm8, %rax
; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
-; SSE2-NEXT: movq %xmm7, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %xmm7, %rcx
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: movq %xmm8, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: movq %xmm6, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; SSE2-NEXT: movq %xmm6, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
+; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT: movq %xmm5, %rcx
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm3, %rcx
; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: setl %al
-; SSE2-NEXT: setg %dl
-; SSE2-NEXT: subb %al, %dl
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: movzbl %dl, %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
+; SSE2-NEXT: movq %xmm1, %rcx
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: setl %al
; SSE2-NEXT: setg %cl
; SSE2-NEXT: subb %al, %cl
; SSE2-NEXT: movzbl %cl, %eax
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: setl %al
+; SSE2-NEXT: setg %cl
+; SSE2-NEXT: subb %al, %cl
+; SSE2-NEXT: movzbl %cl, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE2-NEXT: retq
;
; SSE4-LABEL: scmp_wide_vec_op:
; SSE4: # %bb.0:
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15
; SSE4-NEXT: pextrq $1, %xmm0, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm15, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: movq %xmm0, %rcx
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT: movq %xmm15, %rdx
+; SSE4-NEXT: cmpq %rdx, %rcx
; SSE4-NEXT: setl %cl
; SSE4-NEXT: setg %dl
; SSE4-NEXT: subb %cl, %dl
@@ -1154,98 +1179,112 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
; SSE4-NEXT: movd %ecx, %xmm0
; SSE4-NEXT: pinsrb $1, %eax, %xmm0
; SSE4-NEXT: movq %xmm1, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm14, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $2, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm1, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm14, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $3, %eax, %xmm0
; SSE4-NEXT: movq %xmm2, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm13, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $4, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm2, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm13, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $5, %eax, %xmm0
; SSE4-NEXT: movq %xmm3, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm12, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $6, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm3, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm12, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $7, %eax, %xmm0
; SSE4-NEXT: movq %xmm4, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm11, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $8, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm4, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm11, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $9, %eax, %xmm0
; SSE4-NEXT: movq %xmm5, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm10, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $10, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm5, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm10, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $11, %eax, %xmm0
; SSE4-NEXT: movq %xmm6, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm9, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $12, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm6, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm9, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $13, %eax, %xmm0
; SSE4-NEXT: movq %xmm7, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq %xmm8, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
; SSE4-NEXT: movzbl %cl, %eax
; SSE4-NEXT: pinsrb $14, %eax, %xmm0
; SSE4-NEXT: pextrq $1, %xmm7, %rax
-; SSE4-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: pextrq $1, %xmm8, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
; SSE4-NEXT: setl %al
; SSE4-NEXT: setg %cl
; SSE4-NEXT: subb %al, %cl
@@ -1767,58 +1806,71 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT: addb %dil, %dil
+; SSE2-NEXT: sarb %dil
+; SSE2-NEXT: addb %sil, %sil
+; SSE2-NEXT: sarb %sil
+; SSE2-NEXT: cmpb %dil, %sil
+; SSE2-NEXT: setl %sil
+; SSE2-NEXT: setg %dil
+; SSE2-NEXT: subb %sil, %dil
+; SSE2-NEXT: movsbq %dil, %rdi
+; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: sarq $63, %rdi
+; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: addb %bl, %bl
+; SSE2-NEXT: sarb %bl
+; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: addb %sil, %sil
+; SSE2-NEXT: sarb %sil
+; SSE2-NEXT: cmpb %bl, %sil
+; SSE2-NEXT: setl %sil
+; SSE2-NEXT: setg %bl
+; SSE2-NEXT: subb %sil, %bl
+; SSE2-NEXT: movsbq %bl, %rbx
+; SSE2-NEXT: movq %rbx, %r14
+; SSE2-NEXT: sarq $63, %r14
; SSE2-NEXT: addb %r15b, %r15b
; SSE2-NEXT: sarb %r15b
+; SSE2-NEXT: movl {{[0-9]+}}(%rsp), %esi
; SSE2-NEXT: addb %sil, %sil
; SSE2-NEXT: sarb %sil
; SSE2-NEXT: cmpb %r15b, %sil
; SSE2-NEXT: setl %sil
; SSE2-NEXT: setg %r15b
; SSE2-NEXT: subb %sil, %r15b
-; SSE2-NEXT: movsbq %r15b, %rsi
-; SSE2-NEXT: movq %rsi, (%rax)
-; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: addb %r14b, %r14b
-; SSE2-NEXT: sarb %r14b
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
-; SSE2-NEXT: addb %r15b, %r15b
-; SSE2-NEXT: sarb %r15b
-; SSE2-NEXT: cmpb %r14b, %r15b
-; SSE2-NEXT: setl %r14b
-; SSE2-NEXT: setg %r15b
-; SSE2-NEXT: subb %r14b, %r15b
-; SSE2-NEXT: movsbq %r15b, %r14
-; SSE2-NEXT: movq %r14, %r15
-; SSE2-NEXT: sarq $63, %r15
-; SSE2-NEXT: addb %bpl, %bpl
-; SSE2-NEXT: sarb %bpl
+; SSE2-NEXT: movsbq %r15b, %r15
+; SSE2-NEXT: movq %r15, %r13
+; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: addb %r12b, %r12b
+; SSE2-NEXT: sarb %r12b
; SSE2-NEXT: addb %dl, %dl
; SSE2-NEXT: sarb %dl
-; SSE2-NEXT: cmpb %bpl, %dl
+; SSE2-NEXT: cmpb %r12b, %dl
; SSE2-NEXT: setl %dl
-; SSE2-NEXT: setg %bpl
-; SSE2-NEXT: subb %dl, %bpl
-; SSE2-NEXT: movsbq %bpl, %rdx
-; SSE2-NEXT: movq %rdx, %r12
-; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: addb %bl, %bl
-; SSE2-NEXT: sarb %bl
+; SSE2-NEXT: setg %sil
+; SSE2-NEXT: subb %dl, %sil
+; SSE2-NEXT: movsbq %sil, %rdx
+; SSE2-NEXT: movq %rdx, %rdi
+; SSE2-NEXT: sarq $63, %rdi
+; SSE2-NEXT: addb %bpl, %bpl
+; SSE2-NEXT: sarb %bpl
; SSE2-NEXT: addb %cl, %cl
; SSE2-NEXT: sarb %cl
-; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: cmpb %bpl, %cl
; SSE2-NEXT: setl %cl
-; SSE2-NEXT: setg %bl
-; SSE2-NEXT: subb %cl, %bl
-; SSE2-NEXT: movsbq %bl, %rbx
-; SSE2-NEXT: movq %rbx, %rcx
+; SSE2-NEXT: setg %bpl
+; SSE2-NEXT: subb %cl, %bpl
+; SSE2-NEXT: movsbq %bpl, %r12
+; SSE2-NEXT: movq %r12, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: addb %r11b, %r11b
; SSE2-NEXT: sarb %r11b
@@ -1828,9 +1880,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: setl %r8b
; SSE2-NEXT: setg %r11b
; SSE2-NEXT: subb %r8b, %r11b
-; SSE2-NEXT: movsbq %r11b, %r8
-; SSE2-NEXT: movq %r8, %r11
-; SSE2-NEXT: sarq $63, %r11
+; SSE2-NEXT: movsbq %r11b, %rsi
+; SSE2-NEXT: movq %rsi, %r8
+; SSE2-NEXT: sarq $63, %r8
; SSE2-NEXT: addb %r10b, %r10b
; SSE2-NEXT: sarb %r10b
; SSE2-NEXT: addb %r9b, %r9b
@@ -1842,68 +1894,59 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: movsbq %r10b, %r9
; SSE2-NEXT: movq %r9, %r10
; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: addb %dil, %dil
-; SSE2-NEXT: sarb %dil
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: addb %bpl, %bpl
-; SSE2-NEXT: sarb %bpl
-; SSE2-NEXT: cmpb %dil, %bpl
-; SSE2-NEXT: setl %dil
-; SSE2-NEXT: setg %bpl
-; SSE2-NEXT: subb %dil, %bpl
-; SSE2-NEXT: movsbq %bpl, %rdi
-; SSE2-NEXT: movq %rdi, %r13
-; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: movq %r10, %rbp
+; SSE2-NEXT: shldq $20, %r9, %rbp
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero
+; SSE2-NEXT: movq %r8, %r11
+; SSE2-NEXT: shldq $31, %rsi, %r11
+; SSE2-NEXT: movq %rbp, 64(%rax)
+; SSE2-NEXT: movq %rcx, %rbp
+; SSE2-NEXT: shldq $42, %r12, %rbp
+; SSE2-NEXT: movq %r11, 48(%rax)
+; SSE2-NEXT: movq %rbp, 32(%rax)
+; SSE2-NEXT: movabsq $9007199254738944, %r11 # imm = 0x1FFFFFFFFFF800
+; SSE2-NEXT: andq %rdi, %r11
+; SSE2-NEXT: shldq $53, %rdx, %rdi
+; SSE2-NEXT: movq %rdi, 16(%rax)
; SSE2-NEXT: movl %r13d, 96(%rax)
-; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF
-; SSE2-NEXT: andq %r13, %rbp
-; SSE2-NEXT: shldq $62, %rdi, %r13
+; SSE2-NEXT: movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF
+; SSE2-NEXT: andq %r13, %rdi
+; SSE2-NEXT: shldq $62, %r15, %r13
; SSE2-NEXT: movq %r13, 88(%rax)
-; SSE2-NEXT: movq %r10, %r13
-; SSE2-NEXT: shldq $20, %r9, %r13
-; SSE2-NEXT: movq %r13, 64(%rax)
-; SSE2-NEXT: movq %r11, %r13
-; SSE2-NEXT: shldq $31, %r8, %r13
-; SSE2-NEXT: movq %r13, 48(%rax)
-; SSE2-NEXT: movq %rcx, %r13
-; SSE2-NEXT: shldq $42, %rbx, %r13
-; SSE2-NEXT: movq %r13, 32(%rax)
-; SSE2-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; SSE2-NEXT: andq %r12, %r13
-; SSE2-NEXT: shldq $53, %rdx, %r12
-; SSE2-NEXT: movq %r12, 16(%rax)
-; SSE2-NEXT: movq %rbp, %r12
-; SSE2-NEXT: shrq $48, %r12
-; SSE2-NEXT: movb %r12b, 102(%rax)
-; SSE2-NEXT: shrq $32, %rbp
-; SSE2-NEXT: movw %bp, 100(%rax)
-; SSE2-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF
-; SSE2-NEXT: andq %r12, %r15
-; SSE2-NEXT: shldq $9, %r14, %r15
-; SSE2-NEXT: shlq $62, %rdi
-; SSE2-NEXT: orq %r15, %rdi
-; SSE2-NEXT: movq %rdi, 80(%rax)
-; SSE2-NEXT: shlq $42, %rbx
-; SSE2-NEXT: shrq $11, %r13
-; SSE2-NEXT: orq %rbx, %r13
-; SSE2-NEXT: movq %r13, 24(%rax)
-; SSE2-NEXT: shlq $9, %r14
+; SSE2-NEXT: shlq $42, %r12
+; SSE2-NEXT: shrq $11, %r11
+; SSE2-NEXT: orq %r12, %r11
+; SSE2-NEXT: movq %r11, 24(%rax)
+; SSE2-NEXT: movq %rdi, %r11
+; SSE2-NEXT: shrq $48, %r11
+; SSE2-NEXT: movb %r11b, 102(%rax)
+; SSE2-NEXT: shrq $32, %rdi
+; SSE2-NEXT: movw %di, 100(%rax)
+; SSE2-NEXT: movabsq $9007199254740991, %rdi # imm = 0x1FFFFFFFFFFFFF
+; SSE2-NEXT: andq %rdi, %r14
+; SSE2-NEXT: shldq $9, %rbx, %r14
+; SSE2-NEXT: shlq $62, %r15
+; SSE2-NEXT: orq %r14, %r15
+; SSE2-NEXT: movq %r15, 80(%rax)
+; SSE2-NEXT: shlq $9, %rbx
; SSE2-NEXT: andl $511, %r10d # imm = 0x1FF
-; SSE2-NEXT: orq %r14, %r10
+; SSE2-NEXT: orq %rbx, %r10
; SSE2-NEXT: movq %r10, 72(%rax)
; SSE2-NEXT: shlq $20, %r9
-; SSE2-NEXT: andl $1048575, %r11d # imm = 0xFFFFF
-; SSE2-NEXT: orq %r9, %r11
-; SSE2-NEXT: movq %r11, 56(%rax)
-; SSE2-NEXT: shlq $31, %r8
+; SSE2-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
+; SSE2-NEXT: orq %r9, %r8
+; SSE2-NEXT: movq %r8, 56(%rax)
+; SSE2-NEXT: shlq $31, %rsi
; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: movq %rcx, 40(%rax)
-; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
+; SSE2-NEXT: # xmm1 = mem[0],zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: andq %r12, %rcx
+; SSE2-NEXT: andq %rdi, %rcx
; SSE2-NEXT: shlq $53, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, 8(%rax)
@@ -1924,140 +1967,143 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE4-NEXT: pushq %r12
; SSE4-NEXT: pushq %rbx
; SSE4-NEXT: movq %rdi, %rax
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE4-NEXT: addb %dil, %dil
+; SSE4-NEXT: sarb %dil
+; SSE4-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; SSE4-NEXT: addb %r10b, %r10b
+; SSE4-NEXT: sarb %r10b
+; SSE4-NEXT: cmpb %dil, %r10b
+; SSE4-NEXT: setl %dil
+; SSE4-NEXT: setg %r10b
+; SSE4-NEXT: subb %dil, %r10b
+; SSE4-NEXT: movsbq %r10b, %r13
+; SSE4-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT: sarq $63, %r13
+; SSE4-NEXT: addb %r11b, %r11b
+; SSE4-NEXT: sarb %r11b
+; SSE4-NEXT: addb %sil, %sil
+; SSE4-NEXT: sarb %sil
+; SSE4-NEXT: cmpb %r11b, %sil
+; SSE4-NEXT: setl %sil
+; SSE4-NEXT: setg %r11b
+; SSE4-NEXT: subb %sil, %r11b
+; SSE4-NEXT: movsbq %r11b, %r11
+; SSE4-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT: sarq $63, %r11
; SSE4-NEXT: addb %r14b, %r14b
; SSE4-NEXT: sarb %r14b
+; SSE4-NEXT: movl {{[0-9]+}}(%rsp), %esi
; SSE4-NEXT: addb %sil, %sil
; SSE4-NEXT: sarb %sil
; SSE4-NEXT: cmpb %r14b, %sil
; SSE4-NEXT: setl %sil
; SSE4-NEXT: setg %r14b
; SSE4-NEXT: subb %sil, %r14b
-; SSE4-NEXT: movsbq %r14b, %r14
-; SSE4-NEXT: movq %r14, (%rax)
+; SSE4-NEXT: movsbq %r14b, %rsi
+; SSE4-NEXT: movq %rsi, %r14
; SSE4-NEXT: sarq $63, %r14
; SSE4-NEXT: addb %r15b, %r15b
; SSE4-NEXT: sarb %r15b
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: addb %sil, %sil
-; SSE4-NEXT: sarb %sil
-; SSE4-NEXT: cmpb %r15b, %sil
-; SSE4-NEXT: setl %sil
-; SSE4-NEXT: setg %r15b
-; SSE4-NEXT: subb %sil, %r15b
-; SSE4-NEXT: movsbq %r15b, %rsi
-; SSE4-NEXT: movq %rsi, %r15
-; SSE4-NEXT: sarq $63, %r15
-; SSE4-NEXT: addb %bpl, %bpl
-; SSE4-NEXT: sarb %bpl
; SSE4-NEXT: addb %dl, %dl
; SSE4-NEXT: sarb %dl
-; SSE4-NEXT: cmpb %bpl, %dl
+; SSE4-NEXT: cmpb %r15b, %dl
; SSE4-NEXT: setl %dl
-; SSE4-NEXT: setg %bpl
-; SSE4-NEXT: subb %dl, %bpl
-; SSE4-NEXT: movsbq %bpl, %r12
-; SSE4-NEXT: movq %r12, %r13
-; SSE4-NEXT: sarq $63, %r13
-; SSE4-NEXT: addb %bl, %bl
-; SSE4-NEXT: sarb %bl
+; SSE4-NEXT: setg %r15b
+; SSE4-NEXT: subb %dl, %r15b
+; SSE4-NEXT: movsbq %r15b, %r15
+; SSE4-NEXT: movq %r15, %rdi
+; SSE4-NEXT: sarq $63, %rdi
+; SSE4-NEXT: addb %r12b, %r12b
+; SSE4-NEXT: sarb %r12b
; SSE4-NEXT: addb %cl, %cl
; SSE4-NEXT: sarb %cl
-; SSE4-NEXT: cmpb %bl, %cl
+; SSE4-NEXT: cmpb %r12b, %cl
; SSE4-NEXT: setl %cl
-; SSE4-NEXT: setg %dl
-; SSE4-NEXT: subb %cl, %dl
-; SSE4-NEXT: movsbq %dl, %rbx
-; SSE4-NEXT: movq %rbx, %rcx
+; SSE4-NEXT: setg %r12b
+; SSE4-NEXT: subb %cl, %r12b
+; SSE4-NEXT: movsbq %r12b, %r12
+; SSE4-NEXT: movq %r12, %rcx
; SSE4-NEXT: sarq $63, %rcx
-; SSE4-NEXT: addb %r11b, %r11b
-; SSE4-NEXT: sarb %r11b
+; SSE4-NEXT: addb %bpl, %bpl
+; SSE4-NEXT: sarb %bpl
; SSE4-NEXT: addb %r8b, %r8b
; SSE4-NEXT: sarb %r8b
-; SSE4-NEXT: cmpb %r11b, %r8b
-; SSE4-NEXT: setl %dl
-; SSE4-NEXT: setg %r8b
-; SSE4-NEXT: subb %dl, %r8b
-; SSE4-NEXT: movsbq %r8b, %rdx
-; SSE4-NEXT: movq %rdx, %r8
-; SSE4-NEXT: sarq $63, %r8
-; SSE4-NEXT: addb %r10b, %r10b
-; SSE4-NEXT: sarb %r10b
+; SSE4-NEXT: cmpb %bpl, %r8b
+; SSE4-NEXT: setl %r8b
+; SSE4-NEXT: setg %bpl
+; SSE4-NEXT: subb %r8b, %bpl
+; SSE4-NEXT: movsbq %bpl, %r10
+; SSE4-NEXT: movq %r10, %rbp
+; SSE4-NEXT: sarq $63, %rbp
+; SSE4-NEXT: addb %bl, %bl
+; SSE4-NEXT: sarb %bl
; SSE4-NEXT: addb %r9b, %r9b
; SSE4-NEXT: sarb %r9b
-; SSE4-NEXT: cmpb %r10b, %r9b
+; SSE4-NEXT: cmpb %bl, %r9b
; SSE4-NEXT: setl %r9b
-; SSE4-NEXT: setg %r10b
-; SSE4-NEXT: subb %r9b, %r10b
-; SSE4-NEXT: movsbq %r10b, %r9
-; SSE4-NEXT: movq %r9, %r10
-; SSE4-NEXT: sarq $63, %r10
-; SSE4-NEXT: addb %dil, %dil
-; SSE4-NEXT: sarb %dil
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE4-NEXT: addb %r11b, %r11b
-; SSE4-NEXT: sarb %r11b
-; SSE4-NEXT: cmpb %dil, %r11b
-; SSE4-NEXT: setl %dil
-; SSE4-NEXT: setg %r11b
-; SSE4-NEXT: subb %dil, %r11b
-; SSE4-NEXT: movsbq %r11b, %rdi
-; SSE4-NEXT: movq %rdi, %rbp
-; SSE4-NEXT: sarq $63, %rbp
-; SSE4-NEXT: movl %ebp, 96(%rax)
-; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF
-; SSE4-NEXT: andq %rbp, %r11
-; SSE4-NEXT: shldq $62, %rdi, %rbp
-; SSE4-NEXT: movq %rbp, 88(%rax)
-; SSE4-NEXT: movq %r10, %rbp
-; SSE4-NEXT: shldq $20, %r9, %rbp
-; SSE4-NEXT: movq %rbp, 64(%rax)
-; SSE4-NEXT: movq %r8, %rbp
-; SSE4-NEXT: shldq $31, %rdx, %rbp
-; SSE4-NEXT: movq %rbp, 48(%rax)
-; SSE4-NEXT: movq %rcx, %rbp
-; SSE4-NEXT: shldq $42, %rbx, %rbp
-; SSE4-NEXT: movq %rbp, 32(%rax)
-; SSE4-NEXT: movabsq $9007199254738944, %rbp # imm = 0x1FFFFFFFFFF800
-; SSE4-NEXT: andq %r13, %rbp
-; SSE4-NEXT: shldq $53, %r12, %r13
-; SSE4-NEXT: movq %r13, 16(%rax)
-; SSE4-NEXT: movq %r11, %r13
-; SSE4-NEXT: shrq $48, %r13
-; SSE4-NEXT: movb %r13b, 102(%rax)
-; SSE4-NEXT: shrq $32, %r11
-; SSE4-NEXT: movw %r11w, 100(%rax)
-; SSE4-NEXT: movabsq $9007199254740991, %r11 # imm = 0x1FFFFFFFFFFFFF
-; SSE4-NEXT: andq %r11, %r15
-; SSE4-NEXT: shldq $9, %rsi, %r15
-; SSE4-NEXT: shlq $62, %rdi
-; SSE4-NEXT: orq %r15, %rdi
-; SSE4-NEXT: movq %rdi, 80(%rax)
-; SSE4-NEXT: andq %r11, %r14
-; SSE4-NEXT: shlq $53, %r12
-; SSE4-NEXT: orq %r14, %r12
-; SSE4-NEXT: movq %r12, 8(%rax)
-; SSE4-NEXT: shlq $42, %rbx
-; SSE4-NEXT: shrq $11, %rbp
-; SSE4-NEXT: orq %rbx, %rbp
-; SSE4-NEXT: movq %rbp, 24(%rax)
-; SSE4-NEXT: shlq $9, %rsi
-; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF
-; SSE4-NEXT: orq %rsi, %r10
-; SSE4-NEXT: movq %r10, 72(%rax)
-; SSE4-NEXT: shlq $20, %r9
-; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
-; SSE4-NEXT: orq %r9, %r8
-; SSE4-NEXT: movq %r8, 56(%rax)
-; SSE4-NEXT: shlq $31, %rdx
+; SSE4-NEXT: setg %bl
+; SSE4-NEXT: subb %r9b, %bl
+; SSE4-NEXT: movsbq %bl, %rdx
+; SSE4-NEXT: movq %rdx, %r9
+; SSE4-NEXT: sarq $63, %r9
+; SSE4-NEXT: movq %r9, %rbx
+; SSE4-NEXT: shldq $20, %rdx, %rbx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT: movq %r8, (%rax)
+; SSE4-NEXT: movq %rbp, %r8
+; SSE4-NEXT: shldq $31, %r10, %r8
+; SSE4-NEXT: movq %rbx, 64(%rax)
+; SSE4-NEXT: movq %rcx, %rbx
+; SSE4-NEXT: shldq $42, %r12, %rbx
+; SSE4-NEXT: movq %r8, 48(%rax)
+; SSE4-NEXT: movq %rbx, 32(%rax)
+; SSE4-NEXT: movabsq $9007199254738944, %r8 # imm = 0x1FFFFFFFFFF800
+; SSE4-NEXT: andq %rdi, %r8
+; SSE4-NEXT: shldq $53, %r15, %rdi
+; SSE4-NEXT: movq %rdi, 16(%rax)
+; SSE4-NEXT: movl %r14d, 96(%rax)
+; SSE4-NEXT: movabsq $2251799813685247, %rdi # imm = 0x7FFFFFFFFFFFF
+; SSE4-NEXT: andq %r14, %rdi
+; SSE4-NEXT: shldq $62, %rsi, %r14
+; SSE4-NEXT: movq %r14, 88(%rax)
+; SSE4-NEXT: movabsq $9007199254740991, %rbx # imm = 0x1FFFFFFFFFFFFF
+; SSE4-NEXT: andq %rbx, %r11
+; SSE4-NEXT: shlq $53, %r15
+; SSE4-NEXT: orq %r11, %r15
+; SSE4-NEXT: movq %r15, 8(%rax)
+; SSE4-NEXT: shlq $42, %r12
+; SSE4-NEXT: shrq $11, %r8
+; SSE4-NEXT: orq %r12, %r8
+; SSE4-NEXT: movq %r8, 24(%rax)
+; SSE4-NEXT: movq %rdi, %r8
+; SSE4-NEXT: shrq $48, %r8
+; SSE4-NEXT: movb %r8b, 102(%rax)
+; SSE4-NEXT: shrq $32, %rdi
+; SSE4-NEXT: movw %di, 100(%rax)
+; SSE4-NEXT: andq %rbx, %r13
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE4-NEXT: shldq $9, %rdi, %r13
+; SSE4-NEXT: shlq $62, %rsi
+; SSE4-NEXT: orq %r13, %rsi
+; SSE4-NEXT: movq %rsi, 80(%rax)
+; SSE4-NEXT: shlq $9, %rdi
+; SSE4-NEXT: andl $511, %r9d # imm = 0x1FF
+; SSE4-NEXT: orq %rdi, %r9
+; SSE4-NEXT: movq %r9, 72(%rax)
+; SSE4-NEXT: shlq $20, %rdx
+; SSE4-NEXT: andl $1048575, %ebp # imm = 0xFFFFF
+; SSE4-NEXT: orq %rdx, %rbp
+; SSE4-NEXT: movq %rbp, 56(%rax)
+; SSE4-NEXT: shlq $31, %r10
; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; SSE4-NEXT: orq %rdx, %rcx
+; SSE4-NEXT: orq %r10, %rcx
; SSE4-NEXT: movq %rcx, 40(%rax)
; SSE4-NEXT: popq %rbx
; SSE4-NEXT: popq %r12
@@ -2076,132 +2122,132 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rdi, %rax
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
-; AVX-NEXT: addb %r14b, %r14b
-; AVX-NEXT: sarb %r14b
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX-NEXT: addb %dil, %dil
+; AVX-NEXT: sarb %dil
+; AVX-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: addb %r10b, %r10b
+; AVX-NEXT: sarb %r10b
+; AVX-NEXT: cmpb %dil, %r10b
+; AVX-NEXT: setl %dil
+; AVX-NEXT: setg %r10b
+; AVX-NEXT: subb %dil, %r10b
+; AVX-NEXT: movsbq %r10b, %rdi
+; AVX-NEXT: movq %rdi, %r10
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: addb %bpl, %bpl
+; AVX-NEXT: sarb %bpl
; AVX-NEXT: addb %sil, %sil
; AVX-NEXT: sarb %sil
-; AVX-NEXT: cmpb %r14b, %sil
+; AVX-NEXT: cmpb %bpl, %sil
; AVX-NEXT: setl %sil
-; AVX-NEXT: setg %r14b
-; AVX-NEXT: subb %sil, %r14b
-; AVX-NEXT: movsbq %r14b, %r14
-; AVX-NEXT: movq %r14, (%rax)
-; AVX-NEXT: sarq $63, %r14
+; AVX-NEXT: setg %bpl
+; AVX-NEXT: subb %sil, %bpl
+; AVX-NEXT: movsbq %bpl, %r12
+; AVX-NEXT: movq %r12, (%rax)
+; AVX-NEXT: sarq $63, %r12
; AVX-NEXT: addb %r15b, %r15b
; AVX-NEXT: sarb %r15b
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX-NEXT: movl {{[0-9]+}}(%rsp), %esi
; AVX-NEXT: addb %sil, %sil
; AVX-NEXT: sarb %sil
; AVX-NEXT: cmpb %r15b, %sil
; AVX-NEXT: setl %sil
-; AVX-NEXT: setg %r15b
-; AVX-NEXT: subb %sil, %r15b
-; AVX-NEXT: movsbq %r15b, %rsi
-; AVX-NEXT: movq %rsi, %r12
-; AVX-NEXT: sarq $63, %r12
-; AVX-NEXT: addb %bpl, %bpl
-; AVX-NEXT: sarb %bpl
+; AVX-NEXT: setg %bpl
+; AVX-NEXT: subb %sil, %bpl
+; AVX-NEXT: movsbq %bpl, %rsi
+; AVX-NEXT: movq %rsi, %r15
+; AVX-NEXT: sarq $63, %r15
+; AVX-NEXT: addb %r14b, %r14b
+; AVX-NEXT: sarb %r14b
; AVX-NEXT: addb %dl, %dl
; AVX-NEXT: sarb %dl
-; AVX-NEXT: cmpb %bpl, %dl
+; AVX-NEXT: cmpb %r14b, %dl
; AVX-NEXT: setl %dl
; AVX-NEXT: setg %bpl
; AVX-NEXT: subb %dl, %bpl
-; AVX-NEXT: movsbq %bpl, %r15
-; AVX-NEXT: movq %r15, %r13
-; AVX-NEXT: sarq $63, %r13
-; AVX-NEXT: addb %bl, %bl
-; AVX-NEXT: sarb %bl
+; AVX-NEXT: movsbq %bpl, %r14
+; AVX-NEXT: movq %r14, %rbp
+; AVX-NEXT: sarq $63, %rbp
+; AVX-NEXT: addb %r13b, %r13b
+; AVX-NEXT: sarb %r13b
; AVX-NEXT: addb %cl, %cl
; AVX-NEXT: sarb %cl
-; AVX-NEXT: cmpb %bl, %cl
+; AVX-NEXT: cmpb %r13b, %cl
; AVX-NEXT: setl %cl
; AVX-NEXT: setg %dl
; AVX-NEXT: subb %cl, %dl
-; AVX-NEXT: movsbq %dl, %rbx
-; AVX-NEXT: movq %rbx, %rcx
+; AVX-NEXT: movsbq %dl, %r13
+; AVX-NEXT: movq %r13, %rcx
; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: addb %r11b, %r11b
-; AVX-NEXT: sarb %r11b
+; AVX-NEXT: addb %bl, %bl
+; AVX-NEXT: sarb %bl
; AVX-NEXT: addb %r8b, %r8b
; AVX-NEXT: sarb %r8b
-; AVX-NEXT: cmpb %r11b, %r8b
+; AVX-NEXT: cmpb %bl, %r8b
; AVX-NEXT: setl %dl
; AVX-NEXT: setg %r8b
; AVX-NEXT: subb %dl, %r8b
; AVX-NEXT: movsbq %r8b, %rdx
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: sarq $63, %r8
-; AVX-NEXT: addb %r10b, %r10b
-; AVX-NEXT: sarb %r10b
+; AVX-NEXT: addb %r11b, %r11b
+; AVX-NEXT: sarb %r11b
; AVX-NEXT: addb %r9b, %r9b
; AVX-NEXT: sarb %r9b
-; AVX-NEXT: cmpb %r10b, %r9b
+; AVX-NEXT: cmpb %r11b, %r9b
; AVX-NEXT: setl %r9b
-; AVX-NEXT: setg %r10b
-; AVX-NEXT: subb %r9b, %r10b
-; AVX-NEXT: movsbq %r10b, %r9
-; AVX-NEXT: movq %r9, %r10
-; AVX-NEXT: sarq $63, %r10
-; AVX-NEXT: addb %dil, %dil
-; AVX-NEXT: sarb %dil
-; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; AVX-NEXT: addb %r11b, %r11b
-; AVX-NEXT: sarb %r11b
-; AVX-NEXT: cmpb %dil, %r11b
-; AVX-NEXT: setl %dil
; AVX-NEXT: setg %r11b
-; AVX-NEXT: subb %dil, %r11b
-; AVX-NEXT: movsbq %r11b, %rdi
-; AVX-NEXT: movq %rdi, %rbp
-; AVX-NEXT: sarq $63, %rbp
-; AVX-NEXT: movl %ebp, 96(%rax)
-; AVX-NEXT: movb $51, %r11b
-; AVX-NEXT: bzhiq %r11, %rbp, %r11
-; AVX-NEXT: shldq $62, %rdi, %rbp
-; AVX-NEXT: movq %rbp, 88(%rax)
-; AVX-NEXT: movq %r10, %rbp
-; AVX-NEXT: shldq $20, %r9, %rbp
-; AVX-NEXT: movq %rbp, 64(%rax)
-; AVX-NEXT: movq %r8, %rbp
-; AVX-NEXT: shldq $31, %rdx, %rbp
-; AVX-NEXT: movq %rbp, 48(%rax)
-; AVX-NEXT: movq %rcx, %rbp
-; AVX-NEXT: shldq $42, %rbx, %rbp
-; AVX-NEXT: movq %rbp, 32(%rax)
-; AVX-NEXT: movb $42, %bpl
-; AVX-NEXT: bzhiq %rbp, %r13, %rbp
-; AVX-NEXT: shldq $53, %r15, %r13
-; AVX-NEXT: movq %r13, 16(%rax)
-; AVX-NEXT: movq %r11, %r13
-; AVX-NEXT: shrq $48, %r13
-; AVX-NEXT: movb %r13b, 102(%rax)
-; AVX-NEXT: shrq $32, %r11
-; AVX-NEXT: movw %r11w, 100(%rax)
-; AVX-NEXT: movb $53, %r11b
-; AVX-NEXT: bzhiq %r11, %r12, %r12
-; AVX-NEXT: shldq $9, %rsi, %r12
-; AVX-NEXT: shlq $62, %rdi
-; AVX-NEXT: orq %r12, %rdi
-; AVX-NEXT: movq %rdi, 80(%rax)
-; AVX-NEXT: shlq $42, %rbx
-; AVX-NEXT: orq %rbp, %rbx
-; AVX-NEXT: movq %rbx, 24(%rax)
-; AVX-NEXT: bzhiq %r11, %r14, %rdi
-; AVX-NEXT: shlq $53, %r15
-; AVX-NEXT: orq %rdi, %r15
-; AVX-NEXT: movq %r15, 8(%rax)
-; AVX-NEXT: shlq $9, %rsi
-; AVX-NEXT: andl $511, %r10d # imm = 0x1FF
-; AVX-NEXT: orq %rsi, %r10
-; AVX-NEXT: movq %r10, 72(%rax)
+; AVX-NEXT: subb %r9b, %r11b
+; AVX-NEXT: movsbq %r11b, %r9
+; AVX-NEXT: movq %r9, %r11
+; AVX-NEXT: sarq $63, %r11
+; AVX-NEXT: movq %r11, %rbx
+; AVX-NEXT: shldq $20, %r9, %rbx
+; AVX-NEXT: movq %rbx, 64(%rax)
+; AVX-NEXT: movq %r8, %rbx
+; AVX-NEXT: shldq $31, %rdx, %rbx
+; AVX-NEXT: movq %rbx, 48(%rax)
+; AVX-NEXT: movq %rcx, %rbx
+; AVX-NEXT: shldq $42, %r13, %rbx
+; AVX-NEXT: movq %rbx, 32(%rax)
+; AVX-NEXT: movb $42, %bl
+; AVX-NEXT: bzhiq %rbx, %rbp, %rbx
+; AVX-NEXT: shldq $53, %r14, %rbp
+; AVX-NEXT: movq %rbp, 16(%rax)
+; AVX-NEXT: movl %r15d, 96(%rax)
+; AVX-NEXT: movb $51, %bpl
+; AVX-NEXT: bzhiq %rbp, %r15, %rbp
+; AVX-NEXT: shldq $62, %rsi, %r15
+; AVX-NEXT: movq %r15, 88(%rax)
+; AVX-NEXT: shlq $42, %r13
+; AVX-NEXT: orq %rbx, %r13
+; AVX-NEXT: movq %r13, 24(%rax)
+; AVX-NEXT: movb $53, %bl
+; AVX-NEXT: bzhiq %rbx, %r12, %r15
+; AVX-NEXT: shlq $53, %r14
+; AVX-NEXT: orq %r15, %r14
+; AVX-NEXT: movq %r14, 8(%rax)
+; AVX-NEXT: movq %rbp, %r14
+; AVX-NEXT: shrq $48, %r14
+; AVX-NEXT: movb %r14b, 102(%rax)
+; AVX-NEXT: shrq $32, %rbp
+; AVX-NEXT: movw %bp, 100(%rax)
+; AVX-NEXT: bzhiq %rbx, %r10, %r10
+; AVX-NEXT: shldq $9, %rdi, %r10
+; AVX-NEXT: shlq $62, %rsi
+; AVX-NEXT: orq %r10, %rsi
+; AVX-NEXT: movq %rsi, 80(%rax)
+; AVX-NEXT: shlq $9, %rdi
+; AVX-NEXT: andl $511, %r11d # imm = 0x1FF
+; AVX-NEXT: orq %rdi, %r11
+; AVX-NEXT: movq %r11, 72(%rax)
; AVX-NEXT: shlq $20, %r9
; AVX-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
; AVX-NEXT: orq %r9, %r8
@@ -2671,12 +2717,12 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: setl %al
; X86-NEXT: setg %dl
; X86-NEXT: subb %al, %dl
; X86-NEXT: movsbl %dl, %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp)
; X86-NEXT: setl %cl
; X86-NEXT: setg %dl
; X86-NEXT: subb %cl, %dl
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 7f17299b39e33..af2275eda305f 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -16,7 +16,7 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
; X86-LABEL: ucmp.8.8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: retl
@@ -35,7 +35,7 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind {
; X86-LABEL: ucmp.8.16:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: cmpw %ax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: retl
@@ -54,7 +54,7 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
; X86-LABEL: ucmp.8.32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: retl
@@ -155,7 +155,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
; X86-LABEL: ucmp.32.32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %eax
@@ -245,7 +245,7 @@ define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind {
; X86-LABEL: ucmp_narrow_result:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: retl
@@ -278,18 +278,18 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl $1073741823, %ecx # imm = 0x3FFFFFFF
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmpl %esi, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-NEXT: cmpl %ecx, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: sbbl %edx, %eax
; X86-NEXT: setb %al
-; X86-NEXT: cmpl %edi, %esi
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: cmpl %esi, %ecx
+; X86-NEXT: sbbl %edi, %edx
; X86-NEXT: sbbb $0, %al
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -313,9 +313,9 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind {
;
; X86-LABEL: ucmp_wide_result:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movsbl %cl, %ecx
@@ -366,26 +366,26 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl $8191, %ecx # imm = 0x1FFF
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $8191, %ecx # imm = 0x1FFF
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl $8191, %esi # imm = 0x1FFF
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp)
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: sbbb $0, %al
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -470,24 +470,24 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
; X86-NEXT: movsbl %dl, %edx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
; X86-NEXT: movsbl %bl, %edi
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
; X86-NEXT: movsbl %bl, %esi
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movsbl %cl, %ecx
@@ -611,27 +611,27 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: seta %cl
-; X86-NEXT: sbbb $0, %cl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: seta %ch
-; X86-NEXT: sbbb $0, %ch
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: seta %bl
-; X86-NEXT: sbbb $0, %bl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
-; X86-NEXT: movb %dl, 3(%eax)
+; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: seta %dh
+; X86-NEXT: sbbb $0, %dh
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: seta %bl
+; X86-NEXT: sbbb $0, %bl
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: seta %cl
+; X86-NEXT: sbbb $0, %cl
+; X86-NEXT: movb %cl, 3(%eax)
; X86-NEXT: movb %bl, 2(%eax)
-; X86-NEXT: movb %ch, 1(%eax)
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: movb %dh, 1(%eax)
+; X86-NEXT: movb %dl, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -691,24 +691,24 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
; X86-NEXT: movsbl %dl, %edx
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
; X86-NEXT: movsbl %bl, %esi
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp)
; X86-NEXT: seta %ch
; X86-NEXT: sbbb $0, %ch
; X86-NEXT: movsbl %ch, %edi
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movsbl %cl, %ecx
@@ -767,38 +767,44 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
;
; SSE2-LABEL: ucmp_wide_vec_result:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSE2-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pmaxud %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-NEXT: pminud %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: psubd %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE2-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
-; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pmaxud %xmm2, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pminud %xmm5, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: psubd %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
-; SSE2-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE2-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; SSE2-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pmaxud %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pminud %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: psubd %xmm5, %xmm3
; SSE2-NEXT: retq
;
; AVX2-LABEL: ucmp_wide_vec_result:
@@ -812,8 +818,10 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -842,74 +850,74 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: cmpb %cl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: cmpb %bh, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl
+; X86-NEXT: cmpb %bl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh
+; X86-NEXT: cmpb %dh, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: cmpb %ch, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: cmpb %ah, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: cmpb %dl, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, (%esp) # 1-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bh
; X86-NEXT: sbbb $0, %bh
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %ebp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT: cmpb %al, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movsbl %al, %ecx
@@ -1368,72 +1376,72 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: seta %bh
; X86-NEXT: sbbb $0, %bh
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %dh
; X86-NEXT: sbbb $0, %dh
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %ch
; X86-NEXT: sbbb $0, %ch
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1523,10 +1531,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT: movq %rax, (%rsp) # 8-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: andl $127, %eax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1554,240 +1562,240 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: andl $127, %eax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT: andl $127, %r10d
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE4-NEXT: andl $127, %ebp
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: andl $127, %eax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: andl $127, %ecx
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8
; SSE4-NEXT: andl $127, %r8d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT: andl $127, %ebx
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT: andl $127, %edx
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13
; SSE4-NEXT: andl $127, %r13d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT: andl $127, %r11d
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT: andl $127, %r10d
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT: andl $127, %edx
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT: andl $127, %r15d
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE4-NEXT: andl $127, %r14d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT: andl $127, %r12d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT: cmpq %rax, %rbp
-; SSE4-NEXT: movq %r12, %r15
-; SSE4-NEXT: sbbq %r14, %r15
-; SSE4-NEXT: setb %r15b
-; SSE4-NEXT: cmpq %rbp, %rax
-; SSE4-NEXT: sbbq %r12, %r14
-; SSE4-NEXT: sbbb $0, %r15b
-; SSE4-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT: cmpq %rax, %r14
-; SSE4-NEXT: movq %r11, %r15
-; SSE4-NEXT: sbbq %r13, %r15
-; SSE4-NEXT: setb %bpl
-; SSE4-NEXT: cmpq %r14, %rax
-; SSE4-NEXT: sbbq %r11, %r13
-; SSE4-NEXT: sbbb $0, %bpl
-; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT: cmpq %rax, %r11
-; SSE4-NEXT: movq %rdx, %r14
-; SSE4-NEXT: sbbq %rbx, %r14
-; SSE4-NEXT: setb %bpl
-; SSE4-NEXT: cmpq %r11, %rax
-; SSE4-NEXT: sbbq %rdx, %rbx
-; SSE4-NEXT: sbbb $0, %bpl
-; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: andl $127, %r11d
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT: andl $127, %ecx
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT: cmpq %rax, %rdx
-; SSE4-NEXT: movq %r8, %r11
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT: cmpq %rax, %r12
+; SSE4-NEXT: movq %rcx, %rbx
+; SSE4-NEXT: sbbq %r11, %rbx
+; SSE4-NEXT: setb %bl
+; SSE4-NEXT: cmpq %r12, %rax
; SSE4-NEXT: sbbq %rcx, %r11
+; SSE4-NEXT: sbbb $0, %bl
+; SSE4-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE4-NEXT: cmpq %rax, %rcx
+; SSE4-NEXT: movq %r14, %r11
+; SSE4-NEXT: sbbq %r15, %r11
; SSE4-NEXT: setb %r11b
-; SSE4-NEXT: cmpq %rdx, %rax
-; SSE4-NEXT: sbbq %r8, %rcx
+; SSE4-NEXT: cmpq %rcx, %rax
+; SSE4-NEXT: sbbq %r14, %r15
; SSE4-NEXT: sbbb $0, %r11b
; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: movq %r8, %rdx
-; SSE4-NEXT: sbbq %r10, %rdx
-; SSE4-NEXT: setb %dl
+; SSE4-NEXT: movq %rdx, %r11
+; SSE4-NEXT: sbbq %r10, %r11
+; SSE4-NEXT: setb %r11b
; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r8, %r10
-; SSE4-NEXT: sbbb $0, %dl
-; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: sbbq %rdx, %r10
+; SSE4-NEXT: sbbb $0, %r11b
+; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT: movq %r13, %rdx
; SSE4-NEXT: sbbq %r8, %rdx
-; SSE4-NEXT: setb %r10b
+; SSE4-NEXT: setb %dl
; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
-; SSE4-NEXT: sbbb $0, %r10b
+; SSE4-NEXT: sbbq %r13, %r8
+; SSE4-NEXT: sbbb $0, %dl
+; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rdx
+; SSE4-NEXT: movq %r8, %rdx
+; SSE4-NEXT: sbbq %rbp, %rdx
; SSE4-NEXT: setb %dl
; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
+; SSE4-NEXT: sbbq %r8, %rbp
; SSE4-NEXT: sbbb $0, %dl
; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT: movq %r10, %rdx
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; SSE4-NEXT: sbbq %r8, %rdx
-; SSE4-NEXT: setb %bpl
+; SSE4-NEXT: setb %dl
; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
-; SSE4-NEXT: sbbb $0, %bpl
+; SSE4-NEXT: sbbq %r10, %r8
+; SSE4-NEXT: sbbb $0, %dl
+; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT: movq %r10, %rdx
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; SSE4-NEXT: sbbq %r8, %rdx
; SSE4-NEXT: setb %dl
; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
+; SSE4-NEXT: sbbq %r10, %r8
; SSE4-NEXT: sbbb $0, %dl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT: movq %r14, %r8
-; SSE4-NEXT: movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT: sbbq %rbx, %r8
-; SSE4-NEXT: setb %r11b
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r14, %rbx
-; SSE4-NEXT: sbbb $0, %r11b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT: cmpq %rcx, %rdx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT: movq %r10, %rax
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT: sbbq %r8, %rax
+; SSE4-NEXT: setb %r12b
+; SSE4-NEXT: cmpq %rdx, %rcx
+; SSE4-NEXT: sbbq %r10, %r8
+; SSE4-NEXT: sbbb $0, %r12b
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT: movq %r14, %rbx
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT: cmpq %rcx, %r10
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rbx
+; SSE4-NEXT: movq %r8, %rdx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %rdx
+; SSE4-NEXT: setb %bpl
+; SSE4-NEXT: cmpq %r10, %rcx
+; SSE4-NEXT: sbbq %r8, %rax
+; SSE4-NEXT: sbbb $0, %bpl
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE4-NEXT: cmpq %r10, %r11
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT: movq %rdx, %rcx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %rcx
+; SSE4-NEXT: setb %r8b
+; SSE4-NEXT: cmpq %r11, %r10
+; SSE4-NEXT: sbbq %rdx, %rax
+; SSE4-NEXT: sbbb $0, %r8b
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT: cmpq %r11, %rbx
+; SSE4-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; SSE4-NEXT: movq %rcx, %r10
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %r10
+; SSE4-NEXT: setb %r10b
+; SSE4-NEXT: cmpq %rbx, %r11
+; SSE4-NEXT: sbbq %rcx, %rax
+; SSE4-NEXT: sbbb $0, %r10b
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT: cmpq %rbx, %r14
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: movq %rcx, %r11
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %r11
+; SSE4-NEXT: setb %r11b
+; SSE4-NEXT: cmpq %r14, %rbx
+; SSE4-NEXT: sbbq %rcx, %rax
+; SSE4-NEXT: sbbb $0, %r11b
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT: cmpq %r14, %r15
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: movq %rcx, %rbx
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %rbx
; SSE4-NEXT: setb %bl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r14, %r8
+; SSE4-NEXT: cmpq %r15, %r14
+; SSE4-NEXT: sbbq %rcx, %rax
; SSE4-NEXT: sbbb $0, %bl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT: cmpq %rax, %r14
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE4-NEXT: movq %r15, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rcx
-; SSE4-NEXT: setb %cl
-; SSE4-NEXT: cmpq %r14, %rax
-; SSE4-NEXT: sbbq %r15, %r8
-; SSE4-NEXT: sbbb $0, %cl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT: cmpq %rax, %r15
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: movq %r12, %r14
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r14
+; SSE4-NEXT: cmpq %r9, %r15
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: movq %rcx, %r14
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %r14
; SSE4-NEXT: setb %r14b
-; SSE4-NEXT: cmpq %r15, %rax
-; SSE4-NEXT: sbbq %r12, %r8
+; SSE4-NEXT: cmpq %r15, %r9
+; SSE4-NEXT: sbbq %rcx, %rax
; SSE4-NEXT: sbbb $0, %r14b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: cmpq %r9, %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: movq %r12, %r15
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r15
-; SSE4-NEXT: setb %r15b
-; SSE4-NEXT: cmpq %rax, %r9
-; SSE4-NEXT: sbbq %r12, %r8
-; SSE4-NEXT: sbbb $0, %r15b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: cmpq %r12, %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT: movq %r13, %r9
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r9
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: cmpq %rcx, %r15
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT: movq %rdx, %r9
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT: sbbq %rax, %r9
; SSE4-NEXT: setb %r9b
-; SSE4-NEXT: cmpq %rax, %r12
-; SSE4-NEXT: sbbq %r13, %r8
-; SSE4-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT: cmpq %r15, %rcx
+; SSE4-NEXT: sbbq %rdx, %rax
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13
; SSE4-NEXT: sbbb $0, %r9b
-; SSE4-NEXT: cmpq %rsi, %r12
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: movq %r8, %rdi
+; SSE4-NEXT: cmpq %rsi, %r13
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: movq %rcx, %r15
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: sbbq %rax, %rdi
-; SSE4-NEXT: setb %dil
-; SSE4-NEXT: cmpq %r12, %rsi
-; SSE4-NEXT: sbbq %r8, %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT: sbbq %rax, %r15
+; SSE4-NEXT: setb %r15b
+; SSE4-NEXT: cmpq %r13, %rsi
+; SSE4-NEXT: sbbq %rcx, %rax
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT: sbbb $0, %dil
-; SSE4-NEXT: cmpq %r12, %r13
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: movq %r8, %rsi
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: sbbq %rax, %rsi
+; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT: sbbb $0, %r15b
+; SSE4-NEXT: cmpq %r13, %rax
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT: movq %rdx, %rsi
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: sbbq %rcx, %rsi
; SSE4-NEXT: setb %sil
-; SSE4-NEXT: cmpq %r13, %r12
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm1
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm2
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm3
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm4
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm5
-; SSE4-NEXT: movzbl %r10b, %r10d
-; SSE4-NEXT: movd %r10d, %xmm6
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r10d, %xmm7
-; SSE4-NEXT: movzbl %bpl, %r10d
-; SSE4-NEXT: movd %r10d, %xmm0
-; SSE4-NEXT: movzbl %dl, %edx
-; SSE4-NEXT: movd %edx, %xmm8
-; SSE4-NEXT: movzbl %r11b, %edx
-; SSE4-NEXT: movd %edx, %xmm9
-; SSE4-NEXT: movzbl %bl, %edx
-; SSE4-NEXT: movd %edx, %xmm10
-; SSE4-NEXT: movzbl %cl, %ecx
-; SSE4-NEXT: movd %ecx, %xmm11
-; SSE4-NEXT: movzbl %r14b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm12
-; SSE4-NEXT: movzbl %r15b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm13
-; SSE4-NEXT: movzbl %r9b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm14
-; SSE4-NEXT: movzbl %dil, %ecx
-; SSE4-NEXT: movd %ecx, %xmm15
+; SSE4-NEXT: cmpq %rax, %r13
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm1
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm2
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm3
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm4
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm5
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm6
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm7
+; SSE4-NEXT: movzbl %r12b, %eax
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: movzbl %bpl, %eax
+; SSE4-NEXT: movd %eax, %xmm8
+; SSE4-NEXT: movzbl %r8b, %eax
+; SSE4-NEXT: movd %eax, %xmm9
+; SSE4-NEXT: movzbl %r10b, %eax
+; SSE4-NEXT: movd %eax, %xmm10
+; SSE4-NEXT: movzbl %r11b, %eax
+; SSE4-NEXT: movd %eax, %xmm11
+; SSE4-NEXT: movzbl %bl, %eax
+; SSE4-NEXT: movd %eax, %xmm12
+; SSE4-NEXT: movzbl %r14b, %eax
+; SSE4-NEXT: movd %eax, %xmm13
+; SSE4-NEXT: movzbl %r9b, %eax
+; SSE4-NEXT: movd %eax, %xmm14
+; SSE4-NEXT: movzbl %r15b, %eax
+; SSE4-NEXT: movd %eax, %xmm15
; SSE4-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE4-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1802,76 +1810,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE4-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
; SSE4-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; SSE4-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT: sbbq %r8, %rax
+; SSE4-NEXT: sbbq %rdx, %rcx
; SSE4-NEXT: sbbb $0, %sil
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE4-NEXT: movzbl %sil, %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: movb %cl, 4(%rax)
+; SSE4-NEXT: movzbl %sil, %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: movb %al, 4(%rdi)
; SSE4-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: leaq (%rcx,%rax,4), %rax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $4, %ecx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $6, %eax
+; SSE4-NEXT: orq %rcx, %rax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $8, %ecx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $10, %eax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: leaq (%rdx,%rcx,4), %rcx
+; SSE4-NEXT: shll $12, %edx
+; SSE4-NEXT: orq %rax, %rdx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT: andl $3, %esi
+; SSE4-NEXT: shll $14, %esi
+; SSE4-NEXT: orq %rdx, %rsi
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $16, %eax
+; SSE4-NEXT: orq %rsi, %rax
+; SSE4-NEXT: orq %rcx, %rax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $18, %ecx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $4, %edx
+; SSE4-NEXT: shll $20, %edx
; SSE4-NEXT: orq %rcx, %rdx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $6, %ecx
+; SSE4-NEXT: shll $22, %ecx
; SSE4-NEXT: orq %rdx, %rcx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $8, %edx
+; SSE4-NEXT: shll $24, %edx
; SSE4-NEXT: orq %rcx, %rdx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $10, %ecx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $12, %esi
-; SSE4-NEXT: orq %rcx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT: andl $3, %edi
-; SSE4-NEXT: shll $14, %edi
-; SSE4-NEXT: orq %rsi, %rdi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $16, %ecx
-; SSE4-NEXT: orq %rdi, %rcx
+; SSE4-NEXT: shlq $26, %rcx
; SSE4-NEXT: orq %rdx, %rcx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shlq $28, %rax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $18, %edx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $20, %esi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $22, %edx
-; SSE4-NEXT: orq %rsi, %rdx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $24, %esi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shlq $26, %rdx
-; SSE4-NEXT: orq %rsi, %rdx
+; SSE4-NEXT: shlq $30, %rdx
+; SSE4-NEXT: orq %rax, %rdx
; SSE4-NEXT: orq %rcx, %rdx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shlq $28, %rcx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shlq $30, %rsi
-; SSE4-NEXT: orq %rcx, %rsi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movl %esi, (%rax)
+; SSE4-NEXT: movl %edx, (%rdi)
+; SSE4-NEXT: movq %rdi, %rax
; SSE4-NEXT: addq $120, %rsp
; SSE4-NEXT: popq %rbx
; SSE4-NEXT: popq %r12
@@ -1961,88 +1969,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: andl $127, %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: andl $127, %ecx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: andl $127, %r11d
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: andl $127, %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT: andl $127, %ebx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT: andl $127, %ebp
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: andl $127, %edx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: andl $127, %r10d
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: andl $127, %r14d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: andl $127, %ebp
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT: andl $127, %r13d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT: andl $127, %r11d
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT: andl $127, %r8d
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: andl $127, %eax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT: andl $127, %ebx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
; SSE2-NEXT: andl $127, %r15d
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: cmpq %r13, %rcx
+; SSE2-NEXT: movq %r15, %r12
+; SSE2-NEXT: sbbq %rbx, %r12
+; SSE2-NEXT: setb %r12b
+; SSE2-NEXT: cmpq %rcx, %r13
+; SSE2-NEXT: sbbq %r15, %rbx
+; SSE2-NEXT: sbbb $0, %r12b
+; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT: cmpq %rcx, %rbx
+; SSE2-NEXT: movq %rax, %r15
+; SSE2-NEXT: sbbq %r8, %r15
+; SSE2-NEXT: setb %r15b
+; SSE2-NEXT: cmpq %rbx, %rcx
+; SSE2-NEXT: sbbq %rax, %r8
+; SSE2-NEXT: sbbb $0, %r15b
+; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT: cmpq %rax, %r12
-; SSE2-NEXT: movq %r15, %r8
-; SSE2-NEXT: sbbq %r11, %r8
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: movq %r14, %r8
+; SSE2-NEXT: sbbq %r10, %r8
; SSE2-NEXT: setb %r8b
-; SSE2-NEXT: cmpq %r12, %rax
-; SSE2-NEXT: sbbq %r15, %r11
+; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: sbbq %r14, %r10
; SSE2-NEXT: sbbb $0, %r8b
; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: cmpq %rax, %r8
-; SSE2-NEXT: movq %r13, %r11
-; SSE2-NEXT: sbbq %rbp, %r11
-; SSE2-NEXT: setb %r11b
-; SSE2-NEXT: cmpq %r8, %rax
-; SSE2-NEXT: sbbq %r13, %rbp
-; SSE2-NEXT: sbbb $0, %r11b
-; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: cmpq %rax, %r8
-; SSE2-NEXT: movq %r14, %r11
-; SSE2-NEXT: sbbq %r10, %r11
-; SSE2-NEXT: setb %r11b
-; SSE2-NEXT: cmpq %r8, %rax
-; SSE2-NEXT: sbbq %r14, %r10
-; SSE2-NEXT: sbbb $0, %r11b
-; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: cmpq %rax, %r8
-; SSE2-NEXT: movq %rdx, %r10
-; SSE2-NEXT: sbbq %rbx, %r10
-; SSE2-NEXT: setb %r10b
-; SSE2-NEXT: cmpq %r8, %rax
-; SSE2-NEXT: sbbq %rdx, %rbx
-; SSE2-NEXT: sbbb $0, %r10b
-; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: cmpq %rax, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: movq %r10, %r8
-; SSE2-NEXT: sbbq %rcx, %r8
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: sbbq %rbp, %r8
; SSE2-NEXT: setb %r8b
-; SSE2-NEXT: cmpq %rdx, %rax
-; SSE2-NEXT: sbbq %r10, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: sbbq %rdx, %rbp
; SSE2-NEXT: sbbb $0, %r8b
; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: movq %r10, %rdx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: sbbq %r8, %rdx
+; SSE2-NEXT: movq %r8, %rdx
+; SSE2-NEXT: sbbq %r11, %rdx
; SSE2-NEXT: setb %dl
; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r10, %r8
+; SSE2-NEXT: sbbq %r8, %r11
; SSE2-NEXT: sbbb $0, %dl
; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -2058,117 +2054,129 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE2-NEXT: sbbb $0, %dl
; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT: movq %r11, %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: cmpq %rax, %rdx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: sbbq %r10, %rdx
-; SSE2-NEXT: setb %r8b
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r11, %r10
-; SSE2-NEXT: sbbb $0, %r8b
+; SSE2-NEXT: movq %r10, %rcx
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT: sbbq %r8, %rcx
+; SSE2-NEXT: setb %cl
+; SSE2-NEXT: cmpq %rdx, %rax
+; SSE2-NEXT: sbbq %r10, %r8
+; SSE2-NEXT: sbbb $0, %cl
+; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT: movq %rbx, %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: cmpq %rax, %rdx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: sbbq %r10, %rdx
-; SSE2-NEXT: setb %r11b
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %rbx, %r10
-; SSE2-NEXT: sbbb $0, %r11b
+; SSE2-NEXT: movq %r10, %r8
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: sbbq %rcx, %r8
+; SSE2-NEXT: setb %r12b
+; SSE2-NEXT: cmpq %rdx, %rax
+; SSE2-NEXT: sbbq %r10, %rcx
+; SSE2-NEXT: sbbb $0, %r12b
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT: movq %rbx, %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: cmpq %rax, %rdx
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT: movq %r11, %r8
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT: sbbq %r10, %r8
+; SSE2-NEXT: setb %cl
+; SSE2-NEXT: cmpq %rdx, %rax
+; SSE2-NEXT: sbbq %r11, %r10
+; SSE2-NEXT: sbbb $0, %cl
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT: cmpq %rax, %r8
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT: movq %r11, %rdx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; SSE2-NEXT: sbbq %r10, %rdx
; SSE2-NEXT: setb %dl
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %rbx, %r10
+; SSE2-NEXT: cmpq %r8, %rax
+; SSE2-NEXT: sbbq %r11, %r10
; SSE2-NEXT: sbbb $0, %dl
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: cmpq %rax, %r10
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT: movq %rbx, %r8
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT: sbbq %r11, %r8
+; SSE2-NEXT: setb %r8b
+; SSE2-NEXT: cmpq %r10, %rax
+; SSE2-NEXT: sbbq %rbx, %r11
+; SSE2-NEXT: sbbb $0, %r8b
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: cmpq %rax, %r11
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT: movq %r14, %r10
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
; SSE2-NEXT: sbbq %rbx, %r10
; SSE2-NEXT: setb %r10b
-; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: cmpq %r11, %rax
; SSE2-NEXT: sbbq %r14, %rbx
; SSE2-NEXT: sbbb $0, %r10b
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT: cmpq %rax, %rbx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: movq %r15, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT: sbbq %r14, %rcx
-; SSE2-NEXT: setb %cl
-; SSE2-NEXT: cmpq %rbx, %rax
-; SSE2-NEXT: sbbq %r15, %r14
-; SSE2-NEXT: sbbb $0, %cl
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: cmpq %rax, %r14
-; SSE2-NEXT: movq (%rsp), %r12 # 8-byte Reload
-; SSE2-NEXT: movq %r12, %rbx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: sbbq %r15, %rbx
-; SSE2-NEXT: setb %bl
+; SSE2-NEXT: movq (%rsp), %r15 # 8-byte Reload
+; SSE2-NEXT: movq %r15, %r11
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT: sbbq %rbx, %r11
+; SSE2-NEXT: setb %r11b
; SSE2-NEXT: cmpq %r14, %rax
-; SSE2-NEXT: sbbq %r12, %r15
-; SSE2-NEXT: sbbb $0, %bl
+; SSE2-NEXT: sbbq %r15, %rbx
+; SSE2-NEXT: sbbb $0, %r11b
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: cmpq %r9, %rax
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT: movq %r12, %r14
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: sbbq %r15, %r14
+; SSE2-NEXT: movq %r15, %r14
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT: sbbq %rbx, %r14
; SSE2-NEXT: setb %bpl
; SSE2-NEXT: cmpq %rax, %r9
-; SSE2-NEXT: sbbq %r12, %r15
+; SSE2-NEXT: sbbq %r15, %rbx
; SSE2-NEXT: sbbb $0, %bpl
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: cmpq %rsi, %rax
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: movq %r15, %r9
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT: sbbq %r14, %r9
+; SSE2-NEXT: movq %r14, %r9
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT: sbbq %rbx, %r9
; SSE2-NEXT: setb %r9b
; SSE2-NEXT: cmpq %rax, %rsi
-; SSE2-NEXT: sbbq %r15, %r14
-; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: sbbq %r14, %rbx
+; SSE2-NEXT: movq %rdi, %rbx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: sbbb $0, %r9b
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT: cmpq %r15, %rsi
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT: movq %r12, %rdi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT: sbbq %r14, %rdi
; SSE2-NEXT: setb %dil
; SSE2-NEXT: cmpq %rsi, %r15
-; SSE2-NEXT: sbbq %r12, %r14
+; SSE2-NEXT: sbbq %rax, %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: sbbb $0, %dil
; SSE2-NEXT: cmpq %rsi, %r14
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movq %rax, %r15
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT: movq %r13, %r15
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT: sbbq %r12, %r15
+; SSE2-NEXT: sbbq %r13, %r15
; SSE2-NEXT: setb %r15b
; SSE2-NEXT: cmpq %r14, %rsi
-; SSE2-NEXT: sbbq %r13, %r12
+; SSE2-NEXT: sbbq %rax, %r13
; SSE2-NEXT: sbbb $0, %r15b
; SSE2-NEXT: movzbl %r15b, %esi
; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: movb %sil, 4(%rax)
+; SSE2-NEXT: movb %sil, 4(%rbx)
; SSE2-NEXT: movzbl %dil, %esi
; SSE2-NEXT: movzbl %r9b, %edi
; SSE2-NEXT: andl $3, %esi
@@ -2178,59 +2186,60 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE2-NEXT: andl $3, %edi
; SSE2-NEXT: shll $4, %edi
; SSE2-NEXT: orq %rsi, %rdi
-; SSE2-NEXT: movzbl %bl, %r9d
+; SSE2-NEXT: movzbl %r11b, %r9d
; SSE2-NEXT: andl $3, %r9d
; SSE2-NEXT: shll $6, %r9d
; SSE2-NEXT: orq %rdi, %r9
-; SSE2-NEXT: movzbl %cl, %esi
+; SSE2-NEXT: movzbl %r10b, %esi
; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: shll $8, %esi
; SSE2-NEXT: orq %r9, %rsi
-; SSE2-NEXT: movzbl %dl, %ecx
-; SSE2-NEXT: movzbl %r10b, %edx
-; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $10, %edx
-; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: shll $12, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: movzbl %r11b, %edx
+; SSE2-NEXT: movzbl %dl, %edx
+; SSE2-NEXT: movzbl %r8b, %edi
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: shll $10, %edi
; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $14, %edx
-; SSE2-NEXT: orq %rcx, %rdx
-; SSE2-NEXT: movzbl %r8b, %ecx
-; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: shll $16, %ecx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: orq %rsi, %rcx
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: shll $18, %esi
+; SSE2-NEXT: shll $12, %edx
+; SSE2-NEXT: orq %rdi, %rdx
+; SSE2-NEXT: movzbl %cl, %edi
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: shll $14, %edi
+; SSE2-NEXT: orq %rdx, %rdi
+; SSE2-NEXT: movzbl %r12b, %edx
; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $20, %edx
+; SSE2-NEXT: shll $16, %edx
+; SSE2-NEXT: orq %rdi, %rdx
; SSE2-NEXT: orq %rsi, %rdx
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: shll $18, %ecx
; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: shll $22, %esi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $24, %edx
-; SSE2-NEXT: orq %rsi, %rdx
+; SSE2-NEXT: shll $20, %esi
+; SSE2-NEXT: orq %rcx, %rsi
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: shll $22, %ecx
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: shlq $26, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: shll $24, %esi
; SSE2-NEXT: orq %rcx, %rsi
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shlq $28, %rdx
; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: shlq $30, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: shlq $26, %rcx
; SSE2-NEXT: orq %rsi, %rcx
-; SSE2-NEXT: movl %ecx, (%rax)
+; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: shlq $28, %rsi
+; SSE2-NEXT: andl $3, %edx
+; SSE2-NEXT: shlq $30, %rdx
+; SSE2-NEXT: orq %rsi, %rdx
+; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movl %edx, (%rbx)
+; SSE2-NEXT: movq %rbx, %rax
; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
@@ -2333,34 +2342,34 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: andl $127, %r14d
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: andl $127, %edx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT: andl $127, %ebp
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: andl $127, %ebx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: andl $127, %r11d
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
; AVX2-NEXT: andl $127, %r8d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: andl $127, %r12d
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX2-NEXT: andl $127, %r13d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %rbx, %r11
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT: cmpq %r12, %rbp
; AVX2-NEXT: movq %r13, %r10
-; AVX2-NEXT: sbbq %r12, %r10
+; AVX2-NEXT: sbbq %r8, %r10
; AVX2-NEXT: setb %r10b
-; AVX2-NEXT: cmpq %r11, %rbx
-; AVX2-NEXT: sbbq %r13, %r12
+; AVX2-NEXT: cmpq %rbp, %r12
+; AVX2-NEXT: sbbq %r13, %r8
; AVX2-NEXT: sbbb $0, %r10b
; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %r10, %r11
-; AVX2-NEXT: movq %r8, %rbx
-; AVX2-NEXT: sbbq %rbp, %rbx
-; AVX2-NEXT: setb %bl
-; AVX2-NEXT: cmpq %r11, %r10
-; AVX2-NEXT: sbbq %r8, %rbp
-; AVX2-NEXT: sbbb $0, %bl
-; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: cmpq %r8, %r10
+; AVX2-NEXT: movq %r11, %r12
+; AVX2-NEXT: sbbq %rbx, %r12
+; AVX2-NEXT: setb %bpl
+; AVX2-NEXT: cmpq %r10, %r8
+; AVX2-NEXT: sbbq %r11, %rbx
+; AVX2-NEXT: sbbb $0, %bpl
+; AVX2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: cmpq %r8, %r10
@@ -2431,13 +2440,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: cmpq %rax, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: movq %rbx, %r10
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
+; AVX2-NEXT: sbbq %r11, %r10
; AVX2-NEXT: setb %r8b
; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: sbbq %rbx, %r11
; AVX2-NEXT: sbbb $0, %r8b
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
@@ -2451,81 +2460,81 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: sbbq %rbx, %r11
; AVX2-NEXT: sbbb $0, %dl
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %rax, %r11
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: cmpq %rax, %rbx
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: sbbq %rbx, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: sbbq %r11, %r10
; AVX2-NEXT: setb %r10b
-; AVX2-NEXT: cmpq %r11, %rax
-; AVX2-NEXT: sbbq %r14, %rbx
-; AVX2-NEXT: sbbb $0, %r10b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: cmpq %rax, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: movq %r15, %r11
-; AVX2-NEXT: movq (%rsp), %r14 # 8-byte Reload
-; AVX2-NEXT: sbbq %r14, %r11
-; AVX2-NEXT: setb %r11b
; AVX2-NEXT: cmpq %rbx, %rax
-; AVX2-NEXT: sbbq %r15, %r14
-; AVX2-NEXT: sbbb $0, %r11b
+; AVX2-NEXT: sbbq %r14, %r11
+; AVX2-NEXT: sbbb $0, %r10b
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX2-NEXT: cmpq %rax, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %rbx
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %rbx
+; AVX2-NEXT: movq %r15, %rbx
+; AVX2-NEXT: movq (%rsp), %r11 # 8-byte Reload
+; AVX2-NEXT: sbbq %r11, %rbx
; AVX2-NEXT: setb %bl
; AVX2-NEXT: cmpq %r14, %rax
-; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: sbbq %r15, %r11
; AVX2-NEXT: sbbb $0, %bl
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: cmpq %r9, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: cmpq %rax, %r14
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: movq %r13, %r15
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: sbbq %r11, %r15
; AVX2-NEXT: setb %bpl
-; AVX2-NEXT: cmpq %rax, %r9
-; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: cmpq %r14, %rax
+; AVX2-NEXT: sbbq %r13, %r11
; AVX2-NEXT: sbbb $0, %bpl
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: cmpq %r9, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: movq %r15, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: sbbq %r11, %r14
+; AVX2-NEXT: setb %r14b
+; AVX2-NEXT: cmpq %rax, %r9
+; AVX2-NEXT: sbbq %r15, %r11
+; AVX2-NEXT: sbbb $0, %r14b
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: cmpq %rsi, %rax
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; AVX2-NEXT: movq %r15, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: sbbq %r14, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: sbbq %r11, %r9
; AVX2-NEXT: setb %r9b
; AVX2-NEXT: cmpq %rax, %rsi
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: sbbq %r15, %r11
; AVX2-NEXT: sbbb $0, %r9b
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: cmpq %rcx, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: movq %r11, %rsi
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: movq %r15, %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: sbbq %r14, %rsi
+; AVX2-NEXT: sbbq %r15, %rsi
; AVX2-NEXT: setb %sil
; AVX2-NEXT: cmpq %rax, %rcx
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: sbbq %r11, %r15
; AVX2-NEXT: sbbb $0, %sil
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT: cmpq %rax, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: movq %r11, %r15
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %r14
-; AVX2-NEXT: setb %r14b
-; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: setb %r15b
+; AVX2-NEXT: cmpq %rcx, %rax
+; AVX2-NEXT: sbbq %r11, %r13
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: sbbb $0, %r14b
-; AVX2-NEXT: movzbl %r14b, %ecx
+; AVX2-NEXT: sbbb $0, %r15b
+; AVX2-NEXT: movzbl %r15b, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: movb %cl, 4(%rdi)
; AVX2-NEXT: movzbl %sil, %ecx
@@ -2533,15 +2542,15 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: movzbl %r9b, %esi
; AVX2-NEXT: andl $3, %esi
; AVX2-NEXT: leaq (%rsi,%rcx,4), %rcx
-; AVX2-NEXT: movzbl %bpl, %esi
+; AVX2-NEXT: movzbl %r14b, %esi
; AVX2-NEXT: andl $3, %esi
; AVX2-NEXT: shll $4, %esi
; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: movzbl %bl, %ecx
+; AVX2-NEXT: movzbl %bpl, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: shll $6, %ecx
; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: movzbl %r11b, %esi
+; AVX2-NEXT: movzbl %bl, %esi
; AVX2-NEXT: andl $3, %esi
; AVX2-NEXT: shll $8, %esi
; AVX2-NEXT: orq %rcx, %rsi
@@ -2676,18 +2685,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: andl $127, %eax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT: andl $127, %ebp
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT: andl $127, %r12d
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX512-NEXT: andl $127, %r13d
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX512-NEXT: andl $127, %ebp
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX512-NEXT: andl $127, %r15d
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT: andl $127, %r12d
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: andl $127, %r10d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: andl $127, %ebx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT: andl $127, %r14d
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: andl $127, %r8d
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9
@@ -2700,13 +2709,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: andl $127, %eax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: andl $127, %edx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: cmpq %r14, %r11
+; AVX512-NEXT: cmpq %rbx, %r11
; AVX512-NEXT: movq %rdx, %rcx
; AVX512-NEXT: sbbq %rax, %rcx
; AVX512-NEXT: setb %cl
-; AVX512-NEXT: cmpq %r11, %r14
+; AVX512-NEXT: cmpq %r11, %rbx
; AVX512-NEXT: sbbq %rdx, %rax
; AVX512-NEXT: sbbb $0, %cl
; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -2733,31 +2742,31 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %rbx, %rdx
+; AVX512-NEXT: movq %r14, %rdx
; AVX512-NEXT: sbbq %r10, %rdx
; AVX512-NEXT: setb %dl
; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rbx, %r10
+; AVX512-NEXT: sbbq %r14, %r10
; AVX512-NEXT: sbbb $0, %dl
; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %r15, %rdx
-; AVX512-NEXT: sbbq %r13, %rdx
+; AVX512-NEXT: movq %r12, %rdx
+; AVX512-NEXT: sbbq %r15, %rdx
; AVX512-NEXT: setb %dl
; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %r15, %r13
+; AVX512-NEXT: sbbq %r12, %r15
; AVX512-NEXT: sbbb $0, %dl
; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %r12, %rdx
-; AVX512-NEXT: sbbq %rbp, %rdx
+; AVX512-NEXT: movq %rbp, %rdx
+; AVX512-NEXT: sbbq %r13, %rdx
; AVX512-NEXT: setb %dl
; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %r12, %rbp
+; AVX512-NEXT: sbbq %rbp, %r13
; AVX512-NEXT: sbbb $0, %dl
; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -2767,21 +2776,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: movq %rdi, %rdx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: sbbq %rsi, %rdx
-; AVX512-NEXT: setb %r13b
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rdi, %rsi
-; AVX512-NEXT: sbbb $0, %r13b
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: movq %rdi, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: sbbq %rsi, %rdx
-; AVX512-NEXT: setb %bpl
+; AVX512-NEXT: setb %r15b
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: sbbq %rdi, %rsi
-; AVX512-NEXT: sbbb $0, %bpl
+; AVX512-NEXT: sbbb $0, %r15b
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: cmpq %rcx, %rdx
@@ -2789,10 +2787,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: movq %rdi, %rax
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: sbbq %rsi, %rax
-; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: setb %bl
; AVX512-NEXT: cmpq %rdx, %rcx
; AVX512-NEXT: sbbq %rdi, %rsi
-; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: sbbb $0, %bl
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
@@ -2818,107 +2816,118 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: cmpq %rdi, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: movq %r10, %rsi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: movq %r9, %rsi
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: sbbq %rax, %rsi
; AVX512-NEXT: setb %sil
; AVX512-NEXT: cmpq %r8, %rdi
-; AVX512-NEXT: sbbq %r10, %rax
+; AVX512-NEXT: sbbq %r9, %rax
; AVX512-NEXT: sbbb $0, %sil
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: cmpq %r8, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: movq %r11, %rdi
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT: cmpq %r8, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: movq %r10, %rdi
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: sbbq %rax, %rdi
; AVX512-NEXT: setb %dil
-; AVX512-NEXT: cmpq %r10, %r8
-; AVX512-NEXT: sbbq %r11, %rax
+; AVX512-NEXT: cmpq %r9, %r8
+; AVX512-NEXT: sbbq %r10, %rax
; AVX512-NEXT: sbbb $0, %dil
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: cmpq %r9, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: movq %r11, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: sbbq %rax, %r8
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: cmpq %r10, %r9
+; AVX512-NEXT: sbbq %r11, %rax
+; AVX512-NEXT: sbbb $0, %r8b
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: cmpq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq %rbx, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT: movq %r14, %r9
; AVX512-NEXT: movq (%rsp), %r11 # 8-byte Reload
-; AVX512-NEXT: sbbq %r11, %r8
-; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: sbbq %r11, %r9
+; AVX512-NEXT: setb %r9b
; AVX512-NEXT: cmpq %r10, %rax
-; AVX512-NEXT: sbbq %rbx, %r11
-; AVX512-NEXT: sbbb $0, %r8b
+; AVX512-NEXT: sbbq %r14, %r11
+; AVX512-NEXT: sbbb $0, %r9b
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: cmpq %rbx, %r11
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: cmpq %r14, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT: movq %r12, %r10
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: sbbq %rax, %r10
; AVX512-NEXT: setb %r10b
-; AVX512-NEXT: cmpq %r11, %rbx
-; AVX512-NEXT: sbbq %r14, %rax
+; AVX512-NEXT: cmpq %r11, %r14
+; AVX512-NEXT: sbbq %r12, %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: sbbb $0, %r10b
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: cmpq %r15, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: cmpq %r13, %r11
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: sbbq %r14, %rbx
-; AVX512-NEXT: setb %bl
-; AVX512-NEXT: cmpq %r11, %r15
+; AVX512-NEXT: movq %rax, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT: sbbq %r12, %r14
+; AVX512-NEXT: setb %bpl
+; AVX512-NEXT: cmpq %r11, %r13
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: sbbq %rax, %r14
+; AVX512-NEXT: sbbq %rax, %r12
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT: sbbb $0, %bl
+; AVX512-NEXT: sbbb $0, %bpl
; AVX512-NEXT: cmpq %r11, %r14
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: sbbq %r12, %r15
-; AVX512-NEXT: setb %r15b
+; AVX512-NEXT: movq %rax, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: sbbq %r13, %r12
+; AVX512-NEXT: setb %r12b
; AVX512-NEXT: cmpq %r14, %r11
-; AVX512-NEXT: sbbq %rax, %r12
-; AVX512-NEXT: sbbb $0, %r15b
-; AVX512-NEXT: movzbl %r15b, %r11d
+; AVX512-NEXT: sbbq %rax, %r13
+; AVX512-NEXT: sbbb $0, %r12b
+; AVX512-NEXT: movzbl %r12b, %r11d
; AVX512-NEXT: andl $3, %r11d
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; AVX512-NEXT: movb %r11b, 4(%r14)
-; AVX512-NEXT: movzbl %bl, %r11d
+; AVX512-NEXT: movzbl %bpl, %r11d
; AVX512-NEXT: andl $3, %r11d
; AVX512-NEXT: movzbl %r10b, %r10d
; AVX512-NEXT: andl $3, %r10d
; AVX512-NEXT: leaq (%r10,%r11,4), %r10
+; AVX512-NEXT: movzbl %r9b, %r9d
+; AVX512-NEXT: andl $3, %r9d
+; AVX512-NEXT: shll $4, %r9d
+; AVX512-NEXT: orq %r10, %r9
; AVX512-NEXT: movzbl %r8b, %r8d
; AVX512-NEXT: andl $3, %r8d
-; AVX512-NEXT: shll $4, %r8d
-; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: shll $6, %r8d
+; AVX512-NEXT: orq %r9, %r8
; AVX512-NEXT: movzbl %dil, %edi
; AVX512-NEXT: andl $3, %edi
-; AVX512-NEXT: shll $6, %edi
+; AVX512-NEXT: shll $8, %edi
; AVX512-NEXT: orq %r8, %rdi
; AVX512-NEXT: movzbl %sil, %esi
; AVX512-NEXT: andl $3, %esi
-; AVX512-NEXT: shll $8, %esi
-; AVX512-NEXT: orq %rdi, %rsi
+; AVX512-NEXT: shll $10, %esi
; AVX512-NEXT: movzbl %dl, %edx
; AVX512-NEXT: andl $3, %edx
-; AVX512-NEXT: shll $10, %edx
+; AVX512-NEXT: shll $12, %edx
+; AVX512-NEXT: orq %rsi, %rdx
; AVX512-NEXT: movzbl %cl, %ecx
; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: shll $12, %ecx
+; AVX512-NEXT: shll $14, %ecx
; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: movzbl %r9b, %edx
-; AVX512-NEXT: andl $3, %edx
-; AVX512-NEXT: shll $14, %edx
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: movzbl %bpl, %eax
+; AVX512-NEXT: movzbl %bl, %eax
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: shll $16, %eax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: movzbl %r13b, %ecx
+; AVX512-NEXT: orq %rcx, %rax
+; AVX512-NEXT: orq %rdi, %rax
+; AVX512-NEXT: movzbl %r15b, %ecx
; AVX512-NEXT: andl $3, %ecx
; AVX512-NEXT: shll $18, %ecx
; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
@@ -2963,7 +2972,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $132, %esp
+; X86-NEXT: subl $128, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $127, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3054,31 +3063,47 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $127, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: andl $127, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl $127, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl $127, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: andl $127, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andl $127, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $127, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl $127, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl %eax, %esi
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: sbbl %edx, %ebp
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: movl $0, %ebp
+; X86-NEXT: sbbl %ebp, %ebp
; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: sbbb $0, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl %ebx, %eax
-; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: sbbl %edx, %ebp
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: sbbl %edi, %ebp
+; X86-NEXT: movl $0, %ebp
+; X86-NEXT: sbbl %ebp, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %ebx, %edi
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: sbbb $0, %cl
@@ -3091,6 +3116,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: sbbl %ebp, %edi
; X86-NEXT: movl $0, %edi
; X86-NEXT: sbbl %edi, %edi
@@ -3243,26 +3269,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %edi
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: sbbl %ebp, %ebx
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: sbbb $0, %cl
-; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -3387,8 +3393,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: sbbl %edi, %ebp
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: sbbl %esi, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3398,7 +3404,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %edi, %esi
; X86-NEXT: sbbl %edx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: sbbl %eax, %eax
@@ -3469,7 +3475,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; X86-NEXT: orl %eax, %edx
; X86-NEXT: movl %edx, (%edi)
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: addl $132, %esp
+; X86-NEXT: addl $128, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
>From ff2c357952c7f7824773bb3ecf37b59a630063d8 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Mon, 22 Sep 2025 11:13:03 -0400
Subject: [PATCH 2/2] Fix mitigations in PowerPC by having custom lowering
anyway
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 33 +++++++++
llvm/lib/Target/PowerPC/PPCISelLowering.h | 1 +
llvm/test/CodeGen/PowerPC/ucmp.ll | 78 ++++++++-------------
3 files changed, 62 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2907303874de5..932c224561033 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -585,6 +585,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// We cannot sextinreg(i1). Expand to shifts.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ // Custom lowering for the ISD::UCMP (unsigned three-way compare) node.
+ setOperationAction(ISD::UCMP, MVT::i32, Custom);
+ setOperationAction(ISD::UCMP, MVT::i64, isPPC64 ? Custom : Expand);
+
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
// support continuation, user-level threading, and etc.. As a result, no
@@ -12618,6 +12622,33 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues({Sub, OverflowTrunc}, dl);
}
+// Lower unsigned 3-way compare producing -1/0/1.
+SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue A = DAG.getFreeze(Op.getOperand(0));
+ SDValue B = DAG.getFreeze(Op.getOperand(1));
+ EVT OpVT = A.getValueType(); // operand type
+ EVT ResVT = Op.getValueType(); // result type
+
+ // First compute diff = A - B (will become subf).
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, A, B);
+
+ // Generate B - A using SUBC to capture carry.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ SDValue SubC = DAG.getNode(PPCISD::SUBC, DL, VTs, B, A);
+ SDValue CA0 = SubC.getValue(1);
+
+ // t2 = A - B + CA0 using SUBE.
+ SDValue SubE1 = DAG.getNode(PPCISD::SUBE, DL, VTs, A, B, CA0);
+ SDValue CA1 = SubE1.getValue(1);
+
+ // res = diff - t2 + CA1 using SUBE (produces desired -1/0/1).
+ SDValue ResPair = DAG.getNode(PPCISD::SUBE, DL, VTs, Diff, SubE1, CA1);
+
+ // Extract the arithmetic result and sign-extend or truncate it to the result type.
+ return DAG.getSExtOrTrunc(ResPair.getValue(0), DL, ResVT);
+}
+
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -12722,6 +12753,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UADDO_CARRY:
case ISD::USUBO_CARRY:
return LowerADDSUBO_CARRY(Op, DAG);
+ case ISD::UCMP:
+ return LowerUCMP(Op, DAG);
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 669430550f4e6..b82533fac2eb8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1343,6 +1343,7 @@ namespace llvm {
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUCMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll
index 22faf9cbd9c24..4d393dd00e3db 100644
--- a/llvm/test/CodeGen/PowerPC/ucmp.ll
+++ b/llvm/test/CodeGen/PowerPC/ucmp.ll
@@ -4,14 +4,10 @@
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_8:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrldi 5, 4, 32
-; CHECK-NEXT: clrldi 6, 3, 32
-; CHECK-NEXT: sub 5, 5, 6
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldic 3, 3, 0, 32
-; CHECK-NEXT: rldicl 5, 5, 1, 63
-; CHECK-NEXT: isellt 3, 3, 5
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
ret i8 %1
@@ -20,14 +16,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_16:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrldi 5, 4, 32
-; CHECK-NEXT: clrldi 6, 3, 32
-; CHECK-NEXT: sub 5, 5, 6
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldic 3, 3, 0, 32
-; CHECK-NEXT: rldicl 5, 5, 1, 63
-; CHECK-NEXT: isellt 3, 3, 5
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
ret i8 %1
@@ -36,14 +28,10 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp_8_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrldi 5, 4, 32
-; CHECK-NEXT: clrldi 6, 3, 32
-; CHECK-NEXT: sub 5, 5, 6
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldic 3, 3, 0, 32
-; CHECK-NEXT: rldicl 5, 5, 1, 63
-; CHECK-NEXT: isellt 3, 3, 5
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
ret i8 %1
@@ -52,12 +40,10 @@ define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
; CHECK-LABEL: ucmp_8_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: subc 3, 4, 3
-; CHECK-NEXT: subfe 3, 4, 4
-; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: neg 3, 3
-; CHECK-NEXT: isellt 3, 4, 3
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
ret i8 %1
@@ -86,14 +72,10 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp_32_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrldi 5, 4, 32
-; CHECK-NEXT: clrldi 6, 3, 32
-; CHECK-NEXT: sub 5, 5, 6
-; CHECK-NEXT: cmplw 3, 4
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: rldic 3, 3, 0, 32
-; CHECK-NEXT: rldicl 5, 5, 1, 63
-; CHECK-NEXT: isellt 3, 3, 5
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
ret i32 %1
@@ -102,12 +84,10 @@ define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
; CHECK-LABEL: ucmp_32_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: subc 3, 4, 3
-; CHECK-NEXT: subfe 3, 4, 4
-; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: neg 3, 3
-; CHECK-NEXT: isellt 3, 4, 3
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
ret i32 %1
@@ -116,12 +96,10 @@ define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
; CHECK-LABEL: ucmp_64_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: subc 5, 4, 3
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: subfe 5, 4, 4
-; CHECK-NEXT: neg 5, 5
-; CHECK-NEXT: isellt 3, 3, 5
+; CHECK-NEXT: subc 6, 4, 3
+; CHECK-NEXT: sub 5, 3, 4
+; CHECK-NEXT: subfe 3, 4, 3
+; CHECK-NEXT: subfe 3, 3, 5
; CHECK-NEXT: blr
%1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
ret i64 %1
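
For readers following the new PPCTargetLowering::LowerUCMP above: the subc/sub/subfe/subfe sequence in the updated CHECK lines computes the llvm.ucmp contract (-1 if x < y, 0 if x == y, +1 if x > y) purely from borrow bits. The snippet below is not part of the patch; it is a minimal standalone C++ sketch of that identity, comparing the reference semantics against a borrow-only formulation. The helper names (ucmpRef, ucmpBorrow) are illustrative only, and the PPC carry-flag bookkeeping is deliberately left out; the emitted subc/subfe pair folds the same two borrows into carry arithmetic.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Reference semantics of llvm.ucmp: -1 if a < b, 0 if a == b, +1 if a > b.
static int64_t ucmpRef(uint64_t a, uint64_t b) {
  return (int64_t)(a > b) - (int64_t)(a < b);
}

// Borrow-only formulation: the borrow out of a - b and the borrow out of
// b - a are the two comparison bits; their difference is the -1/0/+1 result.
static int64_t ucmpBorrow(uint64_t a, uint64_t b) {
  uint64_t borrowAB = a < b; // borrow produced by a - b
  uint64_t borrowBA = b < a; // borrow produced by b - a
  return (int64_t)borrowBA - (int64_t)borrowAB;
}

int main() {
  const uint64_t samples[] = {0, 1, 2, (1ULL << 63), UINT64_MAX - 1, UINT64_MAX};
  for (uint64_t a : samples)
    for (uint64_t b : samples)
      assert(ucmpRef(a, b) == ucmpBorrow(a, b));
  std::puts("ucmp identity holds on all sampled pairs");
  return 0;
}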