[llvm] 777a58e - Support {S,U}REMEqFold before legalization

Simonas Kazlauskas via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 31 15:35:49 PDT 2021


Author: Simonas Kazlauskas
Date: 2021-04-01T01:35:41+03:00
New Revision: 777a58e05b22973d902e78091a2e06b99c71b65c

URL: https://github.com/llvm/llvm-project/commit/777a58e05b22973d902e78091a2e06b99c71b65c
DIFF: https://github.com/llvm/llvm-project/commit/777a58e05b22973d902e78091a2e06b99c71b65c.diff

LOG: Support {S,U}REMEqFold before legalization

This allows these optimisations to apply to e.g. `urem i16` directly
before `urem` is promoted to i32 on architectures where i16 operations
are not intrinsically legal (such as on Aarch64). The legalization then
later can happen more directly and generated code gets a chance to avoid
wasting time on computing results in types wider than necessary, in the end.

Seems like mostly an improvement in terms of results at least as far as x86_64 and aarch64 are concerned, with a few regressions here and there. It also helps in preventing regressions in changes like {D87976}.

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D88785

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
    llvm/test/CodeGen/AArch64/srem-seteq.ll
    llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
    llvm/test/CodeGen/AArch64/urem-seteq.ll
    llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
    llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
    llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 04067ffd2303..4f14c7e72409 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5453,7 +5453,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
   EVT ShSVT = ShVT.getScalarType();
 
   // If MUL is unavailable, we cannot proceed in any case.
-  if (!isOperationLegalOrCustom(ISD::MUL, VT))
+  if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
     return SDValue();
 
   bool ComparingWithAllZeros = true;
@@ -5583,7 +5583,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
   }
 
   if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
-    if (!isOperationLegalOrCustom(ISD::SUB, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::SUB, VT))
       return SDValue(); // FIXME: Could/should use `ISD::ADD`?
     assert(CompTargetNode.getValueType() == N.getValueType() &&
            "Expecting that the types on LHS and RHS of comparisons match.");
@@ -5598,7 +5598,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
   // divisors as a performance improvement, since rotating by 0 is a no-op.
   if (HadEvenDivisor) {
     // We need ROTR to do this.
-    if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
       return SDValue();
     SDNodeFlags Flags;
     Flags.setExact(true);
@@ -5628,6 +5628,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
       DAG.getSetCC(DL, SETCCVT, D, CompTargetNode, ISD::SETULE);
   Created.push_back(TautologicalInvertedChannels.getNode());
 
+  // NOTE: we avoid letting illegal types through even if we're before legalize
+  // ops – legalization has a hard time producing good code for this.
   if (isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) {
     // If we have a vector select, let's replace the comparison results in the
     // affected lanes with the correct tautological result.
@@ -5638,6 +5640,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
   }
 
   // Else, we can just invert the comparison result in the appropriate lanes.
+  //
+  // NOTE: see the note above VSELECT above.
   if (isOperationLegalOrCustom(ISD::XOR, SETCCVT))
     return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC,
                        TautologicalInvertedChannels);
@@ -5692,8 +5696,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
   EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
   EVT ShSVT = ShVT.getScalarType();
 
-  // If MUL is unavailable, we cannot proceed in any case.
-  if (!isOperationLegalOrCustom(ISD::MUL, VT))
+  // If we are after ops legalization, and MUL is unavailable, we can not
+  // proceed.
+  if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
     return SDValue();
 
   // TODO: Could support comparing with non-zero too.
@@ -5848,7 +5853,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
 
   if (NeedToApplyOffset) {
     // We need ADD to do this.
-    if (!isOperationLegalOrCustom(ISD::ADD, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ADD, VT))
       return SDValue();
 
     // (add (mul N, P), A)
@@ -5860,7 +5865,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
   // divisors as a performance improvement, since rotating by 0 is a no-op.
   if (HadEvenDivisor) {
     // We need ROTR to do this.
-    if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
       return SDValue();
     SDNodeFlags Flags;
     Flags.setExact(true);
@@ -5883,6 +5888,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
   // we must fix-up results for said lanes.
   assert(VT.isVector() && "Can/should only get here for vectors.");
 
+  // NOTE: we avoid letting illegal types through even if we're before legalize
+  // ops – legalization has a hard time producing good code for the code that
+  // follows.
   if (!isOperationLegalOrCustom(ISD::SETEQ, VT) ||
       !isOperationLegalOrCustom(ISD::AND, VT) ||
       !isOperationLegalOrCustom(Cond, VT) ||

diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 80b8c2ade9b3..f4c86c082332 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -4,14 +4,14 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #33099
-; CHECK-NEXT:    mov w10, #64874
-; CHECK-NEXT:    sbfx w8, w0, #0, #29
-; CHECK-NEXT:    movk w9, #48986, lsl #16
-; CHECK-NEXT:    movk w10, #330, lsl #16
-; CHECK-NEXT:    madd w8, w8, w9, w10
-; CHECK-NEXT:    mov w9, #64213
-; CHECK-NEXT:    movk w9, #661, lsl #16
+; CHECK-NEXT:    mov w8, #33099
+; CHECK-NEXT:    mov w9, #24493
+; CHECK-NEXT:    movk w8, #8026, lsl #16
+; CHECK-NEXT:    movk w9, #41, lsl #16
+; CHECK-NEXT:    madd w8, w0, w8, w9
+; CHECK-NEXT:    mov w9, #48987
+; CHECK-NEXT:    and w8, w8, #0x1fffffff
+; CHECK-NEXT:    movk w9, #82, lsl #16
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
index 3fb4fa03d421..cc6cf4064f59 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -9,20 +9,18 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    adrp x8, .LCPI0_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT:    adrp x8, .LCPI0_2
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_2]
 ; CHECK-NEXT:    adrp x8, .LCPI0_3
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    adrp x8, .LCPI0_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT:    adrp x8, .LCPI0_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    usra v3.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -82,27 +80,19 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    adrp x8, .LCPI3_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT:    adrp x8, .LCPI3_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT:    adrp x8, .LCPI3_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT:    adrp x8, .LCPI3_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_4]
-; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    movk w8, #46811, lsl #16
+; CHECK-NEXT:    movk w9, #4681, lsl #16
+; CHECK-NEXT:    adrp x10, .LCPI3_0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI3_0]
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v0.4s, v2.4s, #31
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -114,29 +104,21 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_ne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT:    adrp x8, .LCPI4_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT:    adrp x8, .LCPI4_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT:    adrp x8, .LCPI4_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_4]
-; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    movk w8, #46811, lsl #16
+; CHECK-NEXT:    movk w9, #4681, lsl #16
+; CHECK-NEXT:    adrp x10, .LCPI4_0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI4_0]
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v0.4s, v2.4s, #31
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
   %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -152,23 +134,18 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT:    adrp x8, .LCPI5_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_1]
-; CHECK-NEXT:    adrp x8, .LCPI5_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI5_2]
 ; CHECK-NEXT:    adrp x8, .LCPI5_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT:    adrp x8, .LCPI5_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI5_2]
 ; CHECK-NEXT:    adrp x8, .LCPI5_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -184,25 +161,20 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
 ; CHECK-NEXT:    adrp x8, .LCPI6_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_1]
-; CHECK-NEXT:    adrp x8, .LCPI6_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI6_2]
 ; CHECK-NEXT:    adrp x8, .LCPI6_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT:    adrp x8, .LCPI6_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI6_2]
 ; CHECK-NEXT:    adrp x8, .LCPI6_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -220,20 +192,18 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
 ; CHECK-NEXT:    adrp x8, .LCPI7_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_1]
-; CHECK-NEXT:    adrp x8, .LCPI7_2
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI7_2]
 ; CHECK-NEXT:    adrp x8, .LCPI7_3
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT:    adrp x8, .LCPI7_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI7_2]
+; CHECK-NEXT:    adrp x8, .LCPI7_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    usra v3.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -251,14 +221,18 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
 ; CHECK-NEXT:    adrp x8, .LCPI8_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_1]
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT:    sshr v3.4s, v1.4s, #3
-; CHECK-NEXT:    usra v3.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    adrp x8, .LCPI8_3
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI8_3]
+; CHECK-NEXT:    adrp x8, .LCPI8_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_2]
+; CHECK-NEXT:    adrp x8, .LCPI8_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_4]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -276,20 +250,18 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_0]
 ; CHECK-NEXT:    adrp x8, .LCPI9_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_1]
-; CHECK-NEXT:    adrp x8, .LCPI9_2
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI9_2]
 ; CHECK-NEXT:    adrp x8, .LCPI9_3
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT:    adrp x8, .LCPI9_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI9_2]
+; CHECK-NEXT:    adrp x8, .LCPI9_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    usra v3.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -328,25 +300,19 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI11_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT:    adrp x8, .LCPI11_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI11_1]
-; CHECK-NEXT:    adrp x8, .LCPI11_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI11_2]
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    adrp x8, .LCPI11_3
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI11_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT:    sshl v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    movk w8, #46811, lsl #16
+; CHECK-NEXT:    movk w9, #4681, lsl #16
+; CHECK-NEXT:    adrp x10, .LCPI11_0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI11_0]
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v0.4s, v2.4s, #31
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -364,24 +330,18 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    adrp x8, .LCPI12_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_1]
-; CHECK-NEXT:    adrp x8, .LCPI12_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI12_2]
 ; CHECK-NEXT:    adrp x8, .LCPI12_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT:    adrp x8, .LCPI12_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI12_2]
 ; CHECK-NEXT:    adrp x8, .LCPI12_4
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -493,23 +453,18 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
 ; CHECK-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_1]
-; CHECK-NEXT:    adrp x8, .LCPI16_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI16_2]
 ; CHECK-NEXT:    adrp x8, .LCPI16_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT:    adrp x8, .LCPI16_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_2]
 ; CHECK-NEXT:    adrp x8, .LCPI16_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -527,23 +482,18 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_1]
-; CHECK-NEXT:    adrp x8, .LCPI17_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI17_2]
 ; CHECK-NEXT:    adrp x8, .LCPI17_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT:    adrp x8, .LCPI17_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_2]
 ; CHECK-NEXT:    adrp x8, .LCPI17_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -561,23 +511,18 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
 ; CHECK-NEXT:    adrp x8, .LCPI18_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_1]
-; CHECK-NEXT:    adrp x8, .LCPI18_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_2]
 ; CHECK-NEXT:    adrp x8, .LCPI18_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT:    adrp x8, .LCPI18_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_2]
 ; CHECK-NEXT:    adrp x8, .LCPI18_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -616,27 +561,19 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_and_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI20_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT:    adrp x8, .LCPI20_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_1]
-; CHECK-NEXT:    adrp x8, .LCPI20_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT:    adrp x8, .LCPI20_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI20_3]
-; CHECK-NEXT:    adrp x8, .LCPI20_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_4]
-; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w9, #9362
+; CHECK-NEXT:    movk w8, #46811, lsl #16
+; CHECK-NEXT:    movk w9, #4681, lsl #16
+; CHECK-NEXT:    adrp x10, .LCPI20_0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI20_0]
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v0.4s, v2.4s, #31
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -654,23 +591,18 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
 ; CHECK-NEXT:    adrp x8, .LCPI21_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI21_1]
-; CHECK-NEXT:    adrp x8, .LCPI21_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI21_2]
 ; CHECK-NEXT:    adrp x8, .LCPI21_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT:    adrp x8, .LCPI21_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI21_2]
 ; CHECK-NEXT:    adrp x8, .LCPI21_4
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI21_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -690,24 +622,18 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
 ; CHECK-NEXT:    adrp x8, .LCPI22_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI22_1]
-; CHECK-NEXT:    adrp x8, .LCPI22_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI22_2]
 ; CHECK-NEXT:    adrp x8, .LCPI22_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT:    adrp x8, .LCPI22_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI22_2]
 ; CHECK-NEXT:    adrp x8, .LCPI22_4
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI22_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -725,21 +651,18 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
 ; CHECK-NEXT:    adrp x8, .LCPI23_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_1]
-; CHECK-NEXT:    adrp x8, .LCPI23_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    adrp x8, .LCPI23_3
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT:    sshl v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT:    adrp x8, .LCPI23_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT:    adrp x8, .LCPI23_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -757,24 +680,18 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_0]
 ; CHECK-NEXT:    adrp x8, .LCPI24_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_1]
-; CHECK-NEXT:    adrp x8, .LCPI24_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI24_2]
 ; CHECK-NEXT:    adrp x8, .LCPI24_3
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT:    adrp x8, .LCPI24_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI24_2]
 ; CHECK-NEXT:    adrp x8, .LCPI24_4
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_4]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_4]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -793,22 +710,18 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_0]
 ; CHECK-NEXT:    adrp x8, .LCPI25_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI25_1]
-; CHECK-NEXT:    adrp x8, .LCPI25_2
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI25_2]
 ; CHECK-NEXT:    adrp x8, .LCPI25_3
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT:    neg v4.4s, v4.4s
-; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
-; CHECK-NEXT:    sshl v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT:    adrp x8, .LCPI25_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI25_2]
+; CHECK-NEXT:    adrp x8, .LCPI25_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_4]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -825,22 +738,18 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_0]
 ; CHECK-NEXT:    adrp x8, .LCPI26_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_1]
-; CHECK-NEXT:    adrp x8, .LCPI26_2
-; CHECK-NEXT:    smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI26_2]
 ; CHECK-NEXT:    adrp x8, .LCPI26_3
-; CHECK-NEXT:    mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT:    neg v4.4s, v4.4s
-; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
-; CHECK-NEXT:    sshl v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #31
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI26_3]
+; CHECK-NEXT:    adrp x8, .LCPI26_2
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI26_2]
+; CHECK-NEXT:    adrp x8, .LCPI26_4
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index 328815b60138..34f7c6db5c39 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -29,17 +29,20 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
-; CHECK-NEXT:    movk w8, #20971, lsl #16
-; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT:    smull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    sshr v3.4s, v2.4s, #5
-; CHECK-NEXT:    movi v1.4s, #100
-; CHECK-NEXT:    usra v3.4s, v2.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #23593
+; CHECK-NEXT:    mov w9, #47184
+; CHECK-NEXT:    movk w8, #49807, lsl #16
+; CHECK-NEXT:    movk w9, #1310, lsl #16
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    mov w10, #23592
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    movk w10, #655, lsl #16
+; CHECK-NEXT:    shl v0.4s, v2.4s, #30
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #2
+; CHECK-NEXT:    dup v3.4s, w10
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -79,17 +82,20 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_neg100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    adrp x8, .LCPI3_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sshr v3.4s, v1.4s, #5
-; CHECK-NEXT:    usra v3.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #23593
+; CHECK-NEXT:    mov w9, #47184
+; CHECK-NEXT:    movk w8, #49807, lsl #16
+; CHECK-NEXT:    movk w9, #1310, lsl #16
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    mov w10, #23592
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    movk w10, #655, lsl #16
+; CHECK-NEXT:    shl v0.4s, v2.4s, #30
+; CHECK-NEXT:    ushr v1.4s, v2.4s, #2
+; CHECK-NEXT:    dup v3.4s, w10
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll
index 943b138124b5..65ffb6db414e 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll
@@ -83,15 +83,13 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind {
 define i16 @test_srem_even(i16 %X) nounwind {
 ; CHECK-LABEL: test_srem_even:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w9, #18725
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    asr w9, w8, #18
-; CHECK-NEXT:    add w8, w9, w8, lsr #31
-; CHECK-NEXT:    mov w9, #14
-; CHECK-NEXT:    msub w8, w8, w9, w0
-; CHECK-NEXT:    tst w8, #0xffff
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mov w9, #4680
+; CHECK-NEXT:    madd w8, w0, w8, w9
+; CHECK-NEXT:    lsl w10, w8, #15
+; CHECK-NEXT:    bfxil w10, w8, #1, #15
+; CHECK-NEXT:    cmp w9, w10, uxth
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %srem = srem i16 %X, 14
   %cmp = icmp ne i16 %srem, 0

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
index b626ce2f598a..a7642088fa26 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
@@ -4,13 +4,10 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #52429
-; CHECK-NEXT:    and w8, w0, #0x1fff
-; CHECK-NEXT:    movk w9, #52428, lsl #16
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    mov w9, #13108
-; CHECK-NEXT:    movk w9, #13107, lsl #16
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w8, #3277
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    and w8, w8, #0x1fff
+; CHECK-NEXT:    cmp w8, #1639 // =1639
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %urem = urem i13 %X, 5
@@ -21,13 +18,14 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #28087
-; CHECK-NEXT:    and w8, w0, #0x7ffffff
-; CHECK-NEXT:    movk w9, #46811, lsl #16
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    mov w9, #9363
-; CHECK-NEXT:    ror w8, w8, #1
-; CHECK-NEXT:    movk w9, #4681, lsl #16
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    movk w8, #1755, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    lsl w9, w8, #26
+; CHECK-NEXT:    bfxil w9, w8, #1, #26
+; CHECK-NEXT:    and w8, w9, #0x7ffffff
+; CHECK-NEXT:    mov w9, #18725
+; CHECK-NEXT:    movk w9, #146, lsl #16
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
@@ -39,12 +37,10 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_setne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #52429
-; CHECK-NEXT:    and w8, w0, #0xf
-; CHECK-NEXT:    movk w9, #52428, lsl #16
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    mov w9, #858993459
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w8, #13
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    cmp w8, #3 // =3
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %urem = urem i4 %X, 5
@@ -55,13 +51,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #57651
-; CHECK-NEXT:    and w8, w0, #0x1ff
-; CHECK-NEXT:    movk w9, #43302, lsl #16
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    mov w9, #17191
-; CHECK-NEXT:    movk w9, #129, lsl #16
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w8, #307
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    and w8, w8, #0x1ff
+; CHECK-NEXT:    cmp w8, #1 // =1
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %urem = urem i9 %X, -5
@@ -72,41 +65,29 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, #43691
-; CHECK-NEXT:    and w8, w0, #0x7ff
-; CHECK-NEXT:    movk w12, #43690, lsl #16
-; CHECK-NEXT:    umull x12, w8, w12
-; CHECK-NEXT:    mov w11, #25663
-; CHECK-NEXT:    mov w13, #6
-; CHECK-NEXT:    lsr x12, x12, #34
-; CHECK-NEXT:    and w10, w2, #0x7ff
-; CHECK-NEXT:    movk w11, #160, lsl #16
-; CHECK-NEXT:    msub w8, w12, w13, w8
-; CHECK-NEXT:    mov w12, #18725
-; CHECK-NEXT:    and w9, w1, #0x7ff
-; CHECK-NEXT:    movk w12, #9362, lsl #16
-; CHECK-NEXT:    umull x11, w10, w11
-; CHECK-NEXT:    adrp x13, .LCPI4_0
-; CHECK-NEXT:    umull x12, w9, w12
-; CHECK-NEXT:    lsr x11, x11, #32
-; CHECK-NEXT:    ldr d0, [x13, :lo12:.LCPI4_0]
-; CHECK-NEXT:    lsr x12, x12, #32
-; CHECK-NEXT:    sub w13, w10, w11
-; CHECK-NEXT:    add w11, w11, w13, lsr #1
-; CHECK-NEXT:    sub w13, w9, w12
-; CHECK-NEXT:    add w12, w12, w13, lsr #1
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov w8, #2043
-; CHECK-NEXT:    lsr w11, w11, #10
-; CHECK-NEXT:    lsr w12, w12, #2
-; CHECK-NEXT:    msub w8, w11, w8, w10
-; CHECK-NEXT:    sub w10, w12, w12, lsl #3
-; CHECK-NEXT:    add w9, w9, w10
-; CHECK-NEXT:    mov v1.h[1], w9
-; CHECK-NEXT:    mov v1.h[2], w8
-; CHECK-NEXT:    bic v1.4h, #248, lsl #8
-; CHECK-NEXT:    cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    adrp x9, .LCPI4_1
+; CHECK-NEXT:    mov v0.h[1], w1
+; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    adrp x9, .LCPI4_3
+; CHECK-NEXT:    mov v0.h[2], w2
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ldr d1, [x9, :lo12:.LCPI4_3]
+; CHECK-NEXT:    mul v0.4h, v0.4h, v2.4h
+; CHECK-NEXT:    adrp x8, .LCPI4_4
+; CHECK-NEXT:    shl v2.4h, v0.4h, #1
+; CHECK-NEXT:    ushl v2.4h, v2.4h, v3.4h
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_4]
+; CHECK-NEXT:    neg v1.4h, v1.4h
+; CHECK-NEXT:    bic v0.4h, #248, lsl #8
+; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    orr v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    bic v0.4h, #248, lsl #8
+; CHECK-NEXT:    cmhi v0.4h, v0.4h, v3.4h
 ; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    umov w1, v0.h[1]
 ; CHECK-NEXT:    umov w2, v0.h[2]

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
index a2ea19dd2314..a3d5c7da7b6d 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
@@ -195,15 +195,12 @@ define i1 @t32_6_5(i32 %X) nounwind {
 define i1 @t16_3_2(i16 %X) nounwind {
 ; CHECK-LABEL: t16_3_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #43691
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    lsr w8, w8, #17
-; CHECK-NEXT:    add w8, w8, w8, lsl #1
-; CHECK-NEXT:    sub w8, w0, w8
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, #2 // =2
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    mov w8, #-21845
+; CHECK-NEXT:    mov w9, #-21846
+; CHECK-NEXT:    madd w8, w0, w8, w9
+; CHECK-NEXT:    mov w9, #21845
+; CHECK-NEXT:    cmp w9, w8, uxth
+; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %urem = urem i16 %X, 3
   %cmp = icmp eq i16 %urem, 2
@@ -213,15 +210,12 @@ define i1 @t16_3_2(i16 %X) nounwind {
 define i1 @t8_3_2(i8 %X) nounwind {
 ; CHECK-LABEL: t8_3_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    mov w9, #171
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    lsr w8, w8, #9
-; CHECK-NEXT:    add w8, w8, w8, lsl #1
-; CHECK-NEXT:    sub w8, w0, w8
+; CHECK-NEXT:    mov w8, #-85
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    sub w8, w8, #86 // =86
 ; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #2 // =2
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    cmp w8, #85 // =85
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %urem = urem i8 %X, 3
   %cmp = icmp eq i8 %urem, 2

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
index 0faeebbacacc..bfdbd2b91d92 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -11,17 +11,14 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_1]
 ; CHECK-NEXT:    adrp x8, .LCPI0_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI0_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -79,17 +76,14 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
 ; CHECK-NEXT:    adrp x8, .LCPI3_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI3_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -107,19 +101,16 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_1]
 ; CHECK-NEXT:    adrp x8, .LCPI4_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
   %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -137,17 +128,14 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_1]
 ; CHECK-NEXT:    adrp x8, .LCPI5_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI5_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI5_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -165,19 +153,16 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_1]
 ; CHECK-NEXT:    adrp x8, .LCPI6_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI6_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI6_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -197,13 +182,14 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_1]
 ; CHECK-NEXT:    adrp x8, .LCPI7_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI7_2]
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    adrp x8, .LCPI7_3
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -223,17 +209,14 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_1]
 ; CHECK-NEXT:    adrp x8, .LCPI8_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI8_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI8_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -253,17 +236,14 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_1]
 ; CHECK-NEXT:    adrp x8, .LCPI9_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI9_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI9_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -299,26 +279,16 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI11_0
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT:    adrp x8, .LCPI11_1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI11_1]
-; CHECK-NEXT:    adrp x8, .LCPI11_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI11_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    adrp x8, .LCPI11_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI11_3]
-; CHECK-NEXT:    adrp x8, .LCPI11_4
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI11_4]
-; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    movk w8, #46811, lsl #16
+; CHECK-NEXT:    adrp x9, .LCPI11_0
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI11_0]
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v1.4s, v0.4s, #31
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -338,20 +308,14 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_1]
 ; CHECK-NEXT:    adrp x8, .LCPI12_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI12_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI12_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_3]
-; CHECK-NEXT:    adrp x8, .LCPI12_4
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -373,13 +337,14 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT:    adrp x8, .LCPI13_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI13_2]
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    adrp x8, .LCPI13_3
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -399,17 +364,14 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT:    adrp x8, .LCPI14_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI14_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI14_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -429,17 +391,14 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT:    adrp x8, .LCPI15_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI15_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI15_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -461,13 +420,14 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_1]
 ; CHECK-NEXT:    adrp x8, .LCPI16_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI16_2]
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    adrp x8, .LCPI16_3
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -487,17 +447,14 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_1]
 ; CHECK-NEXT:    adrp x8, .LCPI17_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI17_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI17_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -517,13 +474,14 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_1]
 ; CHECK-NEXT:    adrp x8, .LCPI18_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_2]
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    adrp x8, .LCPI18_3
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -564,20 +522,14 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_1]
 ; CHECK-NEXT:    adrp x8, .LCPI20_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI20_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_3]
-; CHECK-NEXT:    adrp x8, .LCPI20_4
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI20_4]
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -598,15 +550,13 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI21_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI21_2]
 ; CHECK-NEXT:    adrp x8, .LCPI21_3
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI21_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -629,15 +579,13 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI22_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI22_2]
 ; CHECK-NEXT:    adrp x8, .LCPI22_3
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI22_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -657,20 +605,14 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_1]
 ; CHECK-NEXT:    adrp x8, .LCPI23_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI23_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT:    adrp x8, .LCPI23_4
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -691,15 +633,13 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI24_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI24_2]
 ; CHECK-NEXT:    adrp x8, .LCPI24_3
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI24_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -721,15 +661,13 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK-NEXT:    adrp x8, .LCPI25_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI25_2]
 ; CHECK-NEXT:    adrp x8, .LCPI25_3
-; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT:    neg v3.4s, v3.4s
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -748,20 +686,14 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_1]
 ; CHECK-NEXT:    adrp x8, .LCPI26_2
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI26_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI26_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT:    adrp x8, .LCPI26_4
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_3]
 ; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
index 85abb4d7f830..3cf09fb2d5cd 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -45,18 +45,20 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
 define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_6_part0:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
 ; CHECK-NEXT:    mov w8, #43691
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
-; CHECK-NEXT:    adrp x9, .LCPI2_0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI2_0]
-; CHECK-NEXT:    umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #2
-; CHECK-NEXT:    movi v3.4s, #6
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    dup v2.4s, w8
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov w9, #43690
+; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    movk w9, #10922, lsl #16
+; CHECK-NEXT:    shl v1.4s, v0.4s, #31
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.4s, w9
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
@@ -67,18 +69,19 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
 define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_6_part1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #43691
-; CHECK-NEXT:    movk w8, #43690, lsl #16
-; CHECK-NEXT:    adrp x9, .LCPI3_0
-; CHECK-NEXT:    dup v1.4s, w8
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI3_0]
-; CHECK-NEXT:    umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #2
-; CHECK-NEXT:    movi v3.4s, #6
-; CHECK-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    mov w9, #43691
+; CHECK-NEXT:    movk w9, #43690, lsl #16
+; CHECK-NEXT:    adrp x8, .LCPI3_1
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    shl v1.4s, v0.4s, #31
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
@@ -92,22 +95,16 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_1
+; CHECK-NEXT:    mov w9, #43691
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT:    adrp x8, .LCPI4_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT:    adrp x8, .LCPI4_4
-; CHECK-NEXT:    umull2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT:    neg v2.4s, v2.4s
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI4_4]
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    movk w9, #43690, lsl #16
+; CHECK-NEXT:    dup v3.4s, w9
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    movi d1, #0x00ffffffff0000
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 3>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index ae51708e02ab..5a6add7eb49c 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -26,16 +26,17 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_100:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #34079
-; CHECK-NEXT:    movk w8, #20971, lsl #16
-; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    movi v1.4s, #100
-; CHECK-NEXT:    ushr v2.4s, v2.4s, #5
-; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mov w8, #23593
+; CHECK-NEXT:    movk w8, #49807, lsl #16
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    mov w9, #23592
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    movk w9, #655, lsl #16
+; CHECK-NEXT:    shl v1.4s, v0.4s, #30
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #2
+; CHECK-NEXT:    dup v2.4s, w9
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -74,19 +75,11 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    adrp x8, .LCPI3_1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT:    adrp x8, .LCPI3_2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT:    neg v1.4s, v1.4s
-; CHECK-NEXT:    adrp x8, .LCPI3_3
-; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    neg v3.4s, v3.4s
-; CHECK-NEXT:    ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    shl v1.4s, v0.4s, #30
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #2
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll
index 551d2591ba48..74659f44808b 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -78,14 +78,14 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind {
 define i16 @test_urem_even(i16 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ubfx w8, w0, #1, #15
-; CHECK-NEXT:    mov w9, #18725
-; CHECK-NEXT:    mul w8, w8, w9
-; CHECK-NEXT:    lsr w8, w8, #17
-; CHECK-NEXT:    mov w9, #14
-; CHECK-NEXT:    msub w8, w8, w9, w0
-; CHECK-NEXT:    tst w8, #0xffff
-; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    mov w8, #28087
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    and w9, w8, #0xfffc
+; CHECK-NEXT:    lsr w9, w9, #1
+; CHECK-NEXT:    bfi w9, w8, #15, #17
+; CHECK-NEXT:    ubfx w8, w9, #1, #15
+; CHECK-NEXT:    cmp w8, #2340 // =2340
+; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %urem = urem i16 %X, 14
   %cmp = icmp ne i16 %urem, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 02e1186a4984..9bafe57b786e 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -5,17 +5,12 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 29
-; CHECK-NEXT:    s_mov_b32 s5, 0xa57eb503
-; CHECK-NEXT:    s_movk_i32 s4, 0x63
-; CHECK-NEXT:    v_mul_hi_i32 v1, v0, s5
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
-; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 6, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0x1f5a814b
+; CHECK-NEXT:    s_mov_b32 s5, 0x52bf5b
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0x295fad, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i29 %X, 99

diff  --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index eee9a4e69738..96b1c77d0849 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -5,13 +5,12 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT:    v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    s_movk_i32 s4, 0x1fff
+; CHECK-NEXT:    s_movk_i32 s5, 0x667
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0xccd, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i13 %X, 5
@@ -23,14 +22,14 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7ffffff, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0x6db6db7
+; CHECK-NEXT:    s_mov_b32 s5, 0x924925
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 26, v0
 ; CHECK-NEXT:    v_bfe_u32 v0, v0, 1, 26
-; CHECK-NEXT:    s_mov_b32 s4, 0x92492493
-; CHECK-NEXT:    v_mul_hi_u32 v0, v0, s4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
-; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 14, v0
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i27 %X, 14
@@ -43,12 +42,9 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT:    v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 13, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i4 %X, 5
@@ -60,13 +56,11 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0x2050c9f9
-; CHECK-NEXT:    v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 6, v1
-; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 0x1fb, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    s_movk_i32 s4, 0x1ff
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0x133, v0
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i9 %X, -5

diff  --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index 1fa7c5ca450c..5ba72fcb427c 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -10,32 +10,30 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; ARM5-LABEL: test_srem_odd:
 ; ARM5:       @ %bb.0:
 ; ARM5-NEXT:    ldr r2, .LCPI0_1
-; ARM5-NEXT:    lsl r0, r0, #3
-; ARM5-NEXT:    asr r0, r0, #3
 ; ARM5-NEXT:    ldr r1, .LCPI0_0
 ; ARM5-NEXT:    mla r3, r0, r2, r1
-; ARM5-NEXT:    ldr r1, .LCPI0_2
+; ARM5-NEXT:    ldr r2, .LCPI0_2
 ; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    cmp r3, r1
+; ARM5-NEXT:    bic r1, r3, #-536870912
+; ARM5-NEXT:    cmp r1, r2
 ; ARM5-NEXT:    movlo r0, #1
 ; ARM5-NEXT:    bx lr
 ; ARM5-NEXT:    .p2align 2
 ; ARM5-NEXT:  @ %bb.1:
 ; ARM5-NEXT:  .LCPI0_0:
-; ARM5-NEXT:    .long 21691754 @ 0x14afd6a
+; ARM5-NEXT:    .long 2711469 @ 0x295fad
 ; ARM5-NEXT:  .LCPI0_1:
-; ARM5-NEXT:    .long 3210379595 @ 0xbf5a814b
+; ARM5-NEXT:    .long 526025035 @ 0x1f5a814b
 ; ARM5-NEXT:  .LCPI0_2:
-; ARM5-NEXT:    .long 43383509 @ 0x295fad5
+; ARM5-NEXT:    .long 5422939 @ 0x52bf5b
 ;
 ; ARM6-LABEL: test_srem_odd:
 ; ARM6:       @ %bb.0:
 ; ARM6-NEXT:    ldr r2, .LCPI0_1
-; ARM6-NEXT:    lsl r0, r0, #3
-; ARM6-NEXT:    asr r0, r0, #3
 ; ARM6-NEXT:    ldr r1, .LCPI0_0
-; ARM6-NEXT:    mla r1, r0, r2, r1
+; ARM6-NEXT:    mla r0, r0, r2, r1
 ; ARM6-NEXT:    ldr r2, .LCPI0_2
+; ARM6-NEXT:    bic r1, r0, #-536870912
 ; ARM6-NEXT:    mov r0, #0
 ; ARM6-NEXT:    cmp r1, r2
 ; ARM6-NEXT:    movlo r0, #1
@@ -43,22 +41,22 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; ARM6-NEXT:    .p2align 2
 ; ARM6-NEXT:  @ %bb.1:
 ; ARM6-NEXT:  .LCPI0_0:
-; ARM6-NEXT:    .long 21691754 @ 0x14afd6a
+; ARM6-NEXT:    .long 2711469 @ 0x295fad
 ; ARM6-NEXT:  .LCPI0_1:
-; ARM6-NEXT:    .long 3210379595 @ 0xbf5a814b
+; ARM6-NEXT:    .long 526025035 @ 0x1f5a814b
 ; ARM6-NEXT:  .LCPI0_2:
-; ARM6-NEXT:    .long 43383509 @ 0x295fad5
+; ARM6-NEXT:    .long 5422939 @ 0x52bf5b
 ;
 ; ARM7-LABEL: test_srem_odd:
 ; ARM7:       @ %bb.0:
-; ARM7-NEXT:    movw r1, #64874
+; ARM7-NEXT:    movw r1, #24493
 ; ARM7-NEXT:    movw r2, #33099
-; ARM7-NEXT:    sbfx r0, r0, #0, #29
-; ARM7-NEXT:    movt r1, #330
-; ARM7-NEXT:    movt r2, #48986
-; ARM7-NEXT:    mla r1, r0, r2, r1
-; ARM7-NEXT:    movw r2, #64213
-; ARM7-NEXT:    movt r2, #661
+; ARM7-NEXT:    movt r1, #41
+; ARM7-NEXT:    movt r2, #8026
+; ARM7-NEXT:    mla r0, r0, r2, r1
+; ARM7-NEXT:    movw r2, #48987
+; ARM7-NEXT:    movt r2, #82
+; ARM7-NEXT:    bic r1, r0, #-536870912
 ; ARM7-NEXT:    mov r0, #0
 ; ARM7-NEXT:    cmp r1, r2
 ; ARM7-NEXT:    movwlo r0, #1
@@ -66,14 +64,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ;
 ; ARM8-LABEL: test_srem_odd:
 ; ARM8:       @ %bb.0:
-; ARM8-NEXT:    movw r1, #64874
+; ARM8-NEXT:    movw r1, #24493
 ; ARM8-NEXT:    movw r2, #33099
-; ARM8-NEXT:    sbfx r0, r0, #0, #29
-; ARM8-NEXT:    movt r1, #330
-; ARM8-NEXT:    movt r2, #48986
-; ARM8-NEXT:    mla r1, r0, r2, r1
-; ARM8-NEXT:    movw r2, #64213
-; ARM8-NEXT:    movt r2, #661
+; ARM8-NEXT:    movt r1, #41
+; ARM8-NEXT:    movt r2, #8026
+; ARM8-NEXT:    mla r0, r0, r2, r1
+; ARM8-NEXT:    movw r2, #48987
+; ARM8-NEXT:    movt r2, #82
+; ARM8-NEXT:    bic r1, r0, #-536870912
 ; ARM8-NEXT:    mov r0, #0
 ; ARM8-NEXT:    cmp r1, r2
 ; ARM8-NEXT:    movwlo r0, #1
@@ -81,14 +79,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ;
 ; NEON7-LABEL: test_srem_odd:
 ; NEON7:       @ %bb.0:
-; NEON7-NEXT:    movw r1, #64874
+; NEON7-NEXT:    movw r1, #24493
 ; NEON7-NEXT:    movw r2, #33099
-; NEON7-NEXT:    sbfx r0, r0, #0, #29
-; NEON7-NEXT:    movt r1, #330
-; NEON7-NEXT:    movt r2, #48986
-; NEON7-NEXT:    mla r1, r0, r2, r1
-; NEON7-NEXT:    movw r2, #64213
-; NEON7-NEXT:    movt r2, #661
+; NEON7-NEXT:    movt r1, #41
+; NEON7-NEXT:    movt r2, #8026
+; NEON7-NEXT:    mla r0, r0, r2, r1
+; NEON7-NEXT:    movw r2, #48987
+; NEON7-NEXT:    movt r2, #82
+; NEON7-NEXT:    bic r1, r0, #-536870912
 ; NEON7-NEXT:    mov r0, #0
 ; NEON7-NEXT:    cmp r1, r2
 ; NEON7-NEXT:    movwlo r0, #1
@@ -96,14 +94,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ;
 ; NEON8-LABEL: test_srem_odd:
 ; NEON8:       @ %bb.0:
-; NEON8-NEXT:    movw r1, #64874
+; NEON8-NEXT:    movw r1, #24493
 ; NEON8-NEXT:    movw r2, #33099
-; NEON8-NEXT:    sbfx r0, r0, #0, #29
-; NEON8-NEXT:    movt r1, #330
-; NEON8-NEXT:    movt r2, #48986
-; NEON8-NEXT:    mla r1, r0, r2, r1
-; NEON8-NEXT:    movw r2, #64213
-; NEON8-NEXT:    movt r2, #661
+; NEON8-NEXT:    movt r1, #41
+; NEON8-NEXT:    movt r2, #8026
+; NEON8-NEXT:    mla r0, r0, r2, r1
+; NEON8-NEXT:    movw r2, #48987
+; NEON8-NEXT:    movt r2, #82
+; NEON8-NEXT:    bic r1, r0, #-536870912
 ; NEON8-NEXT:    mov r0, #0
 ; NEON8-NEXT:    cmp r1, r2
 ; NEON8-NEXT:    movwlo r0, #1

diff  --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
index ec1781130b5e..dd20fd07dff0 100644
--- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
@@ -9,90 +9,74 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; ARM5-LABEL: test_urem_odd:
 ; ARM5:       @ %bb.0:
-; ARM5-NEXT:    mov r1, #255
-; ARM5-NEXT:    orr r1, r1, #7936
-; ARM5-NEXT:    and r0, r0, r1
-; ARM5-NEXT:    ldr r1, .LCPI0_0
+; ARM5-NEXT:    mov r1, #205
+; ARM5-NEXT:    orr r1, r1, #3072
 ; ARM5-NEXT:    mul r2, r0, r1
-; ARM5-NEXT:    ldr r1, .LCPI0_1
+; ARM5-NEXT:    mov r0, #255
+; ARM5-NEXT:    orr r0, r0, #7936
+; ARM5-NEXT:    and r1, r2, r0
+; ARM5-NEXT:    mov r2, #103
+; ARM5-NEXT:    orr r2, r2, #1536
 ; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    cmp r2, r1
+; ARM5-NEXT:    cmp r1, r2
 ; ARM5-NEXT:    movlo r0, #1
 ; ARM5-NEXT:    bx lr
-; ARM5-NEXT:    .p2align 2
-; ARM5-NEXT:  @ %bb.1:
-; ARM5-NEXT:  .LCPI0_0:
-; ARM5-NEXT:    .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT:  .LCPI0_1:
-; ARM5-NEXT:    .long 858993460 @ 0x33333334
 ;
 ; ARM6-LABEL: test_urem_odd:
 ; ARM6:       @ %bb.0:
+; ARM6-NEXT:    mov r1, #205
+; ARM6-NEXT:    mov r2, #103
+; ARM6-NEXT:    orr r1, r1, #3072
+; ARM6-NEXT:    orr r2, r2, #1536
+; ARM6-NEXT:    mul r0, r0, r1
 ; ARM6-NEXT:    mov r1, #255
-; ARM6-NEXT:    ldr r2, .LCPI0_1
 ; ARM6-NEXT:    orr r1, r1, #7936
-; ARM6-NEXT:    and r0, r0, r1
-; ARM6-NEXT:    ldr r1, .LCPI0_0
-; ARM6-NEXT:    mul r1, r0, r1
+; ARM6-NEXT:    and r1, r0, r1
 ; ARM6-NEXT:    mov r0, #0
 ; ARM6-NEXT:    cmp r1, r2
 ; ARM6-NEXT:    movlo r0, #1
 ; ARM6-NEXT:    bx lr
-; ARM6-NEXT:    .p2align 2
-; ARM6-NEXT:  @ %bb.1:
-; ARM6-NEXT:  .LCPI0_0:
-; ARM6-NEXT:    .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT:  .LCPI0_1:
-; ARM6-NEXT:    .long 858993460 @ 0x33333334
 ;
 ; ARM7-LABEL: test_urem_odd:
 ; ARM7:       @ %bb.0:
-; ARM7-NEXT:    movw r1, #52429
-; ARM7-NEXT:    bfc r0, #13, #19
-; ARM7-NEXT:    movt r1, #52428
-; ARM7-NEXT:    movw r2, #13108
+; ARM7-NEXT:    movw r1, #3277
+; ARM7-NEXT:    movw r2, #1639
 ; ARM7-NEXT:    mul r1, r0, r1
-; ARM7-NEXT:    movt r2, #13107
 ; ARM7-NEXT:    mov r0, #0
+; ARM7-NEXT:    bfc r1, #13, #19
 ; ARM7-NEXT:    cmp r1, r2
 ; ARM7-NEXT:    movwlo r0, #1
 ; ARM7-NEXT:    bx lr
 ;
 ; ARM8-LABEL: test_urem_odd:
 ; ARM8:       @ %bb.0:
-; ARM8-NEXT:    movw r1, #52429
-; ARM8-NEXT:    bfc r0, #13, #19
-; ARM8-NEXT:    movt r1, #52428
-; ARM8-NEXT:    movw r2, #13108
+; ARM8-NEXT:    movw r1, #3277
+; ARM8-NEXT:    movw r2, #1639
 ; ARM8-NEXT:    mul r1, r0, r1
-; ARM8-NEXT:    movt r2, #13107
 ; ARM8-NEXT:    mov r0, #0
+; ARM8-NEXT:    bfc r1, #13, #19
 ; ARM8-NEXT:    cmp r1, r2
 ; ARM8-NEXT:    movwlo r0, #1
 ; ARM8-NEXT:    bx lr
 ;
 ; NEON7-LABEL: test_urem_odd:
 ; NEON7:       @ %bb.0:
-; NEON7-NEXT:    movw r1, #52429
-; NEON7-NEXT:    bfc r0, #13, #19
-; NEON7-NEXT:    movt r1, #52428
-; NEON7-NEXT:    movw r2, #13108
+; NEON7-NEXT:    movw r1, #3277
+; NEON7-NEXT:    movw r2, #1639
 ; NEON7-NEXT:    mul r1, r0, r1
-; NEON7-NEXT:    movt r2, #13107
 ; NEON7-NEXT:    mov r0, #0
+; NEON7-NEXT:    bfc r1, #13, #19
 ; NEON7-NEXT:    cmp r1, r2
 ; NEON7-NEXT:    movwlo r0, #1
 ; NEON7-NEXT:    bx lr
 ;
 ; NEON8-LABEL: test_urem_odd:
 ; NEON8:       @ %bb.0:
-; NEON8-NEXT:    movw r1, #52429
-; NEON8-NEXT:    bfc r0, #13, #19
-; NEON8-NEXT:    movt r1, #52428
-; NEON8-NEXT:    movw r2, #13108
+; NEON8-NEXT:    movw r1, #3277
+; NEON8-NEXT:    movw r2, #1639
 ; NEON8-NEXT:    mul r1, r0, r1
-; NEON8-NEXT:    movt r2, #13107
 ; NEON8-NEXT:    mov r0, #0
+; NEON8-NEXT:    bfc r1, #13, #19
 ; NEON8-NEXT:    cmp r1, r2
 ; NEON8-NEXT:    movwlo r0, #1
 ; NEON8-NEXT:    bx lr
@@ -105,28 +89,32 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; ARM5-LABEL: test_urem_even:
 ; ARM5:       @ %bb.0:
 ; ARM5-NEXT:    ldr r1, .LCPI1_0
-; ARM5-NEXT:    bic r0, r0, #-134217728
 ; ARM5-NEXT:    mul r2, r0, r1
-; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    ror r1, r2, #1
+; ARM5-NEXT:    bic r0, r2, #-134217727
+; ARM5-NEXT:    lsr r0, r0, #1
+; ARM5-NEXT:    orr r0, r0, r2, lsl #26
 ; ARM5-NEXT:    ldr r2, .LCPI1_1
+; ARM5-NEXT:    bic r1, r0, #-134217728
+; ARM5-NEXT:    mov r0, #0
 ; ARM5-NEXT:    cmp r1, r2
 ; ARM5-NEXT:    movlo r0, #1
 ; ARM5-NEXT:    bx lr
 ; ARM5-NEXT:    .p2align 2
 ; ARM5-NEXT:  @ %bb.1:
 ; ARM5-NEXT:  .LCPI1_0:
-; ARM5-NEXT:    .long 3067833783 @ 0xb6db6db7
+; ARM5-NEXT:    .long 115043767 @ 0x6db6db7
 ; ARM5-NEXT:  .LCPI1_1:
-; ARM5-NEXT:    .long 306783379 @ 0x12492493
+; ARM5-NEXT:    .long 9586981 @ 0x924925
 ;
 ; ARM6-LABEL: test_urem_even:
 ; ARM6:       @ %bb.0:
 ; ARM6-NEXT:    ldr r1, .LCPI1_0
-; ARM6-NEXT:    bic r0, r0, #-134217728
 ; ARM6-NEXT:    ldr r2, .LCPI1_1
 ; ARM6-NEXT:    mul r0, r0, r1
-; ARM6-NEXT:    ror r1, r0, #1
+; ARM6-NEXT:    bic r1, r0, #-134217727
+; ARM6-NEXT:    lsr r1, r1, #1
+; ARM6-NEXT:    orr r0, r1, r0, lsl #26
+; ARM6-NEXT:    bic r1, r0, #-134217728
 ; ARM6-NEXT:    mov r0, #0
 ; ARM6-NEXT:    cmp r1, r2
 ; ARM6-NEXT:    movlo r0, #1
@@ -134,19 +122,20 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; ARM6-NEXT:    .p2align 2
 ; ARM6-NEXT:  @ %bb.1:
 ; ARM6-NEXT:  .LCPI1_0:
-; ARM6-NEXT:    .long 3067833783 @ 0xb6db6db7
+; ARM6-NEXT:    .long 115043767 @ 0x6db6db7
 ; ARM6-NEXT:  .LCPI1_1:
-; ARM6-NEXT:    .long 306783379 @ 0x12492493
+; ARM6-NEXT:    .long 9586981 @ 0x924925
 ;
 ; ARM7-LABEL: test_urem_even:
 ; ARM7:       @ %bb.0:
 ; ARM7-NEXT:    movw r1, #28087
-; ARM7-NEXT:    bic r0, r0, #-134217728
-; ARM7-NEXT:    movt r1, #46811
-; ARM7-NEXT:    movw r2, #9363
+; ARM7-NEXT:    movw r2, #18725
+; ARM7-NEXT:    movt r1, #1755
+; ARM7-NEXT:    movt r2, #146
 ; ARM7-NEXT:    mul r0, r0, r1
-; ARM7-NEXT:    movt r2, #4681
-; ARM7-NEXT:    ror r1, r0, #1
+; ARM7-NEXT:    ubfx r1, r0, #1, #26
+; ARM7-NEXT:    orr r0, r1, r0, lsl #26
+; ARM7-NEXT:    bic r1, r0, #-134217728
 ; ARM7-NEXT:    mov r0, #0
 ; ARM7-NEXT:    cmp r1, r2
 ; ARM7-NEXT:    movwlo r0, #1
@@ -155,12 +144,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; ARM8-LABEL: test_urem_even:
 ; ARM8:       @ %bb.0:
 ; ARM8-NEXT:    movw r1, #28087
-; ARM8-NEXT:    bic r0, r0, #-134217728
-; ARM8-NEXT:    movt r1, #46811
-; ARM8-NEXT:    movw r2, #9363
+; ARM8-NEXT:    movw r2, #18725
+; ARM8-NEXT:    movt r1, #1755
+; ARM8-NEXT:    movt r2, #146
 ; ARM8-NEXT:    mul r0, r0, r1
-; ARM8-NEXT:    movt r2, #4681
-; ARM8-NEXT:    ror r1, r0, #1
+; ARM8-NEXT:    ubfx r1, r0, #1, #26
+; ARM8-NEXT:    orr r0, r1, r0, lsl #26
+; ARM8-NEXT:    bic r1, r0, #-134217728
 ; ARM8-NEXT:    mov r0, #0
 ; ARM8-NEXT:    cmp r1, r2
 ; ARM8-NEXT:    movwlo r0, #1
@@ -169,12 +159,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; NEON7-LABEL: test_urem_even:
 ; NEON7:       @ %bb.0:
 ; NEON7-NEXT:    movw r1, #28087
-; NEON7-NEXT:    bic r0, r0, #-134217728
-; NEON7-NEXT:    movt r1, #46811
-; NEON7-NEXT:    movw r2, #9363
+; NEON7-NEXT:    movw r2, #18725
+; NEON7-NEXT:    movt r1, #1755
+; NEON7-NEXT:    movt r2, #146
 ; NEON7-NEXT:    mul r0, r0, r1
-; NEON7-NEXT:    movt r2, #4681
-; NEON7-NEXT:    ror r1, r0, #1
+; NEON7-NEXT:    ubfx r1, r0, #1, #26
+; NEON7-NEXT:    orr r0, r1, r0, lsl #26
+; NEON7-NEXT:    bic r1, r0, #-134217728
 ; NEON7-NEXT:    mov r0, #0
 ; NEON7-NEXT:    cmp r1, r2
 ; NEON7-NEXT:    movwlo r0, #1
@@ -183,12 +174,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; NEON8-LABEL: test_urem_even:
 ; NEON8:       @ %bb.0:
 ; NEON8-NEXT:    movw r1, #28087
-; NEON8-NEXT:    bic r0, r0, #-134217728
-; NEON8-NEXT:    movt r1, #46811
-; NEON8-NEXT:    movw r2, #9363
+; NEON8-NEXT:    movw r2, #18725
+; NEON8-NEXT:    movt r1, #1755
+; NEON8-NEXT:    movt r2, #146
 ; NEON8-NEXT:    mul r0, r0, r1
-; NEON8-NEXT:    movt r2, #4681
-; NEON8-NEXT:    ror r1, r0, #1
+; NEON8-NEXT:    ubfx r1, r0, #1, #26
+; NEON8-NEXT:    orr r0, r1, r0, lsl #26
+; NEON8-NEXT:    bic r1, r0, #-134217728
 ; NEON8-NEXT:    mov r0, #0
 ; NEON8-NEXT:    cmp r1, r2
 ; NEON8-NEXT:    movwlo r0, #1
@@ -201,87 +193,61 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; ARM5-LABEL: test_urem_odd_setne:
 ; ARM5:       @ %bb.0:
-; ARM5-NEXT:    ldr r1, .LCPI2_0
-; ARM5-NEXT:    and r0, r0, #15
+; ARM5-NEXT:    mov r1, #13
 ; ARM5-NEXT:    mul r2, r0, r1
-; ARM5-NEXT:    ldr r1, .LCPI2_1
 ; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    cmp r2, r1
+; ARM5-NEXT:    and r1, r2, #15
+; ARM5-NEXT:    cmp r1, #3
 ; ARM5-NEXT:    movhi r0, #1
 ; ARM5-NEXT:    bx lr
-; ARM5-NEXT:    .p2align 2
-; ARM5-NEXT:  @ %bb.1:
-; ARM5-NEXT:  .LCPI2_0:
-; ARM5-NEXT:    .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT:  .LCPI2_1:
-; ARM5-NEXT:    .long 858993459 @ 0x33333333
 ;
 ; ARM6-LABEL: test_urem_odd_setne:
 ; ARM6:       @ %bb.0:
-; ARM6-NEXT:    ldr r1, .LCPI2_0
-; ARM6-NEXT:    and r0, r0, #15
-; ARM6-NEXT:    ldr r2, .LCPI2_1
-; ARM6-NEXT:    mul r1, r0, r1
+; ARM6-NEXT:    mov r1, #13
+; ARM6-NEXT:    mul r0, r0, r1
+; ARM6-NEXT:    and r1, r0, #15
 ; ARM6-NEXT:    mov r0, #0
-; ARM6-NEXT:    cmp r1, r2
+; ARM6-NEXT:    cmp r1, #3
 ; ARM6-NEXT:    movhi r0, #1
 ; ARM6-NEXT:    bx lr
-; ARM6-NEXT:    .p2align 2
-; ARM6-NEXT:  @ %bb.1:
-; ARM6-NEXT:  .LCPI2_0:
-; ARM6-NEXT:    .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT:  .LCPI2_1:
-; ARM6-NEXT:    .long 858993459 @ 0x33333333
 ;
 ; ARM7-LABEL: test_urem_odd_setne:
 ; ARM7:       @ %bb.0:
-; ARM7-NEXT:    movw r1, #52429
-; ARM7-NEXT:    and r0, r0, #15
-; ARM7-NEXT:    movt r1, #52428
-; ARM7-NEXT:    movw r2, #13107
-; ARM7-NEXT:    mul r1, r0, r1
-; ARM7-NEXT:    movt r2, #13107
+; ARM7-NEXT:    mov r1, #13
+; ARM7-NEXT:    mul r0, r0, r1
+; ARM7-NEXT:    and r1, r0, #15
 ; ARM7-NEXT:    mov r0, #0
-; ARM7-NEXT:    cmp r1, r2
+; ARM7-NEXT:    cmp r1, #3
 ; ARM7-NEXT:    movwhi r0, #1
 ; ARM7-NEXT:    bx lr
 ;
 ; ARM8-LABEL: test_urem_odd_setne:
 ; ARM8:       @ %bb.0:
-; ARM8-NEXT:    movw r1, #52429
-; ARM8-NEXT:    and r0, r0, #15
-; ARM8-NEXT:    movt r1, #52428
-; ARM8-NEXT:    movw r2, #13107
-; ARM8-NEXT:    mul r1, r0, r1
-; ARM8-NEXT:    movt r2, #13107
+; ARM8-NEXT:    mov r1, #13
+; ARM8-NEXT:    mul r0, r0, r1
+; ARM8-NEXT:    and r1, r0, #15
 ; ARM8-NEXT:    mov r0, #0
-; ARM8-NEXT:    cmp r1, r2
+; ARM8-NEXT:    cmp r1, #3
 ; ARM8-NEXT:    movwhi r0, #1
 ; ARM8-NEXT:    bx lr
 ;
 ; NEON7-LABEL: test_urem_odd_setne:
 ; NEON7:       @ %bb.0:
-; NEON7-NEXT:    movw r1, #52429
-; NEON7-NEXT:    and r0, r0, #15
-; NEON7-NEXT:    movt r1, #52428
-; NEON7-NEXT:    movw r2, #13107
-; NEON7-NEXT:    mul r1, r0, r1
-; NEON7-NEXT:    movt r2, #13107
+; NEON7-NEXT:    mov r1, #13
+; NEON7-NEXT:    mul r0, r0, r1
+; NEON7-NEXT:    and r1, r0, #15
 ; NEON7-NEXT:    mov r0, #0
-; NEON7-NEXT:    cmp r1, r2
+; NEON7-NEXT:    cmp r1, #3
 ; NEON7-NEXT:    movwhi r0, #1
 ; NEON7-NEXT:    bx lr
 ;
 ; NEON8-LABEL: test_urem_odd_setne:
 ; NEON8:       @ %bb.0:
-; NEON8-NEXT:    movw r1, #52429
-; NEON8-NEXT:    and r0, r0, #15
-; NEON8-NEXT:    movt r1, #52428
-; NEON8-NEXT:    movw r2, #13107
-; NEON8-NEXT:    mul r1, r0, r1
-; NEON8-NEXT:    movt r2, #13107
+; NEON8-NEXT:    mov r1, #13
+; NEON8-NEXT:    mul r0, r0, r1
+; NEON8-NEXT:    and r1, r0, #15
 ; NEON8-NEXT:    mov r0, #0
-; NEON8-NEXT:    cmp r1, r2
+; NEON8-NEXT:    cmp r1, #3
 ; NEON8-NEXT:    movwhi r0, #1
 ; NEON8-NEXT:    bx lr
   %urem = urem i4 %X, 5
@@ -292,91 +258,67 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; ARM5-LABEL: test_urem_negative_odd:
 ; ARM5:       @ %bb.0:
-; ARM5-NEXT:    mov r1, #255
+; ARM5-NEXT:    mov r1, #51
 ; ARM5-NEXT:    orr r1, r1, #256
-; ARM5-NEXT:    and r0, r0, r1
-; ARM5-NEXT:    ldr r1, .LCPI3_0
 ; ARM5-NEXT:    mul r2, r0, r1
-; ARM5-NEXT:    ldr r1, .LCPI3_1
+; ARM5-NEXT:    mov r0, #255
+; ARM5-NEXT:    orr r0, r0, #256
+; ARM5-NEXT:    and r1, r2, r0
 ; ARM5-NEXT:    mov r0, #0
-; ARM5-NEXT:    cmp r2, r1
+; ARM5-NEXT:    cmp r1, #1
 ; ARM5-NEXT:    movhi r0, #1
 ; ARM5-NEXT:    bx lr
-; ARM5-NEXT:    .p2align 2
-; ARM5-NEXT:  @ %bb.1:
-; ARM5-NEXT:  .LCPI3_0:
-; ARM5-NEXT:    .long 2837897523 @ 0xa926e133
-; ARM5-NEXT:  .LCPI3_1:
-; ARM5-NEXT:    .long 8471335 @ 0x814327
 ;
 ; ARM6-LABEL: test_urem_negative_odd:
 ; ARM6:       @ %bb.0:
+; ARM6-NEXT:    mov r1, #51
+; ARM6-NEXT:    orr r1, r1, #256
+; ARM6-NEXT:    mul r0, r0, r1
 ; ARM6-NEXT:    mov r1, #255
-; ARM6-NEXT:    ldr r2, .LCPI3_1
 ; ARM6-NEXT:    orr r1, r1, #256
-; ARM6-NEXT:    and r0, r0, r1
-; ARM6-NEXT:    ldr r1, .LCPI3_0
-; ARM6-NEXT:    mul r1, r0, r1
+; ARM6-NEXT:    and r1, r0, r1
 ; ARM6-NEXT:    mov r0, #0
-; ARM6-NEXT:    cmp r1, r2
+; ARM6-NEXT:    cmp r1, #1
 ; ARM6-NEXT:    movhi r0, #1
 ; ARM6-NEXT:    bx lr
-; ARM6-NEXT:    .p2align 2
-; ARM6-NEXT:  @ %bb.1:
-; ARM6-NEXT:  .LCPI3_0:
-; ARM6-NEXT:    .long 2837897523 @ 0xa926e133
-; ARM6-NEXT:  .LCPI3_1:
-; ARM6-NEXT:    .long 8471335 @ 0x814327
 ;
 ; ARM7-LABEL: test_urem_negative_odd:
 ; ARM7:       @ %bb.0:
-; ARM7-NEXT:    movw r1, #57651
-; ARM7-NEXT:    bfc r0, #9, #23
-; ARM7-NEXT:    movt r1, #43302
-; ARM7-NEXT:    movw r2, #17191
+; ARM7-NEXT:    movw r1, #307
 ; ARM7-NEXT:    mul r1, r0, r1
-; ARM7-NEXT:    movt r2, #129
 ; ARM7-NEXT:    mov r0, #0
-; ARM7-NEXT:    cmp r1, r2
+; ARM7-NEXT:    bfc r1, #9, #23
+; ARM7-NEXT:    cmp r1, #1
 ; ARM7-NEXT:    movwhi r0, #1
 ; ARM7-NEXT:    bx lr
 ;
 ; ARM8-LABEL: test_urem_negative_odd:
 ; ARM8:       @ %bb.0:
-; ARM8-NEXT:    movw r1, #57651
-; ARM8-NEXT:    bfc r0, #9, #23
-; ARM8-NEXT:    movt r1, #43302
-; ARM8-NEXT:    movw r2, #17191
+; ARM8-NEXT:    movw r1, #307
 ; ARM8-NEXT:    mul r1, r0, r1
-; ARM8-NEXT:    movt r2, #129
 ; ARM8-NEXT:    mov r0, #0
-; ARM8-NEXT:    cmp r1, r2
+; ARM8-NEXT:    bfc r1, #9, #23
+; ARM8-NEXT:    cmp r1, #1
 ; ARM8-NEXT:    movwhi r0, #1
 ; ARM8-NEXT:    bx lr
 ;
 ; NEON7-LABEL: test_urem_negative_odd:
 ; NEON7:       @ %bb.0:
-; NEON7-NEXT:    movw r1, #57651
-; NEON7-NEXT:    bfc r0, #9, #23
-; NEON7-NEXT:    movt r1, #43302
-; NEON7-NEXT:    movw r2, #17191
+; NEON7-NEXT:    movw r1, #307
 ; NEON7-NEXT:    mul r1, r0, r1
-; NEON7-NEXT:    movt r2, #129
 ; NEON7-NEXT:    mov r0, #0
-; NEON7-NEXT:    cmp r1, r2
+; NEON7-NEXT:    bfc r1, #9, #23
+; NEON7-NEXT:    cmp r1, #1
 ; NEON7-NEXT:    movwhi r0, #1
 ; NEON7-NEXT:    bx lr
 ;
 ; NEON8-LABEL: test_urem_negative_odd:
 ; NEON8:       @ %bb.0:
-; NEON8-NEXT:    movw r1, #57651
-; NEON8-NEXT:    bfc r0, #9, #23
-; NEON8-NEXT:    movt r1, #43302
-; NEON8-NEXT:    movw r2, #17191
+; NEON8-NEXT:    movw r1, #307
 ; NEON8-NEXT:    mul r1, r0, r1
-; NEON8-NEXT:    movt r2, #129
 ; NEON8-NEXT:    mov r0, #0
-; NEON8-NEXT:    cmp r1, r2
+; NEON8-NEXT:    bfc r1, #9, #23
+; NEON8-NEXT:    cmp r1, #1
 ; NEON8-NEXT:    movwhi r0, #1
 ; NEON8-NEXT:    bx lr
   %urem = urem i9 %X, -5
@@ -388,289 +330,291 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; ARM5-LABEL: test_urem_vec:
 ; ARM5:       @ %bb.0:
 ; ARM5-NEXT:    push {r4, r5, r11, lr}
+; ARM5-NEXT:    mov r3, #183
+; ARM5-NEXT:    mvn r12, #182
+; ARM5-NEXT:    orr r3, r3, #1280
+; ARM5-NEXT:    sub r12, r12, #1280
+; ARM5-NEXT:    mov r4, #51
+; ARM5-NEXT:    mla lr, r1, r3, r12
 ; ARM5-NEXT:    mov r12, #255
-; ARM5-NEXT:    ldr r3, .LCPI4_1
 ; ARM5-NEXT:    orr r12, r12, #1792
-; ARM5-NEXT:    ldr lr, .LCPI4_0
-; ARM5-NEXT:    and r1, r1, r12
-; ARM5-NEXT:    and r2, r2, r12
-; ARM5-NEXT:    and r0, r0, r12
-; ARM5-NEXT:    mla r4, r1, r3, lr
-; ARM5-NEXT:    ldr r1, .LCPI4_2
-; ARM5-NEXT:    ldr lr, .LCPI4_3
+; ARM5-NEXT:    orr r4, r4, #768
 ; ARM5-NEXT:    mov r3, #0
-; ARM5-NEXT:    cmp r4, r1
-; ARM5-NEXT:    ldr r4, .LCPI4_4
+; ARM5-NEXT:    and r1, lr, r12
+; ARM5-NEXT:    mvn lr, #101
+; ARM5-NEXT:    sub lr, lr, #1536
+; ARM5-NEXT:    cmp r1, #292
+; ARM5-NEXT:    mla r5, r2, r4, lr
 ; ARM5-NEXT:    mov r1, #0
 ; ARM5-NEXT:    movhi r1, #1
-; ARM5-NEXT:    mla r5, r2, r4, lr
-; ARM5-NEXT:    ldr r2, .LCPI4_5
-; ARM5-NEXT:    cmp r5, r2
-; ARM5-NEXT:    ldr r5, .LCPI4_6
+; ARM5-NEXT:    and r2, r5, r12
+; ARM5-NEXT:    mov r5, #171
+; ARM5-NEXT:    orr r5, r5, #512
+; ARM5-NEXT:    cmp r2, #1
 ; ARM5-NEXT:    mov r2, #0
-; ARM5-NEXT:    movhi r2, #1
 ; ARM5-NEXT:    mul r4, r0, r5
-; ARM5-NEXT:    ldr r5, .LCPI4_7
-; ARM5-NEXT:    ror r0, r4, #1
-; ARM5-NEXT:    cmp r0, r5
+; ARM5-NEXT:    mov r0, #1020
+; ARM5-NEXT:    orr r0, r0, #1024
+; ARM5-NEXT:    mov r5, #254
+; ARM5-NEXT:    movhi r2, #1
+; ARM5-NEXT:    orr r5, r5, #1792
+; ARM5-NEXT:    and r0, r4, r0
+; ARM5-NEXT:    lsr r0, r0, #1
+; ARM5-NEXT:    orr r0, r0, r4, lsl #10
+; ARM5-NEXT:    and r0, r0, r5
+; ARM5-NEXT:    lsr r0, r0, #1
+; ARM5-NEXT:    cmp r0, #170
 ; ARM5-NEXT:    movhi r3, #1
 ; ARM5-NEXT:    mov r0, r3
 ; ARM5-NEXT:    pop {r4, r5, r11, pc}
-; ARM5-NEXT:    .p2align 2
-; ARM5-NEXT:  @ %bb.1:
-; ARM5-NEXT:  .LCPI4_0:
-; ARM5-NEXT:    .long 1227133513 @ 0x49249249
-; ARM5-NEXT:  .LCPI4_1:
-; ARM5-NEXT:    .long 3067833783 @ 0xb6db6db7
-; ARM5-NEXT:  .LCPI4_2:
-; ARM5-NEXT:    .long 613566756 @ 0x24924924
-; ARM5-NEXT:  .LCPI4_3:
-; ARM5-NEXT:    .long 4191955354 @ 0xf9dc299a
-; ARM5-NEXT:  .LCPI4_4:
-; ARM5-NEXT:    .long 2198989619 @ 0x8311eb33
-; ARM5-NEXT:  .LCPI4_5:
-; ARM5-NEXT:    .long 2102284 @ 0x20140c
-; ARM5-NEXT:  .LCPI4_6:
-; ARM5-NEXT:    .long 2863311531 @ 0xaaaaaaab
-; ARM5-NEXT:  .LCPI4_7:
-; ARM5-NEXT:    .long 715827882 @ 0x2aaaaaaa
 ;
 ; ARM6-LABEL: test_urem_vec:
 ; ARM6:       @ %bb.0:
 ; ARM6-NEXT:    push {r4, lr}
+; ARM6-NEXT:    mov r4, #51
+; ARM6-NEXT:    mvn lr, #101
+; ARM6-NEXT:    orr r4, r4, #768
+; ARM6-NEXT:    sub lr, lr, #1536
+; ARM6-NEXT:    mov r3, #183
+; ARM6-NEXT:    mvn r12, #182
+; ARM6-NEXT:    mla r2, r2, r4, lr
+; ARM6-NEXT:    mov r4, #171
+; ARM6-NEXT:    orr r4, r4, #512
+; ARM6-NEXT:    orr r3, r3, #1280
+; ARM6-NEXT:    sub r12, r12, #1280
+; ARM6-NEXT:    mul r0, r0, r4
+; ARM6-NEXT:    mov r4, #1020
+; ARM6-NEXT:    orr r4, r4, #1024
+; ARM6-NEXT:    mla r1, r1, r3, r12
 ; ARM6-NEXT:    mov r12, #255
-; ARM6-NEXT:    ldr r3, .LCPI4_1
 ; ARM6-NEXT:    orr r12, r12, #1792
-; ARM6-NEXT:    ldr lr, .LCPI4_0
-; ARM6-NEXT:    and r1, r1, r12
-; ARM6-NEXT:    ldr r4, .LCPI4_4
 ; ARM6-NEXT:    and r2, r2, r12
-; ARM6-NEXT:    and r0, r0, r12
-; ARM6-NEXT:    mla r1, r1, r3, lr
-; ARM6-NEXT:    ldr lr, .LCPI4_2
 ; ARM6-NEXT:    mov r3, #0
-; ARM6-NEXT:    cmp r1, lr
-; ARM6-NEXT:    ldr lr, .LCPI4_3
-; ARM6-NEXT:    mla r2, r2, r4, lr
-; ARM6-NEXT:    ldr r4, .LCPI4_5
+; ARM6-NEXT:    and r4, r0, r4
+; ARM6-NEXT:    lsr r4, r4, #1
+; ARM6-NEXT:    orr r0, r4, r0, lsl #10
+; ARM6-NEXT:    mov r4, #254
+; ARM6-NEXT:    and r1, r1, r12
+; ARM6-NEXT:    orr r4, r4, #1792
+; ARM6-NEXT:    cmp r1, #292
 ; ARM6-NEXT:    mov r1, #0
+; ARM6-NEXT:    and r0, r0, r4
 ; ARM6-NEXT:    movhi r1, #1
-; ARM6-NEXT:    cmp r2, r4
-; ARM6-NEXT:    ldr r4, .LCPI4_6
+; ARM6-NEXT:    cmp r2, #1
 ; ARM6-NEXT:    mov r2, #0
+; ARM6-NEXT:    lsr r0, r0, #1
 ; ARM6-NEXT:    movhi r2, #1
-; ARM6-NEXT:    mul r0, r0, r4
-; ARM6-NEXT:    ldr r4, .LCPI4_7
-; ARM6-NEXT:    ror r0, r0, #1
-; ARM6-NEXT:    cmp r0, r4
+; ARM6-NEXT:    cmp r0, #170
 ; ARM6-NEXT:    movhi r3, #1
 ; ARM6-NEXT:    mov r0, r3
 ; ARM6-NEXT:    pop {r4, pc}
-; ARM6-NEXT:    .p2align 2
-; ARM6-NEXT:  @ %bb.1:
-; ARM6-NEXT:  .LCPI4_0:
-; ARM6-NEXT:    .long 1227133513 @ 0x49249249
-; ARM6-NEXT:  .LCPI4_1:
-; ARM6-NEXT:    .long 3067833783 @ 0xb6db6db7
-; ARM6-NEXT:  .LCPI4_2:
-; ARM6-NEXT:    .long 613566756 @ 0x24924924
-; ARM6-NEXT:  .LCPI4_3:
-; ARM6-NEXT:    .long 4191955354 @ 0xf9dc299a
-; ARM6-NEXT:  .LCPI4_4:
-; ARM6-NEXT:    .long 2198989619 @ 0x8311eb33
-; ARM6-NEXT:  .LCPI4_5:
-; ARM6-NEXT:    .long 2102284 @ 0x20140c
-; ARM6-NEXT:  .LCPI4_6:
-; ARM6-NEXT:    .long 2863311531 @ 0xaaaaaaab
-; ARM6-NEXT:  .LCPI4_7:
-; ARM6-NEXT:    .long 715827882 @ 0x2aaaaaaa
 ;
 ; ARM7-LABEL: test_urem_vec:
 ; ARM7:       @ %bb.0:
-; ARM7-NEXT:    push {r4, lr}
-; ARM7-NEXT:    movw r3, #18725
-; ARM7-NEXT:    bfc r1, #11, #21
-; ARM7-NEXT:    movt r3, #9362
-; ARM7-NEXT:    bfc r2, #11, #21
-; ARM7-NEXT:    umull r3, r12, r1, r3
-; ARM7-NEXT:    bfc r0, #11, #21
-; ARM7-NEXT:    movw r3, #25663
-; ARM7-NEXT:    movt r3, #160
-; ARM7-NEXT:    umull r3, lr, r2, r3
-; ARM7-NEXT:    vldr d17, .LCPI4_0
-; ARM7-NEXT:    movw r3, #43691
-; ARM7-NEXT:    movt r3, #43690
-; ARM7-NEXT:    umull r3, r4, r0, r3
-; ARM7-NEXT:    sub r3, r1, r12
-; ARM7-NEXT:    add r3, r12, r3, lsr #1
-; ARM7-NEXT:    lsr r12, r3, #2
-; ARM7-NEXT:    sub r3, r2, lr
-; ARM7-NEXT:    lsr r4, r4, #2
-; ARM7-NEXT:    add r4, r4, r4, lsl #1
-; ARM7-NEXT:    add r3, lr, r3, lsr #1
-; ARM7-NEXT:    sub r0, r0, r4, lsl #1
-; ARM7-NEXT:    lsr lr, r3, #10
-; ARM7-NEXT:    movw r3, #2043
 ; ARM7-NEXT:    vmov.16 d16[0], r0
-; ARM7-NEXT:    sub r0, r12, r12, lsl #3
-; ARM7-NEXT:    mls r2, lr, r3, r2
-; ARM7-NEXT:    add r0, r1, r0
-; ARM7-NEXT:    vmov.16 d16[1], r0
+; ARM7-NEXT:    vldr d17, .LCPI4_0
+; ARM7-NEXT:    vmov.16 d16[1], r1
+; ARM7-NEXT:    vldr d19, .LCPI4_3
 ; ARM7-NEXT:    vmov.16 d16[2], r2
+; ARM7-NEXT:    vsub.i16 d16, d16, d17
+; ARM7-NEXT:    vldr d17, .LCPI4_1
+; ARM7-NEXT:    vmul.i16 d16, d16, d17
+; ARM7-NEXT:    vldr d17, .LCPI4_2
+; ARM7-NEXT:    vneg.s16 d17, d17
+; ARM7-NEXT:    vshl.i16 d18, d16, #1
 ; ARM7-NEXT:    vbic.i16 d16, #0xf800
-; ARM7-NEXT:    vceq.i16 d16, d16, d17
-; ARM7-NEXT:    vmvn d16, d16
+; ARM7-NEXT:    vshl.u16 d16, d16, d17
+; ARM7-NEXT:    vshl.u16 d17, d18, d19
+; ARM7-NEXT:    vorr d16, d16, d17
+; ARM7-NEXT:    vldr d17, .LCPI4_4
+; ARM7-NEXT:    vbic.i16 d16, #0xf800
+; ARM7-NEXT:    vcgt.u16 d16, d16, d17
 ; ARM7-NEXT:    vmov.u16 r0, d16[0]
 ; ARM7-NEXT:    vmov.u16 r1, d16[1]
 ; ARM7-NEXT:    vmov.u16 r2, d16[2]
-; ARM7-NEXT:    pop {r4, pc}
+; ARM7-NEXT:    bx lr
 ; ARM7-NEXT:    .p2align 3
 ; ARM7-NEXT:  @ %bb.1:
 ; ARM7-NEXT:  .LCPI4_0:
 ; ARM7-NEXT:    .short 0 @ 0x0
 ; ARM7-NEXT:    .short 1 @ 0x1
 ; ARM7-NEXT:    .short 2 @ 0x2
+; ARM7-NEXT:    .zero 2
+; ARM7-NEXT:  .LCPI4_1:
+; ARM7-NEXT:    .short 683 @ 0x2ab
+; ARM7-NEXT:    .short 1463 @ 0x5b7
+; ARM7-NEXT:    .short 819 @ 0x333
+; ARM7-NEXT:    .zero 2
+; ARM7-NEXT:  .LCPI4_2:
+; ARM7-NEXT:    .short 1 @ 0x1
+; ARM7-NEXT:    .short 0 @ 0x0
+; ARM7-NEXT:    .short 0 @ 0x0
+; ARM7-NEXT:    .short 0 @ 0x0
+; ARM7-NEXT:  .LCPI4_3:
+; ARM7-NEXT:    .short 9 @ 0x9
+; ARM7-NEXT:    .short 10 @ 0xa
+; ARM7-NEXT:    .short 10 @ 0xa
+; ARM7-NEXT:    .short 10 @ 0xa
+; ARM7-NEXT:  .LCPI4_4:
+; ARM7-NEXT:    .short 341 @ 0x155
+; ARM7-NEXT:    .short 292 @ 0x124
+; ARM7-NEXT:    .short 1 @ 0x1
 ; ARM7-NEXT:    .short 0 @ 0x0
 ;
 ; ARM8-LABEL: test_urem_vec:
 ; ARM8:       @ %bb.0:
-; ARM8-NEXT:    push {r4, lr}
-; ARM8-NEXT:    movw r3, #18725
-; ARM8-NEXT:    bfc r1, #11, #21
-; ARM8-NEXT:    movt r3, #9362
-; ARM8-NEXT:    bfc r2, #11, #21
-; ARM8-NEXT:    umull r3, r12, r1, r3
-; ARM8-NEXT:    bfc r0, #11, #21
-; ARM8-NEXT:    movw r3, #25663
-; ARM8-NEXT:    movt r3, #160
-; ARM8-NEXT:    umull r3, lr, r2, r3
-; ARM8-NEXT:    vldr d17, .LCPI4_0
-; ARM8-NEXT:    movw r3, #43691
-; ARM8-NEXT:    movt r3, #43690
-; ARM8-NEXT:    umull r3, r4, r0, r3
-; ARM8-NEXT:    sub r3, r1, r12
-; ARM8-NEXT:    add r3, r12, r3, lsr #1
-; ARM8-NEXT:    lsr r12, r3, #2
-; ARM8-NEXT:    sub r3, r2, lr
-; ARM8-NEXT:    lsr r4, r4, #2
-; ARM8-NEXT:    add r4, r4, r4, lsl #1
-; ARM8-NEXT:    add r3, lr, r3, lsr #1
-; ARM8-NEXT:    sub r0, r0, r4, lsl #1
-; ARM8-NEXT:    lsr lr, r3, #10
-; ARM8-NEXT:    movw r3, #2043
 ; ARM8-NEXT:    vmov.16 d16[0], r0
-; ARM8-NEXT:    sub r0, r12, r12, lsl #3
-; ARM8-NEXT:    mls r2, lr, r3, r2
-; ARM8-NEXT:    add r0, r1, r0
-; ARM8-NEXT:    vmov.16 d16[1], r0
+; ARM8-NEXT:    vldr d17, .LCPI4_0
+; ARM8-NEXT:    vmov.16 d16[1], r1
+; ARM8-NEXT:    vldr d19, .LCPI4_3
 ; ARM8-NEXT:    vmov.16 d16[2], r2
+; ARM8-NEXT:    vsub.i16 d16, d16, d17
+; ARM8-NEXT:    vldr d17, .LCPI4_1
+; ARM8-NEXT:    vmul.i16 d16, d16, d17
+; ARM8-NEXT:    vldr d17, .LCPI4_2
+; ARM8-NEXT:    vneg.s16 d17, d17
+; ARM8-NEXT:    vshl.i16 d18, d16, #1
+; ARM8-NEXT:    vbic.i16 d16, #0xf800
+; ARM8-NEXT:    vshl.u16 d16, d16, d17
+; ARM8-NEXT:    vshl.u16 d17, d18, d19
+; ARM8-NEXT:    vorr d16, d16, d17
+; ARM8-NEXT:    vldr d17, .LCPI4_4
 ; ARM8-NEXT:    vbic.i16 d16, #0xf800
-; ARM8-NEXT:    vceq.i16 d16, d16, d17
-; ARM8-NEXT:    vmvn d16, d16
+; ARM8-NEXT:    vcgt.u16 d16, d16, d17
 ; ARM8-NEXT:    vmov.u16 r0, d16[0]
 ; ARM8-NEXT:    vmov.u16 r1, d16[1]
 ; ARM8-NEXT:    vmov.u16 r2, d16[2]
-; ARM8-NEXT:    pop {r4, pc}
+; ARM8-NEXT:    bx lr
 ; ARM8-NEXT:    .p2align 3
 ; ARM8-NEXT:  @ %bb.1:
 ; ARM8-NEXT:  .LCPI4_0:
 ; ARM8-NEXT:    .short 0 @ 0x0
 ; ARM8-NEXT:    .short 1 @ 0x1
 ; ARM8-NEXT:    .short 2 @ 0x2
+; ARM8-NEXT:    .zero 2
+; ARM8-NEXT:  .LCPI4_1:
+; ARM8-NEXT:    .short 683 @ 0x2ab
+; ARM8-NEXT:    .short 1463 @ 0x5b7
+; ARM8-NEXT:    .short 819 @ 0x333
+; ARM8-NEXT:    .zero 2
+; ARM8-NEXT:  .LCPI4_2:
+; ARM8-NEXT:    .short 1 @ 0x1
+; ARM8-NEXT:    .short 0 @ 0x0
+; ARM8-NEXT:    .short 0 @ 0x0
+; ARM8-NEXT:    .short 0 @ 0x0
+; ARM8-NEXT:  .LCPI4_3:
+; ARM8-NEXT:    .short 9 @ 0x9
+; ARM8-NEXT:    .short 10 @ 0xa
+; ARM8-NEXT:    .short 10 @ 0xa
+; ARM8-NEXT:    .short 10 @ 0xa
+; ARM8-NEXT:  .LCPI4_4:
+; ARM8-NEXT:    .short 341 @ 0x155
+; ARM8-NEXT:    .short 292 @ 0x124
+; ARM8-NEXT:    .short 1 @ 0x1
 ; ARM8-NEXT:    .short 0 @ 0x0
 ;
 ; NEON7-LABEL: test_urem_vec:
 ; NEON7:       @ %bb.0:
-; NEON7-NEXT:    push {r4, lr}
-; NEON7-NEXT:    movw r3, #18725
-; NEON7-NEXT:    bfc r1, #11, #21
-; NEON7-NEXT:    movt r3, #9362
-; NEON7-NEXT:    bfc r2, #11, #21
-; NEON7-NEXT:    umull r3, r12, r1, r3
-; NEON7-NEXT:    bfc r0, #11, #21
-; NEON7-NEXT:    movw r3, #25663
-; NEON7-NEXT:    movt r3, #160
-; NEON7-NEXT:    umull r3, lr, r2, r3
-; NEON7-NEXT:    vldr d17, .LCPI4_0
-; NEON7-NEXT:    movw r3, #43691
-; NEON7-NEXT:    movt r3, #43690
-; NEON7-NEXT:    umull r3, r4, r0, r3
-; NEON7-NEXT:    sub r3, r1, r12
-; NEON7-NEXT:    add r3, r12, r3, lsr #1
-; NEON7-NEXT:    lsr r12, r3, #2
-; NEON7-NEXT:    sub r3, r2, lr
-; NEON7-NEXT:    lsr r4, r4, #2
-; NEON7-NEXT:    add r4, r4, r4, lsl #1
-; NEON7-NEXT:    add r3, lr, r3, lsr #1
-; NEON7-NEXT:    sub r0, r0, r4, lsl #1
-; NEON7-NEXT:    lsr lr, r3, #10
-; NEON7-NEXT:    movw r3, #2043
 ; NEON7-NEXT:    vmov.16 d16[0], r0
-; NEON7-NEXT:    sub r0, r12, r12, lsl #3
-; NEON7-NEXT:    mls r2, lr, r3, r2
-; NEON7-NEXT:    add r0, r1, r0
-; NEON7-NEXT:    vmov.16 d16[1], r0
+; NEON7-NEXT:    vldr d17, .LCPI4_0
+; NEON7-NEXT:    vmov.16 d16[1], r1
+; NEON7-NEXT:    vldr d19, .LCPI4_3
 ; NEON7-NEXT:    vmov.16 d16[2], r2
+; NEON7-NEXT:    vsub.i16 d16, d16, d17
+; NEON7-NEXT:    vldr d17, .LCPI4_1
+; NEON7-NEXT:    vmul.i16 d16, d16, d17
+; NEON7-NEXT:    vldr d17, .LCPI4_2
+; NEON7-NEXT:    vneg.s16 d17, d17
+; NEON7-NEXT:    vshl.i16 d18, d16, #1
+; NEON7-NEXT:    vbic.i16 d16, #0xf800
+; NEON7-NEXT:    vshl.u16 d16, d16, d17
+; NEON7-NEXT:    vshl.u16 d17, d18, d19
+; NEON7-NEXT:    vorr d16, d16, d17
+; NEON7-NEXT:    vldr d17, .LCPI4_4
 ; NEON7-NEXT:    vbic.i16 d16, #0xf800
-; NEON7-NEXT:    vceq.i16 d16, d16, d17
-; NEON7-NEXT:    vmvn d16, d16
+; NEON7-NEXT:    vcgt.u16 d16, d16, d17
 ; NEON7-NEXT:    vmov.u16 r0, d16[0]
 ; NEON7-NEXT:    vmov.u16 r1, d16[1]
 ; NEON7-NEXT:    vmov.u16 r2, d16[2]
-; NEON7-NEXT:    pop {r4, pc}
+; NEON7-NEXT:    bx lr
 ; NEON7-NEXT:    .p2align 3
 ; NEON7-NEXT:  @ %bb.1:
 ; NEON7-NEXT:  .LCPI4_0:
 ; NEON7-NEXT:    .short 0 @ 0x0
 ; NEON7-NEXT:    .short 1 @ 0x1
 ; NEON7-NEXT:    .short 2 @ 0x2
+; NEON7-NEXT:    .zero 2
+; NEON7-NEXT:  .LCPI4_1:
+; NEON7-NEXT:    .short 683 @ 0x2ab
+; NEON7-NEXT:    .short 1463 @ 0x5b7
+; NEON7-NEXT:    .short 819 @ 0x333
+; NEON7-NEXT:    .zero 2
+; NEON7-NEXT:  .LCPI4_2:
+; NEON7-NEXT:    .short 1 @ 0x1
+; NEON7-NEXT:    .short 0 @ 0x0
+; NEON7-NEXT:    .short 0 @ 0x0
+; NEON7-NEXT:    .short 0 @ 0x0
+; NEON7-NEXT:  .LCPI4_3:
+; NEON7-NEXT:    .short 9 @ 0x9
+; NEON7-NEXT:    .short 10 @ 0xa
+; NEON7-NEXT:    .short 10 @ 0xa
+; NEON7-NEXT:    .short 10 @ 0xa
+; NEON7-NEXT:  .LCPI4_4:
+; NEON7-NEXT:    .short 341 @ 0x155
+; NEON7-NEXT:    .short 292 @ 0x124
+; NEON7-NEXT:    .short 1 @ 0x1
 ; NEON7-NEXT:    .short 0 @ 0x0
 ;
 ; NEON8-LABEL: test_urem_vec:
 ; NEON8:       @ %bb.0:
-; NEON8-NEXT:    push {r4, lr}
-; NEON8-NEXT:    movw r3, #18725
-; NEON8-NEXT:    bfc r1, #11, #21
-; NEON8-NEXT:    movt r3, #9362
-; NEON8-NEXT:    bfc r2, #11, #21
-; NEON8-NEXT:    umull r3, r12, r1, r3
-; NEON8-NEXT:    bfc r0, #11, #21
-; NEON8-NEXT:    movw r3, #25663
-; NEON8-NEXT:    movt r3, #160
-; NEON8-NEXT:    umull r3, lr, r2, r3
-; NEON8-NEXT:    vldr d17, .LCPI4_0
-; NEON8-NEXT:    movw r3, #43691
-; NEON8-NEXT:    movt r3, #43690
-; NEON8-NEXT:    umull r3, r4, r0, r3
-; NEON8-NEXT:    sub r3, r1, r12
-; NEON8-NEXT:    add r3, r12, r3, lsr #1
-; NEON8-NEXT:    lsr r12, r3, #2
-; NEON8-NEXT:    sub r3, r2, lr
-; NEON8-NEXT:    lsr r4, r4, #2
-; NEON8-NEXT:    add r4, r4, r4, lsl #1
-; NEON8-NEXT:    add r3, lr, r3, lsr #1
-; NEON8-NEXT:    sub r0, r0, r4, lsl #1
-; NEON8-NEXT:    lsr lr, r3, #10
-; NEON8-NEXT:    movw r3, #2043
 ; NEON8-NEXT:    vmov.16 d16[0], r0
-; NEON8-NEXT:    sub r0, r12, r12, lsl #3
-; NEON8-NEXT:    mls r2, lr, r3, r2
-; NEON8-NEXT:    add r0, r1, r0
-; NEON8-NEXT:    vmov.16 d16[1], r0
+; NEON8-NEXT:    vldr d17, .LCPI4_0
+; NEON8-NEXT:    vmov.16 d16[1], r1
+; NEON8-NEXT:    vldr d19, .LCPI4_3
 ; NEON8-NEXT:    vmov.16 d16[2], r2
+; NEON8-NEXT:    vsub.i16 d16, d16, d17
+; NEON8-NEXT:    vldr d17, .LCPI4_1
+; NEON8-NEXT:    vmul.i16 d16, d16, d17
+; NEON8-NEXT:    vldr d17, .LCPI4_2
+; NEON8-NEXT:    vneg.s16 d17, d17
+; NEON8-NEXT:    vshl.i16 d18, d16, #1
 ; NEON8-NEXT:    vbic.i16 d16, #0xf800
-; NEON8-NEXT:    vceq.i16 d16, d16, d17
-; NEON8-NEXT:    vmvn d16, d16
+; NEON8-NEXT:    vshl.u16 d16, d16, d17
+; NEON8-NEXT:    vshl.u16 d17, d18, d19
+; NEON8-NEXT:    vorr d16, d16, d17
+; NEON8-NEXT:    vldr d17, .LCPI4_4
+; NEON8-NEXT:    vbic.i16 d16, #0xf800
+; NEON8-NEXT:    vcgt.u16 d16, d16, d17
 ; NEON8-NEXT:    vmov.u16 r0, d16[0]
 ; NEON8-NEXT:    vmov.u16 r1, d16[1]
 ; NEON8-NEXT:    vmov.u16 r2, d16[2]
-; NEON8-NEXT:    pop {r4, pc}
+; NEON8-NEXT:    bx lr
 ; NEON8-NEXT:    .p2align 3
 ; NEON8-NEXT:  @ %bb.1:
 ; NEON8-NEXT:  .LCPI4_0:
 ; NEON8-NEXT:    .short 0 @ 0x0
 ; NEON8-NEXT:    .short 1 @ 0x1
 ; NEON8-NEXT:    .short 2 @ 0x2
+; NEON8-NEXT:    .zero 2
+; NEON8-NEXT:  .LCPI4_1:
+; NEON8-NEXT:    .short 683 @ 0x2ab
+; NEON8-NEXT:    .short 1463 @ 0x5b7
+; NEON8-NEXT:    .short 819 @ 0x333
+; NEON8-NEXT:    .zero 2
+; NEON8-NEXT:  .LCPI4_2:
+; NEON8-NEXT:    .short 1 @ 0x1
+; NEON8-NEXT:    .short 0 @ 0x0
+; NEON8-NEXT:    .short 0 @ 0x0
+; NEON8-NEXT:    .short 0 @ 0x0
+; NEON8-NEXT:  .LCPI4_3:
+; NEON8-NEXT:    .short 9 @ 0x9
+; NEON8-NEXT:    .short 10 @ 0xa
+; NEON8-NEXT:    .short 10 @ 0xa
+; NEON8-NEXT:    .short 10 @ 0xa
+; NEON8-NEXT:  .LCPI4_4:
+; NEON8-NEXT:    .short 341 @ 0x155
+; NEON8-NEXT:    .short 292 @ 0x124
+; NEON8-NEXT:    .short 1 @ 0x1
 ; NEON8-NEXT:    .short 0 @ 0x0
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
   %cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
@@ -680,86 +624,150 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM5-LABEL: test_urem_larger:
 ; ARM5:       @ %bb.0:
-; ARM5-NEXT:    push {r11, lr}
-; ARM5-NEXT:    ldr r2, .LCPI5_0
-; ARM5-NEXT:    bic r1, r1, #-2147483648
-; ARM5-NEXT:    mov r3, #0
-; ARM5-NEXT:    bl __umoddi3
-; ARM5-NEXT:    orr r0, r0, r1
-; ARM5-NEXT:    clz r0, r0
-; ARM5-NEXT:    lsr r0, r0, #5
-; ARM5-NEXT:    pop {r11, pc}
+; ARM5-NEXT:    push {r4, lr}
+; ARM5-NEXT:    ldr r12, .LCPI5_0
+; ARM5-NEXT:    ldr r2, .LCPI5_1
+; ARM5-NEXT:    umull r3, lr, r0, r12
+; ARM5-NEXT:    mla r4, r0, r2, lr
+; ARM5-NEXT:    mla r0, r1, r12, r4
+; ARM5-NEXT:    bic r0, r0, #-2147483648
+; ARM5-NEXT:    lsrs r0, r0, #1
+; ARM5-NEXT:    rrx r1, r3
+; ARM5-NEXT:    orr r0, r0, r3, lsl #30
+; ARM5-NEXT:    ldr r3, .LCPI5_2
+; ARM5-NEXT:    bic r2, r0, #-2147483648
+; ARM5-NEXT:    mov r0, #0
+; ARM5-NEXT:    subs r1, r1, r3
+; ARM5-NEXT:    sbcs r1, r2, #1
+; ARM5-NEXT:    movlo r0, #1
+; ARM5-NEXT:    pop {r4, pc}
 ; ARM5-NEXT:    .p2align 2
 ; ARM5-NEXT:  @ %bb.1:
 ; ARM5-NEXT:  .LCPI5_0:
-; ARM5-NEXT:    .long 1234567890 @ 0x499602d2
+; ARM5-NEXT:    .long 3456474841 @ 0xce059ed9
+; ARM5-NEXT:  .LCPI5_1:
+; ARM5-NEXT:    .long 790204738 @ 0x2f199142
+; ARM5-NEXT:  .LCPI5_2:
+; ARM5-NEXT:    .long 3175964122 @ 0xbd4d5dda
 ;
 ; ARM6-LABEL: test_urem_larger:
 ; ARM6:       @ %bb.0:
 ; ARM6-NEXT:    push {r11, lr}
-; ARM6-NEXT:    ldr r2, .LCPI5_0
-; ARM6-NEXT:    bic r1, r1, #-2147483648
-; ARM6-NEXT:    mov r3, #0
-; ARM6-NEXT:    bl __umoddi3
-; ARM6-NEXT:    orr r0, r0, r1
-; ARM6-NEXT:    clz r0, r0
-; ARM6-NEXT:    lsr r0, r0, #5
+; ARM6-NEXT:    ldr r12, .LCPI5_0
+; ARM6-NEXT:    ldr r2, .LCPI5_1
+; ARM6-NEXT:    umull r3, lr, r0, r12
+; ARM6-NEXT:    mla r0, r0, r2, lr
+; ARM6-NEXT:    mla r0, r1, r12, r0
+; ARM6-NEXT:    bic r0, r0, #-2147483648
+; ARM6-NEXT:    lsrs r0, r0, #1
+; ARM6-NEXT:    rrx r1, r3
+; ARM6-NEXT:    orr r0, r0, r3, lsl #30
+; ARM6-NEXT:    ldr r3, .LCPI5_2
+; ARM6-NEXT:    bic r2, r0, #-2147483648
+; ARM6-NEXT:    mov r0, #0
+; ARM6-NEXT:    subs r1, r1, r3
+; ARM6-NEXT:    sbcs r1, r2, #1
+; ARM6-NEXT:    movlo r0, #1
 ; ARM6-NEXT:    pop {r11, pc}
 ; ARM6-NEXT:    .p2align 2
 ; ARM6-NEXT:  @ %bb.1:
 ; ARM6-NEXT:  .LCPI5_0:
-; ARM6-NEXT:    .long 1234567890 @ 0x499602d2
+; ARM6-NEXT:    .long 3456474841 @ 0xce059ed9
+; ARM6-NEXT:  .LCPI5_1:
+; ARM6-NEXT:    .long 790204738 @ 0x2f199142
+; ARM6-NEXT:  .LCPI5_2:
+; ARM6-NEXT:    .long 3175964122 @ 0xbd4d5dda
 ;
 ; ARM7-LABEL: test_urem_larger:
 ; ARM7:       @ %bb.0:
 ; ARM7-NEXT:    push {r11, lr}
-; ARM7-NEXT:    movw r2, #722
-; ARM7-NEXT:    bic r1, r1, #-2147483648
-; ARM7-NEXT:    movt r2, #18838
-; ARM7-NEXT:    mov r3, #0
-; ARM7-NEXT:    bl __umoddi3
-; ARM7-NEXT:    orr r0, r0, r1
-; ARM7-NEXT:    clz r0, r0
-; ARM7-NEXT:    lsr r0, r0, #5
+; ARM7-NEXT:    movw r12, #40665
+; ARM7-NEXT:    movw r2, #37186
+; ARM7-NEXT:    movt r12, #52741
+; ARM7-NEXT:    movt r2, #12057
+; ARM7-NEXT:    umull r3, lr, r0, r12
+; ARM7-NEXT:    mla r0, r0, r2, lr
+; ARM7-NEXT:    mla r0, r1, r12, r0
+; ARM7-NEXT:    bic r0, r0, #-2147483648
+; ARM7-NEXT:    lsrs r0, r0, #1
+; ARM7-NEXT:    rrx r1, r3
+; ARM7-NEXT:    orr r0, r0, r3, lsl #30
+; ARM7-NEXT:    movw r3, #24026
+; ARM7-NEXT:    bic r2, r0, #-2147483648
+; ARM7-NEXT:    movt r3, #48461
+; ARM7-NEXT:    subs r1, r1, r3
+; ARM7-NEXT:    mov r0, #0
+; ARM7-NEXT:    sbcs r1, r2, #1
+; ARM7-NEXT:    movwlo r0, #1
 ; ARM7-NEXT:    pop {r11, pc}
 ;
 ; ARM8-LABEL: test_urem_larger:
 ; ARM8:       @ %bb.0:
 ; ARM8-NEXT:    push {r11, lr}
-; ARM8-NEXT:    movw r2, #722
-; ARM8-NEXT:    bic r1, r1, #-2147483648
-; ARM8-NEXT:    movt r2, #18838
-; ARM8-NEXT:    mov r3, #0
-; ARM8-NEXT:    bl __umoddi3
-; ARM8-NEXT:    orr r0, r0, r1
-; ARM8-NEXT:    clz r0, r0
-; ARM8-NEXT:    lsr r0, r0, #5
+; ARM8-NEXT:    movw r12, #40665
+; ARM8-NEXT:    movw r2, #37186
+; ARM8-NEXT:    movt r12, #52741
+; ARM8-NEXT:    movt r2, #12057
+; ARM8-NEXT:    umull r3, lr, r0, r12
+; ARM8-NEXT:    mla r0, r0, r2, lr
+; ARM8-NEXT:    mla r0, r1, r12, r0
+; ARM8-NEXT:    bic r0, r0, #-2147483648
+; ARM8-NEXT:    lsrs r0, r0, #1
+; ARM8-NEXT:    rrx r1, r3
+; ARM8-NEXT:    orr r0, r0, r3, lsl #30
+; ARM8-NEXT:    movw r3, #24026
+; ARM8-NEXT:    bic r2, r0, #-2147483648
+; ARM8-NEXT:    movt r3, #48461
+; ARM8-NEXT:    subs r1, r1, r3
+; ARM8-NEXT:    mov r0, #0
+; ARM8-NEXT:    sbcs r1, r2, #1
+; ARM8-NEXT:    movwlo r0, #1
 ; ARM8-NEXT:    pop {r11, pc}
 ;
 ; NEON7-LABEL: test_urem_larger:
 ; NEON7:       @ %bb.0:
 ; NEON7-NEXT:    push {r11, lr}
-; NEON7-NEXT:    movw r2, #722
-; NEON7-NEXT:    bic r1, r1, #-2147483648
-; NEON7-NEXT:    movt r2, #18838
-; NEON7-NEXT:    mov r3, #0
-; NEON7-NEXT:    bl __umoddi3
-; NEON7-NEXT:    orr r0, r0, r1
-; NEON7-NEXT:    clz r0, r0
-; NEON7-NEXT:    lsr r0, r0, #5
+; NEON7-NEXT:    movw r12, #40665
+; NEON7-NEXT:    movw r2, #37186
+; NEON7-NEXT:    movt r12, #52741
+; NEON7-NEXT:    movt r2, #12057
+; NEON7-NEXT:    umull r3, lr, r0, r12
+; NEON7-NEXT:    mla r0, r0, r2, lr
+; NEON7-NEXT:    mla r0, r1, r12, r0
+; NEON7-NEXT:    bic r0, r0, #-2147483648
+; NEON7-NEXT:    lsrs r0, r0, #1
+; NEON7-NEXT:    rrx r1, r3
+; NEON7-NEXT:    orr r0, r0, r3, lsl #30
+; NEON7-NEXT:    movw r3, #24026
+; NEON7-NEXT:    bic r2, r0, #-2147483648
+; NEON7-NEXT:    movt r3, #48461
+; NEON7-NEXT:    subs r1, r1, r3
+; NEON7-NEXT:    mov r0, #0
+; NEON7-NEXT:    sbcs r1, r2, #1
+; NEON7-NEXT:    movwlo r0, #1
 ; NEON7-NEXT:    pop {r11, pc}
 ;
 ; NEON8-LABEL: test_urem_larger:
 ; NEON8:       @ %bb.0:
 ; NEON8-NEXT:    push {r11, lr}
-; NEON8-NEXT:    movw r2, #722
-; NEON8-NEXT:    bic r1, r1, #-2147483648
-; NEON8-NEXT:    movt r2, #18838
-; NEON8-NEXT:    mov r3, #0
-; NEON8-NEXT:    bl __umoddi3
-; NEON8-NEXT:    orr r0, r0, r1
-; NEON8-NEXT:    clz r0, r0
-; NEON8-NEXT:    lsr r0, r0, #5
+; NEON8-NEXT:    movw r12, #40665
+; NEON8-NEXT:    movw r2, #37186
+; NEON8-NEXT:    movt r12, #52741
+; NEON8-NEXT:    movt r2, #12057
+; NEON8-NEXT:    umull r3, lr, r0, r12
+; NEON8-NEXT:    mla r0, r0, r2, lr
+; NEON8-NEXT:    mla r0, r1, r12, r0
+; NEON8-NEXT:    bic r0, r0, #-2147483648
+; NEON8-NEXT:    lsrs r0, r0, #1
+; NEON8-NEXT:    rrx r1, r3
+; NEON8-NEXT:    orr r0, r0, r3, lsl #30
+; NEON8-NEXT:    movw r3, #24026
+; NEON8-NEXT:    bic r2, r0, #-2147483648
+; NEON8-NEXT:    movt r3, #48461
+; NEON8-NEXT:    subs r1, r1, r3
+; NEON8-NEXT:    mov r0, #0
+; NEON8-NEXT:    sbcs r1, r2, #1
+; NEON8-NEXT:    movwlo r0, #1
 ; NEON8-NEXT:    pop {r11, pc}
   %urem = urem i63 %X, 1234567890
   %cmp = icmp eq i63 %urem, 0

diff  --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
index 7c211b7f934c..adebd8f8a1d8 100644
--- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
@@ -5,32 +5,34 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; MIPSEL-LABEL: test_srem_odd:
 ; MIPSEL:       # %bb.0:
-; MIPSEL-NEXT:    lui $1, 48986
+; MIPSEL-NEXT:    lui $1, 8026
 ; MIPSEL-NEXT:    ori $1, $1, 33099
-; MIPSEL-NEXT:    sll $2, $4, 3
-; MIPSEL-NEXT:    sra $2, $2, 3
-; MIPSEL-NEXT:    mul $1, $2, $1
-; MIPSEL-NEXT:    lui $2, 330
-; MIPSEL-NEXT:    ori $2, $2, 64874
+; MIPSEL-NEXT:    mul $1, $4, $1
+; MIPSEL-NEXT:    lui $2, 41
+; MIPSEL-NEXT:    ori $2, $2, 24493
 ; MIPSEL-NEXT:    addu $1, $1, $2
-; MIPSEL-NEXT:    lui $2, 661
-; MIPSEL-NEXT:    ori $2, $2, 64213
+; MIPSEL-NEXT:    lui $2, 8191
+; MIPSEL-NEXT:    ori $2, $2, 65535
+; MIPSEL-NEXT:    and $1, $1, $2
+; MIPSEL-NEXT:    lui $2, 82
+; MIPSEL-NEXT:    ori $2, $2, 48987
 ; MIPSEL-NEXT:    jr $ra
 ; MIPSEL-NEXT:    sltu $2, $1, $2
 ;
 ; MIPS64EL-LABEL: test_srem_odd:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    lui $1, 48986
+; MIPS64EL-NEXT:    lui $1, 8026
 ; MIPS64EL-NEXT:    ori $1, $1, 33099
 ; MIPS64EL-NEXT:    sll $2, $4, 0
-; MIPS64EL-NEXT:    sll $2, $2, 3
-; MIPS64EL-NEXT:    sra $2, $2, 3
 ; MIPS64EL-NEXT:    mul $1, $2, $1
-; MIPS64EL-NEXT:    lui $2, 330
-; MIPS64EL-NEXT:    ori $2, $2, 64874
+; MIPS64EL-NEXT:    lui $2, 41
+; MIPS64EL-NEXT:    ori $2, $2, 24493
 ; MIPS64EL-NEXT:    addu $1, $1, $2
-; MIPS64EL-NEXT:    lui $2, 661
-; MIPS64EL-NEXT:    ori $2, $2, 64213
+; MIPS64EL-NEXT:    lui $2, 8191
+; MIPS64EL-NEXT:    ori $2, $2, 65535
+; MIPS64EL-NEXT:    and $1, $1, $2
+; MIPS64EL-NEXT:    lui $2, 82
+; MIPS64EL-NEXT:    ori $2, $2, 48987
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    sltu $2, $1, $2
   %srem = srem i29 %X, 99

diff  --git a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
index d1f23523f339..e23351ee581a 100644
--- a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
@@ -5,26 +5,30 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_odd:
 ; MIPSEL:       # %bb.0:
-; MIPSEL-NEXT:    lui $1, 52428
-; MIPSEL-NEXT:    ori $1, $1, 52429
-; MIPSEL-NEXT:    andi $2, $4, 8191
-; MIPSEL-NEXT:    mul $1, $2, $1
-; MIPSEL-NEXT:    lui $2, 13107
-; MIPSEL-NEXT:    ori $2, $2, 13108
+; MIPSEL-NEXT:    addiu $1, $zero, 3277
+; MIPSEL-NEXT:    mul $1, $4, $1
+; MIPSEL-NEXT:    andi $1, $1, 8191
 ; MIPSEL-NEXT:    jr $ra
-; MIPSEL-NEXT:    sltu $2, $1, $2
+; MIPSEL-NEXT:    sltiu $2, $1, 1639
 ;
 ; MIPS64EL-LABEL: test_urem_odd:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    lui $1, 52428
-; MIPS64EL-NEXT:    ori $1, $1, 52429
-; MIPS64EL-NEXT:    sll $2, $4, 0
-; MIPS64EL-NEXT:    andi $2, $2, 8191
-; MIPS64EL-NEXT:    mul $1, $2, $1
-; MIPS64EL-NEXT:    lui $2, 13107
-; MIPS64EL-NEXT:    ori $2, $2, 13108
+; MIPS64EL-NEXT:    sll $1, $4, 0
+; MIPS64EL-NEXT:    sll $2, $1, 1
+; MIPS64EL-NEXT:    addu $2, $2, $1
+; MIPS64EL-NEXT:    sll $3, $1, 4
+; MIPS64EL-NEXT:    subu $2, $3, $2
+; MIPS64EL-NEXT:    sll $3, $1, 6
+; MIPS64EL-NEXT:    subu $2, $2, $3
+; MIPS64EL-NEXT:    sll $3, $1, 8
+; MIPS64EL-NEXT:    addu $2, $3, $2
+; MIPS64EL-NEXT:    sll $3, $1, 10
+; MIPS64EL-NEXT:    subu $2, $2, $3
+; MIPS64EL-NEXT:    sll $1, $1, 12
+; MIPS64EL-NEXT:    addu $1, $1, $2
+; MIPS64EL-NEXT:    andi $1, $1, 8191
 ; MIPS64EL-NEXT:    jr $ra
-; MIPS64EL-NEXT:    sltu $2, $1, $2
+; MIPS64EL-NEXT:    sltiu $2, $1, 1639
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
   ret i1 %cmp
@@ -33,40 +37,40 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_even:
 ; MIPSEL:       # %bb.0:
-; MIPSEL-NEXT:    lui $1, 2047
-; MIPSEL-NEXT:    ori $1, $1, 65535
-; MIPSEL-NEXT:    and $1, $4, $1
-; MIPSEL-NEXT:    srl $2, $1, 1
-; MIPSEL-NEXT:    lui $3, 37449
-; MIPSEL-NEXT:    ori $3, $3, 9363
-; MIPSEL-NEXT:    multu $2, $3
-; MIPSEL-NEXT:    mfhi $2
-; MIPSEL-NEXT:    srl $2, $2, 2
-; MIPSEL-NEXT:    sll $3, $2, 4
-; MIPSEL-NEXT:    sll $2, $2, 1
-; MIPSEL-NEXT:    subu $2, $2, $3
-; MIPSEL-NEXT:    addu $1, $1, $2
+; MIPSEL-NEXT:    lui $1, 1755
+; MIPSEL-NEXT:    ori $1, $1, 28087
+; MIPSEL-NEXT:    mul $1, $4, $1
+; MIPSEL-NEXT:    sll $2, $1, 26
+; MIPSEL-NEXT:    lui $3, 2047
+; MIPSEL-NEXT:    ori $4, $3, 65534
+; MIPSEL-NEXT:    and $1, $1, $4
+; MIPSEL-NEXT:    srl $1, $1, 1
+; MIPSEL-NEXT:    or $1, $1, $2
+; MIPSEL-NEXT:    ori $2, $3, 65535
+; MIPSEL-NEXT:    and $1, $1, $2
+; MIPSEL-NEXT:    lui $2, 146
+; MIPSEL-NEXT:    ori $2, $2, 18725
 ; MIPSEL-NEXT:    jr $ra
-; MIPSEL-NEXT:    sltiu $2, $1, 1
+; MIPSEL-NEXT:    sltu $2, $1, $2
 ;
 ; MIPS64EL-LABEL: test_urem_even:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    lui $1, 2047
-; MIPS64EL-NEXT:    ori $1, $1, 65535
+; MIPS64EL-NEXT:    lui $1, 1755
+; MIPS64EL-NEXT:    ori $1, $1, 28087
 ; MIPS64EL-NEXT:    sll $2, $4, 0
-; MIPS64EL-NEXT:    and $1, $2, $1
-; MIPS64EL-NEXT:    srl $2, $1, 1
-; MIPS64EL-NEXT:    lui $3, 37449
-; MIPS64EL-NEXT:    ori $3, $3, 9363
-; MIPS64EL-NEXT:    multu $2, $3
-; MIPS64EL-NEXT:    mfhi $2
-; MIPS64EL-NEXT:    srl $2, $2, 2
-; MIPS64EL-NEXT:    sll $3, $2, 4
-; MIPS64EL-NEXT:    sll $2, $2, 1
-; MIPS64EL-NEXT:    subu $2, $2, $3
-; MIPS64EL-NEXT:    addu $1, $1, $2
+; MIPS64EL-NEXT:    mul $1, $2, $1
+; MIPS64EL-NEXT:    sll $2, $1, 26
+; MIPS64EL-NEXT:    lui $3, 2047
+; MIPS64EL-NEXT:    ori $4, $3, 65534
+; MIPS64EL-NEXT:    and $1, $1, $4
+; MIPS64EL-NEXT:    srl $1, $1, 1
+; MIPS64EL-NEXT:    or $1, $1, $2
+; MIPS64EL-NEXT:    ori $2, $3, 65535
+; MIPS64EL-NEXT:    lui $3, 146
+; MIPS64EL-NEXT:    and $1, $1, $2
+; MIPS64EL-NEXT:    ori $2, $3, 18725
 ; MIPS64EL-NEXT:    jr $ra
-; MIPS64EL-NEXT:    sltiu $2, $1, 1
+; MIPS64EL-NEXT:    sltu $2, $1, $2
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
   ret i1 %cmp
@@ -75,24 +79,22 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_odd_setne:
 ; MIPSEL:       # %bb.0:
-; MIPSEL-NEXT:    lui $1, 52428
-; MIPSEL-NEXT:    ori $1, $1, 52429
-; MIPSEL-NEXT:    andi $2, $4, 15
-; MIPSEL-NEXT:    mul $1, $2, $1
-; MIPSEL-NEXT:    lui $2, 13107
-; MIPSEL-NEXT:    ori $2, $2, 13107
+; MIPSEL-NEXT:    sll $1, $4, 1
+; MIPSEL-NEXT:    addu $1, $1, $4
+; MIPSEL-NEXT:    negu $1, $1
+; MIPSEL-NEXT:    andi $1, $1, 15
+; MIPSEL-NEXT:    addiu $2, $zero, 3
 ; MIPSEL-NEXT:    jr $ra
 ; MIPSEL-NEXT:    sltu $2, $2, $1
 ;
 ; MIPS64EL-LABEL: test_urem_odd_setne:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    lui $1, 52428
-; MIPS64EL-NEXT:    ori $1, $1, 52429
-; MIPS64EL-NEXT:    sll $2, $4, 0
-; MIPS64EL-NEXT:    andi $2, $2, 15
-; MIPS64EL-NEXT:    mul $1, $2, $1
-; MIPS64EL-NEXT:    lui $2, 13107
-; MIPS64EL-NEXT:    ori $2, $2, 13107
+; MIPS64EL-NEXT:    sll $1, $4, 0
+; MIPS64EL-NEXT:    sll $2, $1, 1
+; MIPS64EL-NEXT:    addu $1, $2, $1
+; MIPS64EL-NEXT:    negu $1, $1
+; MIPS64EL-NEXT:    andi $1, $1, 15
+; MIPS64EL-NEXT:    addiu $2, $zero, 3
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    sltu $2, $2, $1
   %urem = urem i4 %X, 5
@@ -103,26 +105,34 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_negative_odd:
 ; MIPSEL:       # %bb.0:
-; MIPSEL-NEXT:    lui $1, 43302
-; MIPSEL-NEXT:    ori $1, $1, 57651
-; MIPSEL-NEXT:    andi $2, $4, 511
-; MIPSEL-NEXT:    mul $1, $2, $1
-; MIPSEL-NEXT:    lui $2, 129
-; MIPSEL-NEXT:    ori $2, $2, 17191
+; MIPSEL-NEXT:    sll $1, $4, 1
+; MIPSEL-NEXT:    addu $1, $1, $4
+; MIPSEL-NEXT:    sll $2, $4, 4
+; MIPSEL-NEXT:    subu $1, $1, $2
+; MIPSEL-NEXT:    sll $2, $4, 6
+; MIPSEL-NEXT:    addu $1, $2, $1
+; MIPSEL-NEXT:    sll $2, $4, 8
+; MIPSEL-NEXT:    addu $1, $2, $1
+; MIPSEL-NEXT:    andi $1, $1, 511
+; MIPSEL-NEXT:    addiu $2, $zero, 1
 ; MIPSEL-NEXT:    jr $ra
 ; MIPSEL-NEXT:    sltu $2, $2, $1
 ;
 ; MIPS64EL-LABEL: test_urem_negative_odd:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    lui $1, 43302
-; MIPS64EL-NEXT:    ori $1, $1, 57651
-; MIPS64EL-NEXT:    sll $2, $4, 0
-; MIPS64EL-NEXT:    andi $2, $2, 511
-; MIPS64EL-NEXT:    mul $1, $2, $1
-; MIPS64EL-NEXT:    lui $2, 129
-; MIPS64EL-NEXT:    ori $2, $2, 17191
+; MIPS64EL-NEXT:    sll $1, $4, 0
+; MIPS64EL-NEXT:    sll $2, $1, 1
+; MIPS64EL-NEXT:    addu $2, $2, $1
+; MIPS64EL-NEXT:    sll $3, $1, 4
+; MIPS64EL-NEXT:    subu $2, $2, $3
+; MIPS64EL-NEXT:    sll $3, $1, 6
+; MIPS64EL-NEXT:    addu $2, $3, $2
+; MIPS64EL-NEXT:    sll $1, $1, 8
+; MIPS64EL-NEXT:    addiu $3, $zero, 1
+; MIPS64EL-NEXT:    addu $1, $1, $2
+; MIPS64EL-NEXT:    andi $1, $1, 511
 ; MIPS64EL-NEXT:    jr $ra
-; MIPS64EL-NEXT:    sltu $2, $2, $1
+; MIPS64EL-NEXT:    sltu $2, $3, $1
   %urem = urem i9 %X, -5
   %cmp = icmp ne i9 %urem, 0
   ret i1 %cmp
@@ -142,37 +152,71 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
 ; MIPSEL-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPSEL-NEXT:    move $7, $6
 ; MIPSEL-NEXT:    move $6, $5
-; MIPSEL-NEXT:    lui $1, 18838
-; MIPSEL-NEXT:    ori $1, $1, 722
-; MIPSEL-NEXT:    sw $1, 28($sp)
-; MIPSEL-NEXT:    sw $zero, 24($sp)
-; MIPSEL-NEXT:    sw $zero, 20($sp)
+; MIPSEL-NEXT:    move $5, $4
+; MIPSEL-NEXT:    lui $1, 12057
+; MIPSEL-NEXT:    ori $1, $1, 37186
+; MIPSEL-NEXT:    lui $2, 52741
+; MIPSEL-NEXT:    ori $2, $2, 40665
+; MIPSEL-NEXT:    sw $2, 28($sp)
+; MIPSEL-NEXT:    sw $1, 24($sp)
+; MIPSEL-NEXT:    addiu $1, $zero, 2
+; MIPSEL-NEXT:    sw $1, 20($sp)
 ; MIPSEL-NEXT:    sw $zero, 16($sp)
-; MIPSEL-NEXT:    andi $5, $4, 3
-; MIPSEL-NEXT:    jal __umodti3
+; MIPSEL-NEXT:    jal __multi3
 ; MIPSEL-NEXT:    addiu $4, $zero, 0
-; MIPSEL-NEXT:    or $1, $4, $2
-; MIPSEL-NEXT:    or $2, $5, $3
+; MIPSEL-NEXT:    sll $1, $4, 31
+; MIPSEL-NEXT:    srl $2, $5, 1
 ; MIPSEL-NEXT:    or $1, $2, $1
-; MIPSEL-NEXT:    sltiu $2, $1, 1
+; MIPSEL-NEXT:    lui $2, 60010
+; MIPSEL-NEXT:    ori $2, $2, 61135
+; MIPSEL-NEXT:    sltu $1, $1, $2
+; MIPSEL-NEXT:    srl $2, $4, 1
+; MIPSEL-NEXT:    andi $3, $3, 3
+; MIPSEL-NEXT:    sll $4, $3, 31
+; MIPSEL-NEXT:    or $4, $2, $4
+; MIPSEL-NEXT:    sltiu $2, $4, 13
+; MIPSEL-NEXT:    xori $4, $4, 13
+; MIPSEL-NEXT:    movz $2, $1, $4
+; MIPSEL-NEXT:    sll $1, $5, 1
+; MIPSEL-NEXT:    srl $3, $3, 1
+; MIPSEL-NEXT:    or $1, $3, $1
+; MIPSEL-NEXT:    andi $1, $1, 3
+; MIPSEL-NEXT:    movn $2, $zero, $1
 ; MIPSEL-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPSEL-NEXT:    jr $ra
 ; MIPSEL-NEXT:    addiu $sp, $sp, 40
 ;
 ; MIPS64EL-LABEL: test_urem_oversized:
 ; MIPS64EL:       # %bb.0:
-; MIPS64EL-NEXT:    daddiu $sp, $sp, -16
-; MIPS64EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64EL-NEXT:    andi $5, $5, 3
-; MIPS64EL-NEXT:    lui $1, 18838
-; MIPS64EL-NEXT:    ori $6, $1, 722
-; MIPS64EL-NEXT:    jal __umodti3
-; MIPS64EL-NEXT:    daddiu $7, $zero, 0
-; MIPS64EL-NEXT:    or $1, $2, $3
-; MIPS64EL-NEXT:    sltiu $2, $1, 1
-; MIPS64EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64EL-NEXT:    lui $1, 6029
+; MIPS64EL-NEXT:    daddiu $1, $1, -14175
+; MIPS64EL-NEXT:    dsll $1, $1, 16
+; MIPS64EL-NEXT:    daddiu $1, $1, 26371
+; MIPS64EL-NEXT:    dsll $1, $1, 17
+; MIPS64EL-NEXT:    daddiu $1, $1, -24871
+; MIPS64EL-NEXT:    dmult $5, $1
+; MIPS64EL-NEXT:    mflo $2
+; MIPS64EL-NEXT:    dmultu $4, $1
+; MIPS64EL-NEXT:    mflo $1
+; MIPS64EL-NEXT:    mfhi $3
+; MIPS64EL-NEXT:    lui $5, 14
+; MIPS64EL-NEXT:    daddiu $5, $5, -5525
+; MIPS64EL-NEXT:    dsll $5, $5, 16
+; MIPS64EL-NEXT:    daddiu $5, $5, -4401
+; MIPS64EL-NEXT:    dsll $4, $4, 1
+; MIPS64EL-NEXT:    daddu $3, $3, $4
+; MIPS64EL-NEXT:    daddu $2, $3, $2
+; MIPS64EL-NEXT:    andi $3, $2, 3
+; MIPS64EL-NEXT:    dsll $2, $3, 63
+; MIPS64EL-NEXT:    dsrl $4, $1, 1
+; MIPS64EL-NEXT:    or $2, $4, $2
+; MIPS64EL-NEXT:    sltu $2, $2, $5
+; MIPS64EL-NEXT:    dsrl $3, $3, 1
+; MIPS64EL-NEXT:    dsll $1, $1, 1
+; MIPS64EL-NEXT:    or $1, $3, $1
+; MIPS64EL-NEXT:    andi $1, $1, 3
 ; MIPS64EL-NEXT:    jr $ra
-; MIPS64EL-NEXT:    daddiu $sp, $sp, 16
+; MIPS64EL-NEXT:    movn $2, $zero, $1
   %urem = urem i66 %X, 1234567890
   %cmp = icmp eq i66 %urem, 0
   ret i1 %cmp

diff  --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
index b2de90ed3344..fc655df6030b 100644
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -5,36 +5,37 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; PPC-LABEL: test_srem_odd:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 4, -23170
-; PPC-NEXT:    slwi 3, 3, 3
-; PPC-NEXT:    ori 4, 4, 46339
-; PPC-NEXT:    srawi 3, 3, 3
-; PPC-NEXT:    mulhw 4, 3, 4
-; PPC-NEXT:    add 4, 4, 3
-; PPC-NEXT:    srwi 5, 4, 31
-; PPC-NEXT:    srawi 4, 4, 6
-; PPC-NEXT:    add 4, 4, 5
-; PPC-NEXT:    mulli 4, 4, 99
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    lis 4, 8026
+; PPC-NEXT:    ori 4, 4, 33099
+; PPC-NEXT:    mullw 3, 3, 4
+; PPC-NEXT:    addi 3, 3, 24493
+; PPC-NEXT:    lis 4, 82
+; PPC-NEXT:    addis 3, 3, 41
+; PPC-NEXT:    ori 4, 4, 48987
+; PPC-NEXT:    clrlwi 3, 3, 3
+; PPC-NEXT:    cmplw 3, 4
+; PPC-NEXT:    li 3, 0
+; PPC-NEXT:    li 4, 1
+; PPC-NEXT:    bc 12, 0, .LBB0_1
+; PPC-NEXT:    blr
+; PPC-NEXT:  .LBB0_1:
+; PPC-NEXT:    addi 3, 4, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_srem_odd:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 4, -23170
-; PPC64LE-NEXT:    slwi 3, 3, 3
-; PPC64LE-NEXT:    srawi 3, 3, 3
-; PPC64LE-NEXT:    ori 4, 4, 46339
-; PPC64LE-NEXT:    mulhw 4, 3, 4
-; PPC64LE-NEXT:    add 4, 4, 3
-; PPC64LE-NEXT:    srwi 5, 4, 31
-; PPC64LE-NEXT:    srawi 4, 4, 6
-; PPC64LE-NEXT:    add 4, 4, 5
-; PPC64LE-NEXT:    mulli 4, 4, 99
-; PPC64LE-NEXT:    sub 3, 3, 4
-; PPC64LE-NEXT:    cntlzw 3, 3
-; PPC64LE-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT:    lis 4, 8026
+; PPC64LE-NEXT:    ori 4, 4, 33099
+; PPC64LE-NEXT:    mullw 3, 3, 4
+; PPC64LE-NEXT:    lis 4, 82
+; PPC64LE-NEXT:    ori 4, 4, 48987
+; PPC64LE-NEXT:    addi 3, 3, 24493
+; PPC64LE-NEXT:    addis 3, 3, 41
+; PPC64LE-NEXT:    clrlwi 3, 3, 3
+; PPC64LE-NEXT:    cmplw 3, 4
+; PPC64LE-NEXT:    li 3, 0
+; PPC64LE-NEXT:    li 4, 1
+; PPC64LE-NEXT:    isellt 3, 4, 3
 ; PPC64LE-NEXT:    blr
   %srem = srem i29 %X, 99
   %cmp = icmp eq i29 %srem, 0

diff  --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
index 40d402d424e6..ef73fa686b3c 100644
--- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
@@ -5,29 +5,24 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; PPC-LABEL: test_urem_odd:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 4, -13108
+; PPC-NEXT:    mulli 3, 3, 3277
 ; PPC-NEXT:    clrlwi 3, 3, 19
-; PPC-NEXT:    ori 4, 4, 52429
-; PPC-NEXT:    mulhwu 4, 3, 4
-; PPC-NEXT:    srwi 4, 4, 2
-; PPC-NEXT:    mulli 4, 4, 5
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    li 4, 0
+; PPC-NEXT:    cmplwi 3, 1639
+; PPC-NEXT:    li 3, 1
+; PPC-NEXT:    bclr 12, 0, 0
+; PPC-NEXT:  # %bb.1:
+; PPC-NEXT:    ori 3, 4, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_urem_odd:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 4, -13108
+; PPC64LE-NEXT:    mulli 3, 3, 3277
+; PPC64LE-NEXT:    li 4, 0
 ; PPC64LE-NEXT:    clrlwi 3, 3, 19
-; PPC64LE-NEXT:    ori 4, 4, 52429
-; PPC64LE-NEXT:    mulhwu 4, 3, 4
-; PPC64LE-NEXT:    rlwinm 5, 4, 0, 0, 29
-; PPC64LE-NEXT:    srwi 4, 4, 2
-; PPC64LE-NEXT:    add 4, 4, 5
-; PPC64LE-NEXT:    sub 3, 3, 4
-; PPC64LE-NEXT:    cntlzw 3, 3
-; PPC64LE-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT:    cmplwi 3, 1639
+; PPC64LE-NEXT:    li 3, 1
+; PPC64LE-NEXT:    isellt 3, 3, 4
 ; PPC64LE-NEXT:    blr
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
@@ -37,30 +32,35 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; PPC-LABEL: test_urem_even:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 4, -28087
-; PPC-NEXT:    rlwinm 5, 3, 31, 6, 31
-; PPC-NEXT:    ori 4, 4, 9363
-; PPC-NEXT:    mulhwu 4, 5, 4
-; PPC-NEXT:    srwi 4, 4, 2
-; PPC-NEXT:    clrlwi 3, 3, 5
-; PPC-NEXT:    mulli 4, 4, 14
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    lis 4, 1755
+; PPC-NEXT:    ori 4, 4, 28087
+; PPC-NEXT:    mullw 3, 3, 4
+; PPC-NEXT:    rlwinm 4, 3, 31, 6, 31
+; PPC-NEXT:    rlwimi 4, 3, 26, 5, 5
+; PPC-NEXT:    lis 3, 146
+; PPC-NEXT:    ori 3, 3, 18725
+; PPC-NEXT:    cmplw 4, 3
+; PPC-NEXT:    li 3, 0
+; PPC-NEXT:    li 4, 1
+; PPC-NEXT:    bc 12, 0, .LBB1_1
+; PPC-NEXT:    blr
+; PPC-NEXT:  .LBB1_1:
+; PPC-NEXT:    addi 3, 4, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_urem_even:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 4, -28087
+; PPC64LE-NEXT:    lis 4, 1755
+; PPC64LE-NEXT:    ori 4, 4, 28087
+; PPC64LE-NEXT:    mullw 3, 3, 4
+; PPC64LE-NEXT:    lis 4, 146
 ; PPC64LE-NEXT:    rlwinm 5, 3, 31, 6, 31
-; PPC64LE-NEXT:    clrlwi 3, 3, 5
-; PPC64LE-NEXT:    ori 4, 4, 9363
-; PPC64LE-NEXT:    mulhwu 4, 5, 4
-; PPC64LE-NEXT:    srwi 4, 4, 2
-; PPC64LE-NEXT:    mulli 4, 4, 14
-; PPC64LE-NEXT:    sub 3, 3, 4
-; PPC64LE-NEXT:    cntlzw 3, 3
-; PPC64LE-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT:    rlwimi 5, 3, 26, 5, 5
+; PPC64LE-NEXT:    ori 3, 4, 18725
+; PPC64LE-NEXT:    li 4, 1
+; PPC64LE-NEXT:    cmplw 5, 3
+; PPC64LE-NEXT:    li 3, 0
+; PPC64LE-NEXT:    isellt 3, 4, 3
 ; PPC64LE-NEXT:    blr
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
@@ -70,30 +70,26 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; PPC-LABEL: test_urem_odd_setne:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 4, -13108
+; PPC-NEXT:    mulli 3, 3, 13
 ; PPC-NEXT:    clrlwi 3, 3, 28
-; PPC-NEXT:    ori 4, 4, 52429
-; PPC-NEXT:    mulhwu 4, 3, 4
-; PPC-NEXT:    srwi 4, 4, 2
-; PPC-NEXT:    mulli 4, 4, 5
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    not 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    li 4, 0
+; PPC-NEXT:    cmplwi 3, 3
+; PPC-NEXT:    li 3, 1
+; PPC-NEXT:    bclr 12, 1, 0
+; PPC-NEXT:  # %bb.1:
+; PPC-NEXT:    ori 3, 4, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_urem_odd_setne:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 4, -13108
+; PPC64LE-NEXT:    slwi 5, 3, 1
+; PPC64LE-NEXT:    li 4, 0
+; PPC64LE-NEXT:    add 3, 3, 5
+; PPC64LE-NEXT:    neg 3, 3
 ; PPC64LE-NEXT:    clrlwi 3, 3, 28
-; PPC64LE-NEXT:    ori 4, 4, 52429
-; PPC64LE-NEXT:    mulhwu 4, 3, 4
-; PPC64LE-NEXT:    srwi 4, 4, 2
-; PPC64LE-NEXT:    rlwimi 4, 4, 2, 28, 29
-; PPC64LE-NEXT:    sub 3, 3, 4
-; PPC64LE-NEXT:    cntlzw 3, 3
-; PPC64LE-NEXT:    not 3, 3
-; PPC64LE-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT:    cmplwi 3, 3
+; PPC64LE-NEXT:    li 3, 1
+; PPC64LE-NEXT:    iselgt 3, 3, 4
 ; PPC64LE-NEXT:    blr
   %urem = urem i4 %X, 5
   %cmp = icmp ne i4 %urem, 0
@@ -103,30 +99,24 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; PPC-LABEL: test_urem_negative_odd:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 4, 8272
+; PPC-NEXT:    mulli 3, 3, 307
 ; PPC-NEXT:    clrlwi 3, 3, 23
-; PPC-NEXT:    ori 4, 4, 51705
-; PPC-NEXT:    mulhwu 4, 3, 4
-; PPC-NEXT:    srwi 4, 4, 6
-; PPC-NEXT:    mulli 4, 4, 507
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    not 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    li 4, 0
+; PPC-NEXT:    cmplwi 3, 1
+; PPC-NEXT:    li 3, 1
+; PPC-NEXT:    bclr 12, 1, 0
+; PPC-NEXT:  # %bb.1:
+; PPC-NEXT:    ori 3, 4, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_urem_negative_odd:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 4, 8272
+; PPC64LE-NEXT:    mulli 3, 3, 307
+; PPC64LE-NEXT:    li 4, 0
 ; PPC64LE-NEXT:    clrlwi 3, 3, 23
-; PPC64LE-NEXT:    ori 4, 4, 51705
-; PPC64LE-NEXT:    mulhwu 4, 3, 4
-; PPC64LE-NEXT:    srwi 4, 4, 6
-; PPC64LE-NEXT:    mulli 4, 4, 507
-; PPC64LE-NEXT:    sub 3, 3, 4
-; PPC64LE-NEXT:    cntlzw 3, 3
-; PPC64LE-NEXT:    not 3, 3
-; PPC64LE-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT:    cmplwi 3, 1
+; PPC64LE-NEXT:    li 3, 1
+; PPC64LE-NEXT:    iselgt 3, 3, 4
 ; PPC64LE-NEXT:    blr
   %urem = urem i9 %X, -5
   %cmp = icmp ne i9 %urem, 0
@@ -136,103 +126,79 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; PPC-LABEL: test_urem_vec:
 ; PPC:       # %bb.0:
-; PPC-NEXT:    lis 6, -31983
-; PPC-NEXT:    clrlwi 5, 5, 21
-; PPC-NEXT:    ori 6, 6, 60211
-; PPC-NEXT:    mullw 5, 5, 6
-; PPC-NEXT:    lis 6, 32
-; PPC-NEXT:    addi 5, 5, 10650
-; PPC-NEXT:    ori 6, 6, 5132
-; PPC-NEXT:    addis 5, 5, -1572
-; PPC-NEXT:    cmplw 5, 6
-; PPC-NEXT:    lis 6, -18725
+; PPC-NEXT:    mulli 3, 3, 683
+; PPC-NEXT:    rlwinm 7, 3, 31, 22, 31
+; PPC-NEXT:    rlwimi 7, 3, 10, 21, 21
+; PPC-NEXT:    mulli 5, 5, 819
+; PPC-NEXT:    li 6, 0
+; PPC-NEXT:    cmplwi 7, 341
+; PPC-NEXT:    mulli 3, 4, 1463
+; PPC-NEXT:    addi 4, 5, -1638
+; PPC-NEXT:    addi 3, 3, -1463
 ; PPC-NEXT:    clrlwi 4, 4, 21
-; PPC-NEXT:    ori 6, 6, 28087
-; PPC-NEXT:    lis 5, -21846
-; PPC-NEXT:    mullw 4, 4, 6
-; PPC-NEXT:    lis 6, 9362
 ; PPC-NEXT:    clrlwi 3, 3, 21
-; PPC-NEXT:    ori 5, 5, 43691
-; PPC-NEXT:    addi 4, 4, -28087
-; PPC-NEXT:    ori 6, 6, 18724
-; PPC-NEXT:    mulhwu 5, 3, 5
-; PPC-NEXT:    addis 4, 4, 18725
-; PPC-NEXT:    cmplw 1, 4, 6
-; PPC-NEXT:    srwi 4, 5, 2
-; PPC-NEXT:    li 6, 0
-; PPC-NEXT:    li 7, 1
-; PPC-NEXT:    mulli 4, 4, 6
-; PPC-NEXT:    sub 3, 3, 4
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    not 3, 3
-; PPC-NEXT:    bc 12, 5, .LBB4_2
+; PPC-NEXT:    cmplwi 1, 4, 1
+; PPC-NEXT:    cmplwi 5, 3, 292
+; PPC-NEXT:    li 3, 1
+; PPC-NEXT:    bc 12, 21, .LBB4_2
 ; PPC-NEXT:  # %bb.1:
 ; PPC-NEXT:    ori 4, 6, 0
 ; PPC-NEXT:    b .LBB4_3
 ; PPC-NEXT:  .LBB4_2:
-; PPC-NEXT:    addi 4, 7, 0
+; PPC-NEXT:    addi 4, 3, 0
 ; PPC-NEXT:  .LBB4_3:
-; PPC-NEXT:    bc 12, 1, .LBB4_5
+; PPC-NEXT:    bc 12, 5, .LBB4_5
 ; PPC-NEXT:  # %bb.4:
 ; PPC-NEXT:    ori 5, 6, 0
 ; PPC-NEXT:    b .LBB4_6
 ; PPC-NEXT:  .LBB4_5:
-; PPC-NEXT:    addi 5, 7, 0
+; PPC-NEXT:    addi 5, 3, 0
 ; PPC-NEXT:  .LBB4_6:
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    bclr 12, 1, 0
+; PPC-NEXT:  # %bb.7:
+; PPC-NEXT:    ori 3, 6, 0
 ; PPC-NEXT:    blr
 ;
 ; PPC64LE-LABEL: test_urem_vec:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lis 6, 9362
-; PPC64LE-NEXT:    lis 7, -21846
-; PPC64LE-NEXT:    clrlwi 4, 4, 21
-; PPC64LE-NEXT:    clrlwi 3, 3, 21
-; PPC64LE-NEXT:    lis 8, 160
-; PPC64LE-NEXT:    clrlwi 5, 5, 21
-; PPC64LE-NEXT:    ori 6, 6, 18725
-; PPC64LE-NEXT:    ori 7, 7, 43691
-; PPC64LE-NEXT:    ori 8, 8, 25663
-; PPC64LE-NEXT:    vspltisw 4, -11
-; PPC64LE-NEXT:    mulhwu 6, 4, 6
-; PPC64LE-NEXT:    mulhwu 7, 3, 7
-; PPC64LE-NEXT:    mulhwu 8, 5, 8
-; PPC64LE-NEXT:    sub 9, 4, 6
-; PPC64LE-NEXT:    srwi 7, 7, 2
-; PPC64LE-NEXT:    srwi 9, 9, 1
-; PPC64LE-NEXT:    mulli 7, 7, 6
-; PPC64LE-NEXT:    add 6, 9, 6
-; PPC64LE-NEXT:    srwi 9, 6, 2
-; PPC64LE-NEXT:    rlwinm 6, 6, 1, 0, 28
-; PPC64LE-NEXT:    sub 6, 9, 6
-; PPC64LE-NEXT:    sub 9, 5, 8
-; PPC64LE-NEXT:    add 4, 4, 6
-; PPC64LE-NEXT:    srwi 6, 9, 1
-; PPC64LE-NEXT:    sub 3, 3, 7
-; PPC64LE-NEXT:    add 6, 6, 8
-; PPC64LE-NEXT:    mtvsrwz 34, 4
-; PPC64LE-NEXT:    srwi 4, 6, 10
-; PPC64LE-NEXT:    mtvsrwz 35, 3
-; PPC64LE-NEXT:    mulli 3, 4, 2043
-; PPC64LE-NEXT:    addis 4, 2, .LCPI4_0 at toc@ha
-; PPC64LE-NEXT:    vmrghw 2, 2, 3
-; PPC64LE-NEXT:    addi 4, 4, .LCPI4_0 at toc@l
-; PPC64LE-NEXT:    lvx 3, 0, 4
-; PPC64LE-NEXT:    sub 3, 5, 3
-; PPC64LE-NEXT:    mtvsrwz 37, 3
+; PPC64LE-NEXT:    mtvsrwz 34, 3
+; PPC64LE-NEXT:    addis 3, 2, .LCPI4_0 at toc@ha
+; PPC64LE-NEXT:    mtvsrwz 35, 4
+; PPC64LE-NEXT:    addi 3, 3, .LCPI4_0 at toc@l
+; PPC64LE-NEXT:    addis 4, 2, .LCPI4_2 at toc@ha
+; PPC64LE-NEXT:    mtvsrwz 36, 5
+; PPC64LE-NEXT:    vmrghw 2, 3, 2
+; PPC64LE-NEXT:    lvx 3, 0, 3
 ; PPC64LE-NEXT:    addis 3, 2, .LCPI4_1 at toc@ha
 ; PPC64LE-NEXT:    addi 3, 3, .LCPI4_1 at toc@l
-; PPC64LE-NEXT:    vperm 2, 5, 2, 3
-; PPC64LE-NEXT:    vsrw 3, 4, 4
+; PPC64LE-NEXT:    vperm 2, 4, 2, 3
+; PPC64LE-NEXT:    vspltisw 3, -11
 ; PPC64LE-NEXT:    lvx 4, 0, 3
-; PPC64LE-NEXT:    xxland 34, 34, 35
-; PPC64LE-NEXT:    vcmpequw 2, 2, 4
-; PPC64LE-NEXT:    xxlnor 0, 34, 34
-; PPC64LE-NEXT:    xxswapd 1, 0
-; PPC64LE-NEXT:    xxsldwi 2, 0, 0, 1
-; PPC64LE-NEXT:    mffprwz 5, 0
-; PPC64LE-NEXT:    mffprwz 3, 1
-; PPC64LE-NEXT:    mffprwz 4, 2
+; PPC64LE-NEXT:    addi 3, 4, .LCPI4_2 at toc@l
+; PPC64LE-NEXT:    addis 4, 2, .LCPI4_4 at toc@ha
+; PPC64LE-NEXT:    lvx 5, 0, 3
+; PPC64LE-NEXT:    addis 3, 2, .LCPI4_3 at toc@ha
+; PPC64LE-NEXT:    addi 4, 4, .LCPI4_4 at toc@l
+; PPC64LE-NEXT:    addi 3, 3, .LCPI4_3 at toc@l
+; PPC64LE-NEXT:    vsrw 3, 3, 3
+; PPC64LE-NEXT:    vsubuwm 2, 2, 4
+; PPC64LE-NEXT:    lvx 4, 0, 3
+; PPC64LE-NEXT:    addis 3, 2, .LCPI4_5 at toc@ha
+; PPC64LE-NEXT:    addi 3, 3, .LCPI4_5 at toc@l
+; PPC64LE-NEXT:    vmuluwm 2, 2, 5
+; PPC64LE-NEXT:    lvx 5, 0, 4
+; PPC64LE-NEXT:    xxland 32, 34, 35
+; PPC64LE-NEXT:    vslw 2, 2, 4
+; PPC64LE-NEXT:    vsrw 4, 0, 5
+; PPC64LE-NEXT:    xxlor 0, 36, 34
+; PPC64LE-NEXT:    lvx 2, 0, 3
+; PPC64LE-NEXT:    xxland 35, 0, 35
+; PPC64LE-NEXT:    vcmpgtuw 2, 3, 2
+; PPC64LE-NEXT:    xxswapd 0, 34
+; PPC64LE-NEXT:    xxsldwi 1, 34, 34, 1
+; PPC64LE-NEXT:    mfvsrwz 5, 34
+; PPC64LE-NEXT:    mffprwz 3, 0
+; PPC64LE-NEXT:    mffprwz 4, 1
 ; PPC64LE-NEXT:    blr
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
   %cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
@@ -247,19 +213,35 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
 ; PPC-NEXT:    stwu 1, -16(1)
 ; PPC-NEXT:    mr 6, 5
 ; PPC-NEXT:    mr 5, 4
-; PPC-NEXT:    clrlwi 4, 3, 30
-; PPC-NEXT:    lis 3, 18838
-; PPC-NEXT:    ori 10, 3, 722
+; PPC-NEXT:    mr 4, 3
+; PPC-NEXT:    lis 3, 12057
+; PPC-NEXT:    lis 7, -12795
+; PPC-NEXT:    ori 9, 3, 37186
+; PPC-NEXT:    ori 10, 7, 40665
 ; PPC-NEXT:    li 3, 0
 ; PPC-NEXT:    li 7, 0
-; PPC-NEXT:    li 8, 0
-; PPC-NEXT:    li 9, 0
-; PPC-NEXT:    bl __umodti3
-; PPC-NEXT:    or 3, 5, 3
-; PPC-NEXT:    or 4, 6, 4
-; PPC-NEXT:    or 3, 4, 3
-; PPC-NEXT:    cntlzw 3, 3
-; PPC-NEXT:    rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT:    li 8, 2
+; PPC-NEXT:    bl __multi3
+; PPC-NEXT:    rotlwi 7, 6, 31
+; PPC-NEXT:    lis 3, -5526
+; PPC-NEXT:    rlwimi 7, 5, 31, 0, 0
+; PPC-NEXT:    rotlwi 5, 5, 31
+; PPC-NEXT:    rlwimi 5, 4, 31, 0, 0
+; PPC-NEXT:    ori 3, 3, 61135
+; PPC-NEXT:    cmplwi 1, 5, 13
+; PPC-NEXT:    cmplw 7, 3
+; PPC-NEXT:    rlwinm 4, 4, 31, 31, 31
+; PPC-NEXT:    crand 20, 6, 0
+; PPC-NEXT:    crandc 21, 4, 6
+; PPC-NEXT:    rlwimi. 4, 6, 1, 30, 30
+; PPC-NEXT:    cror 20, 20, 21
+; PPC-NEXT:    crnand 20, 2, 20
+; PPC-NEXT:    li 3, 1
+; PPC-NEXT:    bc 12, 20, .LBB5_1
+; PPC-NEXT:    b .LBB5_2
+; PPC-NEXT:  .LBB5_1:
+; PPC-NEXT:    li 3, 0
+; PPC-NEXT:  .LBB5_2:
 ; PPC-NEXT:    lwz 0, 20(1)
 ; PPC-NEXT:    addi 1, 1, 16
 ; PPC-NEXT:    mtlr 0
@@ -267,21 +249,28 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
 ;
 ; PPC64LE-LABEL: test_urem_oversized:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    mflr 0
-; PPC64LE-NEXT:    std 0, 16(1)
-; PPC64LE-NEXT:    stdu 1, -32(1)
-; PPC64LE-NEXT:    lis 5, 18838
-; PPC64LE-NEXT:    clrldi 4, 4, 62
-; PPC64LE-NEXT:    li 6, 0
-; PPC64LE-NEXT:    ori 5, 5, 722
-; PPC64LE-NEXT:    bl __umodti3
-; PPC64LE-NEXT:    nop
-; PPC64LE-NEXT:    or 3, 3, 4
-; PPC64LE-NEXT:    cntlzd 3, 3
-; PPC64LE-NEXT:    rldicl 3, 3, 58, 63
-; PPC64LE-NEXT:    addi 1, 1, 32
-; PPC64LE-NEXT:    ld 0, 16(1)
-; PPC64LE-NEXT:    mtlr 0
+; PPC64LE-NEXT:    lis 5, 6028
+; PPC64LE-NEXT:    ori 5, 5, 51361
+; PPC64LE-NEXT:    rldic 5, 5, 33, 2
+; PPC64LE-NEXT:    oris 5, 5, 52741
+; PPC64LE-NEXT:    ori 5, 5, 40665
+; PPC64LE-NEXT:    mulhdu 6, 3, 5
+; PPC64LE-NEXT:    mulld 4, 4, 5
+; PPC64LE-NEXT:    mulld 5, 3, 5
+; PPC64LE-NEXT:    sldi 3, 3, 1
+; PPC64LE-NEXT:    add 3, 6, 3
+; PPC64LE-NEXT:    add 3, 3, 4
+; PPC64LE-NEXT:    lis 4, -8538
+; PPC64LE-NEXT:    rotldi 6, 5, 63
+; PPC64LE-NEXT:    ori 4, 4, 44780
+; PPC64LE-NEXT:    rldimi 6, 3, 63, 0
+; PPC64LE-NEXT:    rlwinm 3, 3, 31, 31, 31
+; PPC64LE-NEXT:    rldicl 4, 4, 4, 28
+; PPC64LE-NEXT:    rlwimi. 3, 5, 1, 30, 30
+; PPC64LE-NEXT:    cmpld 1, 6, 4
+; PPC64LE-NEXT:    li 3, 1
+; PPC64LE-NEXT:    crnand 20, 2, 4
+; PPC64LE-NEXT:    isel 3, 0, 3, 20
 ; PPC64LE-NEXT:    blr
   %urem = urem i66 %X, 1234567890
   %cmp = icmp eq i66 %urem, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
index cafa74af1929..2798b67fa355 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
@@ -2,22 +2,23 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
 
 ; Test that the prepareSREMEqFold optimization doesn't crash on scalable
-; vector types. RVV doesn't have ROTR or ROTL operations so the optimization
-; itself doesn't kick in.
+; vector types.
 define <vscale x 4 x i1> @srem_eq_fold_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: srem_eq_fold_nxv4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a0, zero, 43
+; CHECK-NEXT:    addi a0, zero, -85
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vmulh.vx v25, v8, a0
-; CHECK-NEXT:    vadd.vi v25, v25, 0
-; CHECK-NEXT:    vsrl.vi v26, v25, 7
-; CHECK-NEXT:    vand.vi v26, v26, -1
-; CHECK-NEXT:    vadd.vv v25, v25, v26
-; CHECK-NEXT:    addi a0, zero, 6
-; CHECK-NEXT:    vmul.vx v25, v25, a0
-; CHECK-NEXT:    vsub.vv v25, v8, v25
-; CHECK-NEXT:    vmseq.vi v0, v25, 0
+; CHECK-NEXT:    vmul.vx v25, v8, a0
+; CHECK-NEXT:    addi a0, zero, 42
+; CHECK-NEXT:    vadd.vx v25, v25, a0
+; CHECK-NEXT:    vmv.v.i v26, 1
+; CHECK-NEXT:    vrsub.vi v27, v26, 0
+; CHECK-NEXT:    vand.vi v27, v27, 7
+; CHECK-NEXT:    vsll.vv v27, v25, v27
+; CHECK-NEXT:    vand.vi v26, v26, 7
+; CHECK-NEXT:    vsrl.vv v25, v25, v26
+; CHECK-NEXT:    vor.vv v25, v25, v27
+; CHECK-NEXT:    vmsleu.vx v0, v25, a0
 ; CHECK-NEXT:    ret
   %head_six = insertelement <vscale x 4 x i8> undef, i8 6, i32 0
   %splat_six = shufflevector <vscale x 4 x i8> %head_six, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 7a7766681995..d7344114c21a 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -11,11 +11,18 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    srai a0, a0, 3
-; RV32-NEXT:    addi a1, zero, 99
-; RV32-NEXT:    call __modsi3 at plt
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    lui a1, 128424
+; RV32-NEXT:    addi a1, a1, 331
+; RV32-NEXT:    call __mulsi3 at plt
+; RV32-NEXT:    lui a1, 662
+; RV32-NEXT:    addi a1, a1, -83
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    lui a1, 131072
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    lui a1, 1324
+; RV32-NEXT:    addi a1, a1, -165
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -24,100 +31,83 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    slli a0, a0, 35
-; RV64-NEXT:    srai a0, a0, 35
-; RV64-NEXT:    addi a1, zero, 99
-; RV64-NEXT:    call __moddi3 at plt
-; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    lui a1, 128424
+; RV64-NEXT:    addiw a1, a1, 331
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    lui a1, 662
+; RV64-NEXT:    addiw a1, a1, -83
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    lui a1, 131072
+; RV64-NEXT:    addiw a1, a1, -1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    lui a1, 1324
+; RV64-NEXT:    addiw a1, a1, -165
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; RV32M-LABEL: test_srem_odd:
 ; RV32M:       # %bb.0:
-; RV32M-NEXT:    slli a0, a0, 3
-; RV32M-NEXT:    srai a0, a0, 3
-; RV32M-NEXT:    lui a1, 783784
+; RV32M-NEXT:    lui a1, 128424
 ; RV32M-NEXT:    addi a1, a1, 331
 ; RV32M-NEXT:    mul a0, a0, a1
-; RV32M-NEXT:    lui a1, 5296
-; RV32M-NEXT:    addi a1, a1, -662
+; RV32M-NEXT:    lui a1, 662
+; RV32M-NEXT:    addi a1, a1, -83
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 10592
-; RV32M-NEXT:    addi a1, a1, -1323
+; RV32M-NEXT:    lui a1, 131072
+; RV32M-NEXT:    addi a1, a1, -1
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    lui a1, 1324
+; RV32M-NEXT:    addi a1, a1, -165
 ; RV32M-NEXT:    sltu a0, a0, a1
 ; RV32M-NEXT:    ret
 ;
 ; RV64M-LABEL: test_srem_odd:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    slli a0, a0, 35
-; RV64M-NEXT:    srai a0, a0, 35
-; RV64M-NEXT:    lui a1, 1048536
-; RV64M-NEXT:    addiw a1, a1, -331
-; RV64M-NEXT:    slli a1, a1, 15
-; RV64M-NEXT:    addi a1, a1, 331
-; RV64M-NEXT:    slli a1, a1, 15
-; RV64M-NEXT:    addi a1, a1, -331
-; RV64M-NEXT:    slli a1, a1, 15
-; RV64M-NEXT:    addi a1, a1, 331
+; RV64M-NEXT:    lui a1, 128424
+; RV64M-NEXT:    addiw a1, a1, 331
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    lui a1, 331
-; RV64M-NEXT:    addiw a1, a1, -41
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -1531
-; RV64M-NEXT:    slli a2, a1, 12
-; RV64M-NEXT:    addi a2, a2, 703
-; RV64M-NEXT:    slli a2, a2, 12
-; RV64M-NEXT:    addi a2, a2, 1448
-; RV64M-NEXT:    add a0, a0, a2
-; RV64M-NEXT:    slli a1, a1, 13
-; RV64M-NEXT:    addi a1, a1, 1407
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -1199
+; RV64M-NEXT:    lui a1, 662
+; RV64M-NEXT:    addiw a1, a1, -83
+; RV64M-NEXT:    add a0, a0, a1
+; RV64M-NEXT:    lui a1, 131072
+; RV64M-NEXT:    addiw a1, a1, -1
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    lui a1, 1324
+; RV64M-NEXT:    addiw a1, a1, -165
 ; RV64M-NEXT:    sltu a0, a0, a1
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_srem_odd:
 ; RV32MV:       # %bb.0:
-; RV32MV-NEXT:    slli a0, a0, 3
-; RV32MV-NEXT:    srai a0, a0, 3
-; RV32MV-NEXT:    lui a1, 783784
+; RV32MV-NEXT:    lui a1, 128424
 ; RV32MV-NEXT:    addi a1, a1, 331
 ; RV32MV-NEXT:    mul a0, a0, a1
-; RV32MV-NEXT:    lui a1, 5296
-; RV32MV-NEXT:    addi a1, a1, -662
+; RV32MV-NEXT:    lui a1, 662
+; RV32MV-NEXT:    addi a1, a1, -83
 ; RV32MV-NEXT:    add a0, a0, a1
-; RV32MV-NEXT:    lui a1, 10592
-; RV32MV-NEXT:    addi a1, a1, -1323
+; RV32MV-NEXT:    lui a1, 131072
+; RV32MV-NEXT:    addi a1, a1, -1
+; RV32MV-NEXT:    and a0, a0, a1
+; RV32MV-NEXT:    lui a1, 1324
+; RV32MV-NEXT:    addi a1, a1, -165
 ; RV32MV-NEXT:    sltu a0, a0, a1
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_srem_odd:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    slli a0, a0, 35
-; RV64MV-NEXT:    srai a0, a0, 35
-; RV64MV-NEXT:    lui a1, 1048536
-; RV64MV-NEXT:    addiw a1, a1, -331
-; RV64MV-NEXT:    slli a1, a1, 15
-; RV64MV-NEXT:    addi a1, a1, 331
-; RV64MV-NEXT:    slli a1, a1, 15
-; RV64MV-NEXT:    addi a1, a1, -331
-; RV64MV-NEXT:    slli a1, a1, 15
-; RV64MV-NEXT:    addi a1, a1, 331
+; RV64MV-NEXT:    lui a1, 128424
+; RV64MV-NEXT:    addiw a1, a1, 331
 ; RV64MV-NEXT:    mul a0, a0, a1
-; RV64MV-NEXT:    lui a1, 331
-; RV64MV-NEXT:    addiw a1, a1, -41
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1531
-; RV64MV-NEXT:    slli a2, a1, 12
-; RV64MV-NEXT:    addi a2, a2, 703
-; RV64MV-NEXT:    slli a2, a2, 12
-; RV64MV-NEXT:    addi a2, a2, 1448
-; RV64MV-NEXT:    add a0, a0, a2
-; RV64MV-NEXT:    slli a1, a1, 13
-; RV64MV-NEXT:    addi a1, a1, 1407
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1199
+; RV64MV-NEXT:    lui a1, 662
+; RV64MV-NEXT:    addiw a1, a1, -83
+; RV64MV-NEXT:    add a0, a0, a1
+; RV64MV-NEXT:    lui a1, 131072
+; RV64MV-NEXT:    addiw a1, a1, -1
+; RV64MV-NEXT:    and a0, a0, a1
+; RV64MV-NEXT:    lui a1, 1324
+; RV64MV-NEXT:    addiw a1, a1, -165
 ; RV64MV-NEXT:    sltu a0, a0, a1
 ; RV64MV-NEXT:    ret
   %srem = srem i29 %X, 99
@@ -423,46 +413,65 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    ld a1, 0(s0)
 ; RV64-NEXT:    slli a2, a0, 29
-; RV64-NEXT:    srai s2, a2, 31
+; RV64-NEXT:    srai s1, a2, 31
 ; RV64-NEXT:    slli a0, a0, 31
 ; RV64-NEXT:    srli a2, a1, 33
 ; RV64-NEXT:    or a0, a2, a0
 ; RV64-NEXT:    slli a0, a0, 31
-; RV64-NEXT:    srai s1, a0, 31
-; RV64-NEXT:    slli a0, a1, 31
 ; RV64-NEXT:    srai a0, a0, 31
-; RV64-NEXT:    addi a1, zero, 6
-; RV64-NEXT:    call __moddi3 at plt
-; RV64-NEXT:    mv s3, a0
+; RV64-NEXT:    slli a1, a1, 31
+; RV64-NEXT:    srai s2, a1, 31
 ; RV64-NEXT:    addi a1, zero, 7
 ; RV64-NEXT:    addi s5, zero, 7
+; RV64-NEXT:    call __moddi3 at plt
+; RV64-NEXT:    mv s3, a0
+; RV64-NEXT:    addi a1, zero, -5
 ; RV64-NEXT:    mv a0, s1
 ; RV64-NEXT:    call __moddi3 at plt
 ; RV64-NEXT:    mv s1, a0
-; RV64-NEXT:    addi a1, zero, -5
+; RV64-NEXT:    lui a0, 1026731
+; RV64-NEXT:    addiw a0, a0, -1365
+; RV64-NEXT:    slli a0, a0, 12
+; RV64-NEXT:    addi a0, a0, -1365
+; RV64-NEXT:    slli a0, a0, 12
+; RV64-NEXT:    addi a0, a0, -1365
+; RV64-NEXT:    slli a0, a0, 12
+; RV64-NEXT:    addi a1, a0, -1365
 ; RV64-NEXT:    mv a0, s2
-; RV64-NEXT:    call __moddi3 at plt
-; RV64-NEXT:    addi a0, a0, -2
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    addi a1, s1, -1
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    lui a1, 10923
+; RV64-NEXT:    addiw a1, a1, -1365
+; RV64-NEXT:    slli a1, a1, 12
+; RV64-NEXT:    addi a1, a1, -1365
+; RV64-NEXT:    slli a1, a1, 12
+; RV64-NEXT:    addi a1, a1, -1365
+; RV64-NEXT:    slli a1, a1, 12
+; RV64-NEXT:    addi a1, a1, -1366
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    slli a2, a0, 63
+; RV64-NEXT:    srli a0, a0, 1
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addi a1, s1, -2
 ; RV64-NEXT:    snez a1, a1
-; RV64-NEXT:    snez a2, s3
+; RV64-NEXT:    addi a2, s3, -1
+; RV64-NEXT:    snez a2, a2
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    neg a2, a2
-; RV64-NEXT:    neg a1, a1
-; RV64-NEXT:    neg a3, a0
+; RV64-NEXT:    neg a3, a1
 ; RV64-NEXT:    slli a4, s5, 32
 ; RV64-NEXT:    and a3, a3, a4
 ; RV64-NEXT:    srli a3, a3, 32
 ; RV64-NEXT:    sb a3, 12(s0)
-; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    slli a3, s4, 33
 ; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a1, a1, a3
-; RV64-NEXT:    srli a4, a1, 31
-; RV64-NEXT:    sub a0, a4, a0
-; RV64-NEXT:    sw a0, 8(s0)
-; RV64-NEXT:    and a0, a2, a3
-; RV64-NEXT:    slli a1, a1, 33
+; RV64-NEXT:    and a2, a2, a3
+; RV64-NEXT:    srli a4, a2, 31
+; RV64-NEXT:    sub a1, a4, a1
+; RV64-NEXT:    sw a1, 8(s0)
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    slli a1, a2, 33
 ; RV64-NEXT:    or a0, a0, a1
 ; RV64-NEXT:    sd a0, 0(s0)
 ; RV64-NEXT:    ld s5, 8(sp) # 8-byte Folded Reload
@@ -609,7 +618,11 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64M-NEXT:    slli a5, a2, 2
 ; RV64M-NEXT:    add a2, a5, a2
 ; RV64M-NEXT:    add a2, a4, a2
-; RV64M-NEXT:    lui a4, 10923
+; RV64M-NEXT:    addi a2, a2, -2
+; RV64M-NEXT:    snez a2, a2
+; RV64M-NEXT:    addi a1, a1, -1
+; RV64M-NEXT:    snez a1, a1
+; RV64M-NEXT:    lui a4, 1026731
 ; RV64M-NEXT:    addiw a4, a4, -1365
 ; RV64M-NEXT:    slli a4, a4, 12
 ; RV64M-NEXT:    addi a4, a4, -1365
@@ -617,17 +630,20 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
 ; RV64M-NEXT:    addi a4, a4, -1365
 ; RV64M-NEXT:    slli a4, a4, 12
 ; RV64M-NEXT:    addi a4, a4, -1365
-; RV64M-NEXT:    mulh a4, a3, a4
-; RV64M-NEXT:    srli a5, a4, 63
-; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    addi a5, zero, 6
-; RV64M-NEXT:    mul a4, a4, a5
-; RV64M-NEXT:    sub a3, a3, a4
-; RV64M-NEXT:    addi a2, a2, -2
-; RV64M-NEXT:    snez a2, a2
-; RV64M-NEXT:    addi a1, a1, -1
-; RV64M-NEXT:    snez a1, a1
-; RV64M-NEXT:    snez a3, a3
+; RV64M-NEXT:    mul a3, a3, a4
+; RV64M-NEXT:    lui a4, 10923
+; RV64M-NEXT:    addiw a4, a4, -1365
+; RV64M-NEXT:    slli a4, a4, 12
+; RV64M-NEXT:    addi a4, a4, -1365
+; RV64M-NEXT:    slli a4, a4, 12
+; RV64M-NEXT:    addi a4, a4, -1365
+; RV64M-NEXT:    slli a4, a4, 12
+; RV64M-NEXT:    addi a4, a4, -1366
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    slli a5, a3, 63
+; RV64M-NEXT:    srli a3, a3, 1
+; RV64M-NEXT:    or a3, a3, a5
+; RV64M-NEXT:    sltu a3, a4, a3
 ; RV64M-NEXT:    neg a1, a1
 ; RV64M-NEXT:    neg a4, a2
 ; RV64M-NEXT:    neg a3, a3

diff  --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index 2f06aa4ea4dd..ed92311f5bd8 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -11,12 +11,13 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    lui a1, 1
+; RV32-NEXT:    addi a1, a1, -819
+; RV32-NEXT:    call __mulsi3 at plt
 ; RV32-NEXT:    lui a1, 2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    addi a1, zero, 5
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    sltiu a0, a0, 1639
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -25,90 +26,59 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    lui a1, 1
+; RV64-NEXT:    addiw a1, a1, -819
+; RV64-NEXT:    call __muldi3 at plt
 ; RV64-NEXT:    lui a1, 2
 ; RV64-NEXT:    addiw a1, a1, -1
 ; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    addi a1, zero, 5
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    sltiu a0, a0, 1639
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; RV32M-LABEL: test_urem_odd:
 ; RV32M:       # %bb.0:
+; RV32M-NEXT:    lui a1, 1
+; RV32M-NEXT:    addi a1, a1, -819
+; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    lui a1, 2
 ; RV32M-NEXT:    addi a1, a1, -1
 ; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    lui a1, 838861
-; RV32M-NEXT:    addi a1, a1, -819
-; RV32M-NEXT:    mul a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 820
-; RV32M-NEXT:    sltu a0, a0, a1
+; RV32M-NEXT:    sltiu a0, a0, 1639
 ; RV32M-NEXT:    ret
 ;
 ; RV64M-LABEL: test_urem_odd:
 ; RV64M:       # %bb.0:
+; RV64M-NEXT:    lui a1, 1
+; RV64M-NEXT:    addiw a1, a1, -819
+; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    lui a1, 2
 ; RV64M-NEXT:    addiw a1, a1, -1
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 1035469
-; RV64M-NEXT:    addiw a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    lui a1, 13107
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 820
-; RV64M-NEXT:    sltu a0, a0, a1
+; RV64M-NEXT:    sltiu a0, a0, 1639
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_urem_odd:
 ; RV32MV:       # %bb.0:
+; RV32MV-NEXT:    lui a1, 1
+; RV32MV-NEXT:    addi a1, a1, -819
+; RV32MV-NEXT:    mul a0, a0, a1
 ; RV32MV-NEXT:    lui a1, 2
 ; RV32MV-NEXT:    addi a1, a1, -1
 ; RV32MV-NEXT:    and a0, a0, a1
-; RV32MV-NEXT:    lui a1, 838861
-; RV32MV-NEXT:    addi a1, a1, -819
-; RV32MV-NEXT:    mul a0, a0, a1
-; RV32MV-NEXT:    lui a1, 209715
-; RV32MV-NEXT:    addi a1, a1, 820
-; RV32MV-NEXT:    sltu a0, a0, a1
+; RV32MV-NEXT:    sltiu a0, a0, 1639
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_urem_odd:
 ; RV64MV:       # %bb.0:
+; RV64MV-NEXT:    lui a1, 1
+; RV64MV-NEXT:    addiw a1, a1, -819
+; RV64MV-NEXT:    mul a0, a0, a1
 ; RV64MV-NEXT:    lui a1, 2
 ; RV64MV-NEXT:    addiw a1, a1, -1
 ; RV64MV-NEXT:    and a0, a0, a1
-; RV64MV-NEXT:    lui a1, 1035469
-; RV64MV-NEXT:    addiw a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    mul a0, a0, a1
-; RV64MV-NEXT:    lui a1, 13107
-; RV64MV-NEXT:    addiw a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 820
-; RV64MV-NEXT:    sltu a0, a0, a1
+; RV64MV-NEXT:    sltiu a0, a0, 1639
 ; RV64MV-NEXT:    ret
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
@@ -120,12 +90,20 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    lui a1, 32768
-; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    lui a1, 28087
+; RV32-NEXT:    addi a1, a1, -585
+; RV32-NEXT:    call __mulsi3 at plt
+; RV32-NEXT:    slli a1, a0, 26
+; RV32-NEXT:    lui a2, 32768
+; RV32-NEXT:    addi a3, a2, -2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    addi a1, a2, -1
 ; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    addi a1, zero, 14
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    lui a1, 2341
+; RV32-NEXT:    addi a1, a1, -1755
+; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -134,90 +112,94 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    lui a1, 32768
-; RV64-NEXT:    addiw a1, a1, -1
+; RV64-NEXT:    lui a1, 28087
+; RV64-NEXT:    addiw a1, a1, -585
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    slli a1, a0, 26
+; RV64-NEXT:    lui a2, 32768
+; RV64-NEXT:    addiw a3, a2, -2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    srli a0, a0, 1
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    addiw a1, a2, -1
 ; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    addi a1, zero, 14
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    lui a1, 2341
+; RV64-NEXT:    addiw a1, a1, -1755
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; RV32M-LABEL: test_urem_even:
 ; RV32M:       # %bb.0:
-; RV32M-NEXT:    lui a1, 32768
-; RV32M-NEXT:    addi a1, a1, -1
+; RV32M-NEXT:    lui a1, 28087
+; RV32M-NEXT:    addi a1, a1, -585
+; RV32M-NEXT:    mul a0, a0, a1
+; RV32M-NEXT:    slli a1, a0, 26
+; RV32M-NEXT:    lui a2, 32768
+; RV32M-NEXT:    addi a3, a2, -2
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    srli a0, a0, 1
+; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    addi a1, a2, -1
 ; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 1
-; RV32M-NEXT:    lui a2, 599186
-; RV32M-NEXT:    addi a2, a2, 1171
-; RV32M-NEXT:    mulhu a1, a1, a2
-; RV32M-NEXT:    srli a1, a1, 2
-; RV32M-NEXT:    addi a2, zero, 14
-; RV32M-NEXT:    mul a1, a1, a2
-; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    seqz a0, a0
+; RV32M-NEXT:    lui a1, 2341
+; RV32M-NEXT:    addi a1, a1, -1755
+; RV32M-NEXT:    sltu a0, a0, a1
 ; RV32M-NEXT:    ret
 ;
 ; RV64M-LABEL: test_urem_even:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    lui a1, 32768
-; RV64M-NEXT:    addiw a1, a1, -1
+; RV64M-NEXT:    lui a1, 28087
+; RV64M-NEXT:    addiw a1, a1, -585
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    slli a1, a0, 26
+; RV64M-NEXT:    lui a2, 32768
+; RV64M-NEXT:    addiw a3, a2, -2
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, -1
 ; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 18725
-; RV64M-NEXT:    addiw a2, a2, -1755
-; RV64M-NEXT:    slli a2, a2, 12
-; RV64M-NEXT:    addi a2, a2, -1755
-; RV64M-NEXT:    slli a2, a2, 12
-; RV64M-NEXT:    addi a2, a2, -1755
-; RV64M-NEXT:    slli a2, a2, 12
-; RV64M-NEXT:    addi a2, a2, -1755
-; RV64M-NEXT:    mulhu a1, a1, a2
-; RV64M-NEXT:    srli a1, a1, 1
-; RV64M-NEXT:    addi a2, zero, 14
-; RV64M-NEXT:    mul a1, a1, a2
-; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    seqz a0, a0
+; RV64M-NEXT:    lui a1, 2341
+; RV64M-NEXT:    addiw a1, a1, -1755
+; RV64M-NEXT:    sltu a0, a0, a1
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_urem_even:
 ; RV32MV:       # %bb.0:
-; RV32MV-NEXT:    lui a1, 32768
-; RV32MV-NEXT:    addi a1, a1, -1
+; RV32MV-NEXT:    lui a1, 28087
+; RV32MV-NEXT:    addi a1, a1, -585
+; RV32MV-NEXT:    mul a0, a0, a1
+; RV32MV-NEXT:    slli a1, a0, 26
+; RV32MV-NEXT:    lui a2, 32768
+; RV32MV-NEXT:    addi a3, a2, -2
+; RV32MV-NEXT:    and a0, a0, a3
+; RV32MV-NEXT:    srli a0, a0, 1
+; RV32MV-NEXT:    or a0, a0, a1
+; RV32MV-NEXT:    addi a1, a2, -1
 ; RV32MV-NEXT:    and a0, a0, a1
-; RV32MV-NEXT:    srli a1, a0, 1
-; RV32MV-NEXT:    lui a2, 599186
-; RV32MV-NEXT:    addi a2, a2, 1171
-; RV32MV-NEXT:    mulhu a1, a1, a2
-; RV32MV-NEXT:    srli a1, a1, 2
-; RV32MV-NEXT:    addi a2, zero, 14
-; RV32MV-NEXT:    mul a1, a1, a2
-; RV32MV-NEXT:    sub a0, a0, a1
-; RV32MV-NEXT:    seqz a0, a0
+; RV32MV-NEXT:    lui a1, 2341
+; RV32MV-NEXT:    addi a1, a1, -1755
+; RV32MV-NEXT:    sltu a0, a0, a1
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_urem_even:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    lui a1, 32768
-; RV64MV-NEXT:    addiw a1, a1, -1
+; RV64MV-NEXT:    lui a1, 28087
+; RV64MV-NEXT:    addiw a1, a1, -585
+; RV64MV-NEXT:    mul a0, a0, a1
+; RV64MV-NEXT:    slli a1, a0, 26
+; RV64MV-NEXT:    lui a2, 32768
+; RV64MV-NEXT:    addiw a3, a2, -2
+; RV64MV-NEXT:    and a0, a0, a3
+; RV64MV-NEXT:    srli a0, a0, 1
+; RV64MV-NEXT:    or a0, a0, a1
+; RV64MV-NEXT:    addiw a1, a2, -1
 ; RV64MV-NEXT:    and a0, a0, a1
-; RV64MV-NEXT:    srli a1, a0, 1
-; RV64MV-NEXT:    lui a2, 18725
-; RV64MV-NEXT:    addiw a2, a2, -1755
-; RV64MV-NEXT:    slli a2, a2, 12
-; RV64MV-NEXT:    addi a2, a2, -1755
-; RV64MV-NEXT:    slli a2, a2, 12
-; RV64MV-NEXT:    addi a2, a2, -1755
-; RV64MV-NEXT:    slli a2, a2, 12
-; RV64MV-NEXT:    addi a2, a2, -1755
-; RV64MV-NEXT:    mulhu a1, a1, a2
-; RV64MV-NEXT:    srli a1, a1, 1
-; RV64MV-NEXT:    addi a2, zero, 14
-; RV64MV-NEXT:    mul a1, a1, a2
-; RV64MV-NEXT:    sub a0, a0, a1
-; RV64MV-NEXT:    seqz a0, a0
+; RV64MV-NEXT:    lui a1, 2341
+; RV64MV-NEXT:    addiw a1, a1, -1755
+; RV64MV-NEXT:    sltu a0, a0, a1
 ; RV64MV-NEXT:    ret
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
@@ -227,93 +209,61 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; RV32-LABEL: test_urem_odd_setne:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    neg a0, a0
 ; RV32-NEXT:    andi a0, a0, 15
-; RV32-NEXT:    addi a1, zero, 5
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    addi a1, zero, 3
+; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_odd_setne:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    andi a0, a0, 15
-; RV64-NEXT:    addi a1, zero, 5
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    addi a1, zero, 3
+; RV64-NEXT:    sltu a0, a1, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32M-LABEL: test_urem_odd_setne:
 ; RV32M:       # %bb.0:
+; RV32M-NEXT:    slli a1, a0, 1
+; RV32M-NEXT:    add a0, a1, a0
+; RV32M-NEXT:    neg a0, a0
 ; RV32M-NEXT:    andi a0, a0, 15
-; RV32M-NEXT:    lui a1, 838861
-; RV32M-NEXT:    addi a1, a1, -819
-; RV32M-NEXT:    mul a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
+; RV32M-NEXT:    addi a1, zero, 3
 ; RV32M-NEXT:    sltu a0, a1, a0
 ; RV32M-NEXT:    ret
 ;
 ; RV64M-LABEL: test_urem_odd_setne:
 ; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 1
+; RV64M-NEXT:    add a0, a1, a0
+; RV64M-NEXT:    neg a0, a0
 ; RV64M-NEXT:    andi a0, a0, 15
-; RV64M-NEXT:    lui a1, 1035469
-; RV64M-NEXT:    addiw a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -819
-; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    lui a1, 13107
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 819
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, 819
+; RV64M-NEXT:    addi a1, zero, 3
 ; RV64M-NEXT:    sltu a0, a1, a0
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_urem_odd_setne:
 ; RV32MV:       # %bb.0:
+; RV32MV-NEXT:    slli a1, a0, 1
+; RV32MV-NEXT:    add a0, a1, a0
+; RV32MV-NEXT:    neg a0, a0
 ; RV32MV-NEXT:    andi a0, a0, 15
-; RV32MV-NEXT:    lui a1, 838861
-; RV32MV-NEXT:    addi a1, a1, -819
-; RV32MV-NEXT:    mul a0, a0, a1
-; RV32MV-NEXT:    lui a1, 209715
-; RV32MV-NEXT:    addi a1, a1, 819
+; RV32MV-NEXT:    addi a1, zero, 3
 ; RV32MV-NEXT:    sltu a0, a1, a0
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_urem_odd_setne:
 ; RV64MV:       # %bb.0:
+; RV64MV-NEXT:    slli a1, a0, 1
+; RV64MV-NEXT:    add a0, a1, a0
+; RV64MV-NEXT:    neg a0, a0
 ; RV64MV-NEXT:    andi a0, a0, 15
-; RV64MV-NEXT:    lui a1, 1035469
-; RV64MV-NEXT:    addiw a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -819
-; RV64MV-NEXT:    mul a0, a0, a1
-; RV64MV-NEXT:    lui a1, 13107
-; RV64MV-NEXT:    addiw a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 819
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 819
+; RV64MV-NEXT:    addi a1, zero, 3
 ; RV64MV-NEXT:    sltu a0, a1, a0
 ; RV64MV-NEXT:    ret
   %urem = urem i4 %X, 5
@@ -326,10 +276,11 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi a1, zero, 307
+; RV32-NEXT:    call __mulsi3 at plt
 ; RV32-NEXT:    andi a0, a0, 511
-; RV32-NEXT:    addi a1, zero, 507
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    addi a1, zero, 1
+; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
@@ -338,75 +289,48 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi a1, zero, 307
+; RV64-NEXT:    call __muldi3 at plt
 ; RV64-NEXT:    andi a0, a0, 511
-; RV64-NEXT:    addi a1, zero, 507
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    addi a1, zero, 1
+; RV64-NEXT:    sltu a0, a1, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
 ;
 ; RV32M-LABEL: test_urem_negative_odd:
 ; RV32M:       # %bb.0:
-; RV32M-NEXT:    andi a0, a0, 511
-; RV32M-NEXT:    lui a1, 692846
-; RV32M-NEXT:    addi a1, a1, 307
+; RV32M-NEXT:    addi a1, zero, 307
 ; RV32M-NEXT:    mul a0, a0, a1
-; RV32M-NEXT:    lui a1, 2068
-; RV32M-NEXT:    addi a1, a1, 807
+; RV32M-NEXT:    andi a0, a0, 511
+; RV32M-NEXT:    addi a1, zero, 1
 ; RV32M-NEXT:    sltu a0, a1, a0
 ; RV32M-NEXT:    ret
 ;
 ; RV64M-LABEL: test_urem_negative_odd:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    andi a0, a0, 511
-; RV64M-NEXT:    lui a1, 1042824
-; RV64M-NEXT:    addiw a1, a1, -711
-; RV64M-NEXT:    slli a1, a1, 13
-; RV64M-NEXT:    addi a1, a1, 469
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -1737
-; RV64M-NEXT:    slli a1, a1, 13
-; RV64M-NEXT:    addi a1, a1, 307
+; RV64M-NEXT:    addi a1, zero, 307
 ; RV64M-NEXT:    mul a0, a0, a1
-; RV64M-NEXT:    lui a1, 132365
-; RV64M-NEXT:    addiw a1, a1, -1543
-; RV64M-NEXT:    slli a1, a1, 14
-; RV64M-NEXT:    addi a1, a1, -1131
-; RV64M-NEXT:    slli a1, a1, 12
-; RV64M-NEXT:    addi a1, a1, -186
+; RV64M-NEXT:    andi a0, a0, 511
+; RV64M-NEXT:    addi a1, zero, 1
 ; RV64M-NEXT:    sltu a0, a1, a0
 ; RV64M-NEXT:    ret
 ;
 ; RV32MV-LABEL: test_urem_negative_odd:
 ; RV32MV:       # %bb.0:
-; RV32MV-NEXT:    andi a0, a0, 511
-; RV32MV-NEXT:    lui a1, 692846
-; RV32MV-NEXT:    addi a1, a1, 307
+; RV32MV-NEXT:    addi a1, zero, 307
 ; RV32MV-NEXT:    mul a0, a0, a1
-; RV32MV-NEXT:    lui a1, 2068
-; RV32MV-NEXT:    addi a1, a1, 807
+; RV32MV-NEXT:    andi a0, a0, 511
+; RV32MV-NEXT:    addi a1, zero, 1
 ; RV32MV-NEXT:    sltu a0, a1, a0
 ; RV32MV-NEXT:    ret
 ;
 ; RV64MV-LABEL: test_urem_negative_odd:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    andi a0, a0, 511
-; RV64MV-NEXT:    lui a1, 1042824
-; RV64MV-NEXT:    addiw a1, a1, -711
-; RV64MV-NEXT:    slli a1, a1, 13
-; RV64MV-NEXT:    addi a1, a1, 469
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1737
-; RV64MV-NEXT:    slli a1, a1, 13
-; RV64MV-NEXT:    addi a1, a1, 307
+; RV64MV-NEXT:    addi a1, zero, 307
 ; RV64MV-NEXT:    mul a0, a0, a1
-; RV64MV-NEXT:    lui a1, 132365
-; RV64MV-NEXT:    addiw a1, a1, -1543
-; RV64MV-NEXT:    slli a1, a1, 14
-; RV64MV-NEXT:    addi a1, a1, -1131
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -186
+; RV64MV-NEXT:    andi a0, a0, 511
+; RV64MV-NEXT:    addi a1, zero, 1
 ; RV64MV-NEXT:    sltu a0, a1, a0
 ; RV64MV-NEXT:    ret
   %urem = urem i9 %X, -5
@@ -428,38 +352,44 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV32-NEXT:    lw a1, 0(s0)
 ; RV32-NEXT:    slli a0, a0, 10
 ; RV32-NEXT:    srli a2, a1, 22
-; RV32-NEXT:    or a0, a2, a0
-; RV32-NEXT:    andi s2, a0, 2047
-; RV32-NEXT:    andi s1, a1, 2047
-; RV32-NEXT:    srli a0, a1, 11
+; RV32-NEXT:    or s1, a2, a0
+; RV32-NEXT:    srli s2, a1, 11
+; RV32-NEXT:    andi a0, a1, 2047
+; RV32-NEXT:    addi a1, zero, 683
+; RV32-NEXT:    call __mulsi3 at plt
+; RV32-NEXT:    slli a1, a0, 10
+; RV32-NEXT:    andi a0, a0, 2046
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    andi a0, a0, 2047
-; RV32-NEXT:    addi a1, zero, 7
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    mv s3, a0
-; RV32-NEXT:    addi a1, zero, 6
+; RV32-NEXT:    addi a1, zero, 341
+; RV32-NEXT:    sltu s3, a1, a0
+; RV32-NEXT:    addi a1, zero, 819
 ; RV32-NEXT:    mv a0, s1
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    mv s1, a0
-; RV32-NEXT:    addi a1, zero, 2043
+; RV32-NEXT:    call __mulsi3 at plt
+; RV32-NEXT:    addi a0, a0, -1638
+; RV32-NEXT:    andi a0, a0, 2047
+; RV32-NEXT:    addi a1, zero, 1
+; RV32-NEXT:    sltu s1, a1, a0
+; RV32-NEXT:    addi a1, zero, 1463
 ; RV32-NEXT:    mv a0, s2
-; RV32-NEXT:    call __umodsi3 at plt
-; RV32-NEXT:    addi a0, a0, -2
-; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    snez a1, s1
-; RV32-NEXT:    addi a2, s3, -1
-; RV32-NEXT:    snez a2, a2
-; RV32-NEXT:    neg a2, a2
-; RV32-NEXT:    neg a1, a1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    srli a3, a3, 10
-; RV32-NEXT:    andi a3, a3, 1
-; RV32-NEXT:    sb a3, 4(s0)
+; RV32-NEXT:    call __mulsi3 at plt
+; RV32-NEXT:    addi a0, a0, -1463
+; RV32-NEXT:    andi a0, a0, 2047
+; RV32-NEXT:    addi a1, zero, 292
+; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    neg a1, s3
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    neg a2, s1
+; RV32-NEXT:    srli a2, a2, 10
+; RV32-NEXT:    andi a2, a2, 1
+; RV32-NEXT:    sb a2, 4(s0)
 ; RV32-NEXT:    andi a1, a1, 2047
-; RV32-NEXT:    andi a2, a2, 2047
-; RV32-NEXT:    slli a2, a2, 11
-; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a0, a0, 22
-; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    andi a0, a0, 2047
+; RV32-NEXT:    slli a0, a0, 11
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    slli a1, s1, 22
+; RV32-NEXT:    sub a0, a0, a1
 ; RV32-NEXT:    sw a0, 0(s0)
 ; RV32-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
@@ -477,45 +407,53 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV64-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lbu a0, 4(a0)
 ; RV64-NEXT:    lwu a1, 0(s0)
 ; RV64-NEXT:    slli a0, a0, 32
 ; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    srli s2, a0, 22
-; RV64-NEXT:    andi s1, a0, 2047
-; RV64-NEXT:    srli a0, a0, 11
+; RV64-NEXT:    srli s2, a0, 11
+; RV64-NEXT:    srli s1, a0, 22
 ; RV64-NEXT:    andi a0, a0, 2047
-; RV64-NEXT:    addi a1, zero, 7
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    mv s3, a0
-; RV64-NEXT:    addi a1, zero, 6
+; RV64-NEXT:    addi a1, zero, 683
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    slli a1, a0, 10
+; RV64-NEXT:    andi a0, a0, 2046
+; RV64-NEXT:    srli a0, a0, 1
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    addi a1, zero, 341
+; RV64-NEXT:    sltu s3, a1, a0
+; RV64-NEXT:    addi a1, zero, 819
 ; RV64-NEXT:    mv a0, s1
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    mv s1, a0
-; RV64-NEXT:    addi a1, zero, 2043
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    addi a0, a0, -1638
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    addi s4, zero, 1
+; RV64-NEXT:    sltu s1, s4, a0
+; RV64-NEXT:    addi a1, zero, 1463
 ; RV64-NEXT:    mv a0, s2
-; RV64-NEXT:    call __umoddi3 at plt
-; RV64-NEXT:    addi a0, a0, -2
-; RV64-NEXT:    snez a0, a0
-; RV64-NEXT:    snez a1, s1
-; RV64-NEXT:    addi a2, s3, -1
-; RV64-NEXT:    snez a2, a2
-; RV64-NEXT:    neg a2, a2
-; RV64-NEXT:    neg a1, a1
+; RV64-NEXT:    call __muldi3 at plt
+; RV64-NEXT:    addi a0, a0, -1463
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    addi a1, zero, 292
+; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    neg a1, s3
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    andi a1, a1, 2047
-; RV64-NEXT:    andi a2, a2, 2047
-; RV64-NEXT:    slli a2, a2, 11
-; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a0, a0, 22
-; RV64-NEXT:    sub a0, a1, a0
+; RV64-NEXT:    andi a0, a0, 2047
+; RV64-NEXT:    slli a0, a0, 11
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    slli a1, s1, 22
+; RV64-NEXT:    sub a0, a0, a1
 ; RV64-NEXT:    sw a0, 0(s0)
-; RV64-NEXT:    addi a1, zero, 1
-; RV64-NEXT:    slli a1, a1, 33
+; RV64-NEXT:    slli a1, s4, 33
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    srli a0, a0, 32
 ; RV64-NEXT:    sb a0, 4(s0)
+; RV64-NEXT:    ld s4, 0(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s3, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s2, 16(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -531,45 +469,38 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV32M-NEXT:    slli a1, a1, 10
 ; RV32M-NEXT:    srli a3, a2, 22
 ; RV32M-NEXT:    or a1, a3, a1
-; RV32M-NEXT:    andi a1, a1, 2047
 ; RV32M-NEXT:    srli a3, a2, 11
-; RV32M-NEXT:    andi a3, a3, 2047
 ; RV32M-NEXT:    andi a2, a2, 2047
-; RV32M-NEXT:    lui a4, 699051
-; RV32M-NEXT:    addi a4, a4, -1365
-; RV32M-NEXT:    mulhu a4, a2, a4
-; RV32M-NEXT:    srli a4, a4, 2
-; RV32M-NEXT:    addi a5, zero, 6
-; RV32M-NEXT:    mul a4, a4, a5
-; RV32M-NEXT:    sub a2, a2, a4
-; RV32M-NEXT:    lui a4, 536863
-; RV32M-NEXT:    addi a4, a4, -1229
+; RV32M-NEXT:    addi a4, zero, 683
+; RV32M-NEXT:    mul a2, a2, a4
+; RV32M-NEXT:    slli a4, a2, 10
+; RV32M-NEXT:    andi a2, a2, 2046
+; RV32M-NEXT:    srli a2, a2, 1
+; RV32M-NEXT:    or a2, a2, a4
+; RV32M-NEXT:    andi a2, a2, 2047
+; RV32M-NEXT:    addi a4, zero, 341
+; RV32M-NEXT:    sltu a2, a4, a2
+; RV32M-NEXT:    addi a4, zero, 819
 ; RV32M-NEXT:    mul a1, a1, a4
-; RV32M-NEXT:    lui a4, 1023427
-; RV32M-NEXT:    addi a4, a4, -1638
-; RV32M-NEXT:    add a1, a1, a4
-; RV32M-NEXT:    lui a4, 513
-; RV32M-NEXT:    addi a4, a4, 1036
+; RV32M-NEXT:    addi a1, a1, -1638
+; RV32M-NEXT:    andi a1, a1, 2047
+; RV32M-NEXT:    addi a4, zero, 1
 ; RV32M-NEXT:    sltu a1, a4, a1
-; RV32M-NEXT:    lui a4, 748983
-; RV32M-NEXT:    addi a4, a4, -585
+; RV32M-NEXT:    addi a4, zero, 1463
 ; RV32M-NEXT:    mul a3, a3, a4
-; RV32M-NEXT:    lui a4, 299593
-; RV32M-NEXT:    addi a4, a4, 585
-; RV32M-NEXT:    add a3, a3, a4
-; RV32M-NEXT:    lui a4, 149797
-; RV32M-NEXT:    addi a4, a4, -1756
+; RV32M-NEXT:    addi a3, a3, -1463
+; RV32M-NEXT:    andi a3, a3, 2047
+; RV32M-NEXT:    addi a4, zero, 292
 ; RV32M-NEXT:    sltu a3, a4, a3
-; RV32M-NEXT:    snez a2, a2
 ; RV32M-NEXT:    neg a2, a2
 ; RV32M-NEXT:    neg a3, a3
 ; RV32M-NEXT:    neg a4, a1
 ; RV32M-NEXT:    srli a4, a4, 10
 ; RV32M-NEXT:    andi a4, a4, 1
 ; RV32M-NEXT:    sb a4, 4(a0)
+; RV32M-NEXT:    andi a2, a2, 2047
 ; RV32M-NEXT:    andi a3, a3, 2047
 ; RV32M-NEXT:    slli a3, a3, 11
-; RV32M-NEXT:    andi a2, a2, 2047
 ; RV32M-NEXT:    or a2, a2, a3
 ; RV32M-NEXT:    slli a1, a1, 22
 ; RV32M-NEXT:    sub a1, a2, a1
@@ -583,75 +514,29 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV64M-NEXT:    slli a1, a1, 32
 ; RV64M-NEXT:    or a1, a2, a1
 ; RV64M-NEXT:    srli a2, a1, 11
-; RV64M-NEXT:    andi a2, a2, 2047
 ; RV64M-NEXT:    srli a3, a1, 22
 ; RV64M-NEXT:    andi a1, a1, 2047
-; RV64M-NEXT:    lui a4, 1026731
-; RV64M-NEXT:    addiw a4, a4, -1365
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -1365
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -1365
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -1365
-; RV64M-NEXT:    mulhu a4, a1, a4
-; RV64M-NEXT:    srli a4, a4, 2
-; RV64M-NEXT:    addi a5, zero, 6
-; RV64M-NEXT:    mul a4, a4, a5
-; RV64M-NEXT:    sub a1, a1, a4
-; RV64M-NEXT:    snez a1, a1
-; RV64M-NEXT:    lui a4, 14948
-; RV64M-NEXT:    addiw a4, a4, 2029
-; RV64M-NEXT:    slli a4, a4, 13
-; RV64M-NEXT:    addi a4, a4, -381
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 287
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -1229
+; RV64M-NEXT:    addi a4, zero, 683
+; RV64M-NEXT:    mul a1, a1, a4
+; RV64M-NEXT:    slli a4, a1, 10
+; RV64M-NEXT:    andi a1, a1, 2046
+; RV64M-NEXT:    srli a1, a1, 1
+; RV64M-NEXT:    or a1, a1, a4
+; RV64M-NEXT:    andi a1, a1, 2047
+; RV64M-NEXT:    addi a4, zero, 341
+; RV64M-NEXT:    sltu a1, a4, a1
+; RV64M-NEXT:    addi a4, zero, 819
 ; RV64M-NEXT:    mul a3, a3, a4
-; RV64M-NEXT:    lui a4, 1436
-; RV64M-NEXT:    addiw a4, a4, -2029
-; RV64M-NEXT:    slli a4, a4, 13
-; RV64M-NEXT:    addi a4, a4, 381
-; RV64M-NEXT:    slli a4, a4, 13
-; RV64M-NEXT:    addi a4, a4, -573
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -1638
-; RV64M-NEXT:    add a3, a3, a4
-; RV64M-NEXT:    lui a4, 16424
-; RV64M-NEXT:    addiw a4, a4, 401
-; RV64M-NEXT:    slli a4, a4, 14
-; RV64M-NEXT:    addi a4, a4, -345
-; RV64M-NEXT:    slli a4, a4, 13
-; RV64M-NEXT:    addi a4, a4, 1295
+; RV64M-NEXT:    addi a3, a3, -1638
+; RV64M-NEXT:    andi a3, a3, 2047
+; RV64M-NEXT:    addi a4, zero, 1
 ; RV64M-NEXT:    sltu a3, a4, a3
-; RV64M-NEXT:    lui a4, 28087
-; RV64M-NEXT:    addiw a4, a4, -585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, -585
-; RV64M-NEXT:    mul a2, a2, a4
-; RV64M-NEXT:    lui a4, 1020489
-; RV64M-NEXT:    addiw a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 585
-; RV64M-NEXT:    add a2, a2, a4
-; RV64M-NEXT:    lui a4, 4681
-; RV64M-NEXT:    addiw a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 12
-; RV64M-NEXT:    addi a4, a4, 585
-; RV64M-NEXT:    slli a4, a4, 13
-; RV64M-NEXT:    addi a4, a4, 1170
-; RV64M-NEXT:    sltu a2, a4, a2
+; RV64M-NEXT:    addi a5, zero, 1463
+; RV64M-NEXT:    mul a2, a2, a5
+; RV64M-NEXT:    addi a2, a2, -1463
+; RV64M-NEXT:    andi a2, a2, 2047
+; RV64M-NEXT:    addi a5, zero, 292
+; RV64M-NEXT:    sltu a2, a5, a2
 ; RV64M-NEXT:    neg a1, a1
 ; RV64M-NEXT:    neg a2, a2
 ; RV64M-NEXT:    andi a1, a1, 2047
@@ -661,8 +546,7 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV64M-NEXT:    slli a2, a3, 22
 ; RV64M-NEXT:    sub a1, a1, a2
 ; RV64M-NEXT:    sw a1, 0(a0)
-; RV64M-NEXT:    addi a2, zero, 1
-; RV64M-NEXT:    slli a2, a2, 33
+; RV64M-NEXT:    slli a2, a4, 33
 ; RV64M-NEXT:    addi a2, a2, -1
 ; RV64M-NEXT:    and a1, a1, a2
 ; RV64M-NEXT:    srli a1, a1, 32
@@ -672,56 +556,46 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV32MV-LABEL: test_urem_vec:
 ; RV32MV:       # %bb.0:
 ; RV32MV-NEXT:    addi sp, sp, -16
-; RV32MV-NEXT:    lb a1, 4(a0)
-; RV32MV-NEXT:    lw a2, 0(a0)
-; RV32MV-NEXT:    slli a1, a1, 10
-; RV32MV-NEXT:    srli a3, a2, 22
-; RV32MV-NEXT:    or a1, a3, a1
-; RV32MV-NEXT:    andi a1, a1, 2047
-; RV32MV-NEXT:    srli a3, a2, 11
-; RV32MV-NEXT:    andi a3, a3, 2047
-; RV32MV-NEXT:    andi a2, a2, 2047
-; RV32MV-NEXT:    lui a4, 699051
-; RV32MV-NEXT:    addi a4, a4, -1365
-; RV32MV-NEXT:    mulhu a4, a2, a4
-; RV32MV-NEXT:    srli a4, a4, 2
-; RV32MV-NEXT:    addi a5, zero, 6
-; RV32MV-NEXT:    mul a4, a4, a5
-; RV32MV-NEXT:    sub a2, a2, a4
+; RV32MV-NEXT:    lw a1, 0(a0)
+; RV32MV-NEXT:    andi a2, a1, 2047
 ; RV32MV-NEXT:    sh a2, 8(sp)
-; RV32MV-NEXT:    lui a2, 2566
-; RV32MV-NEXT:    addi a2, a2, 1087
-; RV32MV-NEXT:    mulhu a2, a1, a2
-; RV32MV-NEXT:    sub a4, a1, a2
-; RV32MV-NEXT:    srli a4, a4, 1
-; RV32MV-NEXT:    add a2, a4, a2
-; RV32MV-NEXT:    srli a2, a2, 10
-; RV32MV-NEXT:    addi a4, zero, 2043
-; RV32MV-NEXT:    mul a2, a2, a4
-; RV32MV-NEXT:    sub a1, a1, a2
+; RV32MV-NEXT:    srli a2, a1, 11
+; RV32MV-NEXT:    andi a2, a2, 2047
+; RV32MV-NEXT:    sh a2, 10(sp)
+; RV32MV-NEXT:    lb a2, 4(a0)
+; RV32MV-NEXT:    slli a2, a2, 10
+; RV32MV-NEXT:    srli a1, a1, 22
+; RV32MV-NEXT:    or a1, a1, a2
+; RV32MV-NEXT:    andi a1, a1, 2047
 ; RV32MV-NEXT:    sh a1, 12(sp)
-; RV32MV-NEXT:    lui a1, 149797
-; RV32MV-NEXT:    addi a1, a1, -1755
-; RV32MV-NEXT:    mulhu a1, a3, a1
-; RV32MV-NEXT:    sub a2, a3, a1
-; RV32MV-NEXT:    srli a2, a2, 1
-; RV32MV-NEXT:    add a1, a2, a1
-; RV32MV-NEXT:    srli a1, a1, 2
-; RV32MV-NEXT:    slli a2, a1, 3
-; RV32MV-NEXT:    sub a1, a1, a2
-; RV32MV-NEXT:    add a1, a3, a1
-; RV32MV-NEXT:    sh a1, 10(sp)
 ; RV32MV-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
 ; RV32MV-NEXT:    addi a1, sp, 8
 ; RV32MV-NEXT:    vle16.v v25, (a1)
 ; RV32MV-NEXT:    lui a1, %hi(.LCPI4_0)
 ; RV32MV-NEXT:    addi a1, a1, %lo(.LCPI4_0)
 ; RV32MV-NEXT:    vle16.v v26, (a1)
+; RV32MV-NEXT:    vid.v v27
+; RV32MV-NEXT:    vsub.vv v25, v25, v27
+; RV32MV-NEXT:    vmul.vv v25, v25, v26
+; RV32MV-NEXT:    vsll.vi v26, v25, 1
+; RV32MV-NEXT:    vmv.v.i v27, 10
+; RV32MV-NEXT:    addi a1, zero, 9
+; RV32MV-NEXT:    vmv.s.x v27, a1
+; RV32MV-NEXT:    vsll.vv v26, v26, v27
 ; RV32MV-NEXT:    addi a1, zero, 2047
 ; RV32MV-NEXT:    vand.vx v25, v25, a1
-; RV32MV-NEXT:    vmsne.vv v0, v25, v26
-; RV32MV-NEXT:    vmv.v.i v25, 0
-; RV32MV-NEXT:    vmerge.vim v25, v25, -1, v0
+; RV32MV-NEXT:    vmv.v.i v27, 0
+; RV32MV-NEXT:    addi a2, zero, 1
+; RV32MV-NEXT:    vmv1r.v v28, v27
+; RV32MV-NEXT:    vmv.s.x v28, a2
+; RV32MV-NEXT:    lui a2, %hi(.LCPI4_1)
+; RV32MV-NEXT:    addi a2, a2, %lo(.LCPI4_1)
+; RV32MV-NEXT:    vle16.v v29, (a2)
+; RV32MV-NEXT:    vsrl.vv v25, v25, v28
+; RV32MV-NEXT:    vor.vv v25, v25, v26
+; RV32MV-NEXT:    vand.vx v25, v25, a1
+; RV32MV-NEXT:    vmsltu.vv v0, v29, v25
+; RV32MV-NEXT:    vmerge.vim v25, v27, -1, v0
 ; RV32MV-NEXT:    vsetivli a1, 1, e16,m1,ta,mu
 ; RV32MV-NEXT:    vslidedown.vi v26, v25, 2
 ; RV32MV-NEXT:    vmv.x.s a1, v26
@@ -748,54 +622,12 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV64MV-NEXT:    lwu a2, 0(a0)
 ; RV64MV-NEXT:    slli a1, a1, 32
 ; RV64MV-NEXT:    or a1, a2, a1
-; RV64MV-NEXT:    srli a2, a1, 11
-; RV64MV-NEXT:    andi a2, a2, 2047
-; RV64MV-NEXT:    andi a3, a1, 2047
-; RV64MV-NEXT:    srli a1, a1, 22
-; RV64MV-NEXT:    lui a4, 1027
-; RV64MV-NEXT:    addiw a4, a4, -2023
-; RV64MV-NEXT:    slli a4, a4, 15
-; RV64MV-NEXT:    addi a4, a4, 2005
-; RV64MV-NEXT:    slli a4, a4, 12
-; RV64MV-NEXT:    addi a4, a4, -431
-; RV64MV-NEXT:    slli a4, a4, 13
-; RV64MV-NEXT:    addi a4, a4, -429
-; RV64MV-NEXT:    mulhu a4, a1, a4
-; RV64MV-NEXT:    srli a4, a4, 9
-; RV64MV-NEXT:    addi a5, zero, 2043
-; RV64MV-NEXT:    mul a4, a4, a5
-; RV64MV-NEXT:    sub a1, a1, a4
-; RV64MV-NEXT:    sh a1, 12(sp)
-; RV64MV-NEXT:    lui a1, 1026731
-; RV64MV-NEXT:    addiw a1, a1, -1365
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1365
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1365
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, -1365
-; RV64MV-NEXT:    mulhu a1, a3, a1
-; RV64MV-NEXT:    srli a1, a1, 2
-; RV64MV-NEXT:    addi a4, zero, 6
-; RV64MV-NEXT:    mul a1, a1, a4
-; RV64MV-NEXT:    sub a1, a3, a1
-; RV64MV-NEXT:    sh a1, 8(sp)
-; RV64MV-NEXT:    lui a1, 4681
-; RV64MV-NEXT:    addiw a1, a1, 585
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 585
-; RV64MV-NEXT:    slli a1, a1, 12
-; RV64MV-NEXT:    addi a1, a1, 585
-; RV64MV-NEXT:    slli a1, a1, 13
-; RV64MV-NEXT:    addi a1, a1, 1171
-; RV64MV-NEXT:    mulhu a1, a2, a1
-; RV64MV-NEXT:    sub a3, a2, a1
-; RV64MV-NEXT:    srli a3, a3, 1
-; RV64MV-NEXT:    add a1, a3, a1
-; RV64MV-NEXT:    srli a1, a1, 2
-; RV64MV-NEXT:    slli a3, a1, 3
-; RV64MV-NEXT:    sub a1, a1, a3
-; RV64MV-NEXT:    add a1, a2, a1
+; RV64MV-NEXT:    srli a2, a1, 22
+; RV64MV-NEXT:    sh a2, 12(sp)
+; RV64MV-NEXT:    andi a2, a1, 2047
+; RV64MV-NEXT:    sh a2, 8(sp)
+; RV64MV-NEXT:    srli a1, a1, 11
+; RV64MV-NEXT:    andi a1, a1, 2047
 ; RV64MV-NEXT:    sh a1, 10(sp)
 ; RV64MV-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
 ; RV64MV-NEXT:    addi a1, sp, 8
@@ -803,14 +635,30 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
 ; RV64MV-NEXT:    lui a1, %hi(.LCPI4_0)
 ; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI4_0)
 ; RV64MV-NEXT:    vle16.v v26, (a1)
+; RV64MV-NEXT:    vid.v v27
+; RV64MV-NEXT:    vsub.vv v25, v25, v27
+; RV64MV-NEXT:    vmul.vv v25, v25, v26
+; RV64MV-NEXT:    vsll.vi v26, v25, 1
+; RV64MV-NEXT:    vmv.v.i v27, 10
+; RV64MV-NEXT:    addi a1, zero, 9
+; RV64MV-NEXT:    vmv.s.x v27, a1
+; RV64MV-NEXT:    vsll.vv v26, v26, v27
 ; RV64MV-NEXT:    addi a1, zero, 2047
 ; RV64MV-NEXT:    vand.vx v25, v25, a1
-; RV64MV-NEXT:    vmsne.vv v0, v25, v26
-; RV64MV-NEXT:    vmv.v.i v25, 0
-; RV64MV-NEXT:    vmerge.vim v25, v25, -1, v0
+; RV64MV-NEXT:    vmv.v.i v27, 0
+; RV64MV-NEXT:    addi a2, zero, 1
+; RV64MV-NEXT:    vmv1r.v v28, v27
+; RV64MV-NEXT:    vmv.s.x v28, a2
+; RV64MV-NEXT:    lui a3, %hi(.LCPI4_1)
+; RV64MV-NEXT:    addi a3, a3, %lo(.LCPI4_1)
+; RV64MV-NEXT:    vle16.v v29, (a3)
+; RV64MV-NEXT:    vsrl.vv v25, v25, v28
+; RV64MV-NEXT:    vor.vv v25, v25, v26
+; RV64MV-NEXT:    vand.vx v25, v25, a1
+; RV64MV-NEXT:    vmsltu.vv v0, v29, v25
+; RV64MV-NEXT:    vmerge.vim v25, v27, -1, v0
 ; RV64MV-NEXT:    vmv.x.s a1, v25
 ; RV64MV-NEXT:    andi a1, a1, 2047
-; RV64MV-NEXT:    addi a2, zero, 1
 ; RV64MV-NEXT:    vsetivli a3, 1, e16,m1,ta,mu
 ; RV64MV-NEXT:    vslidedown.vi v26, v25, 1
 ; RV64MV-NEXT:    vmv.x.s a3, v26

diff  --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
index 2ec65eb9d771..2eac1f450dcc 100644
--- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -4,8 +4,6 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    lsls r0, r0, #3
-; CHECK-NEXT:    asrs r0, r0, #3
 ; CHECK-NEXT:    ldr r1, .LCPI0_0
 ; CHECK-NEXT:    muls r1, r0, r1
 ; CHECK-NEXT:    ldr r0, .LCPI0_1
@@ -22,11 +20,11 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI0_0:
-; CHECK-NEXT:    .long 3210379595 @ 0xbf5a814b
+; CHECK-NEXT:    .long 4208200280 @ 0xfad40a58
 ; CHECK-NEXT:  .LCPI0_1:
-; CHECK-NEXT:    .long 21691754 @ 0x14afd6a
+; CHECK-NEXT:    .long 21691752 @ 0x14afd68
 ; CHECK-NEXT:  .LCPI0_2:
-; CHECK-NEXT:    .long 43383509 @ 0x295fad5
+; CHECK-NEXT:    .long 43383512 @ 0x295fad8
   %srem = srem i29 %X, 99
   %cmp = icmp eq i29 %srem, 0
   ret i1 %cmp

diff  --git a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
index f1c05f759e3f..571f8e5026e5 100644
--- a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
@@ -5,11 +5,9 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    ldr r1, .LCPI0_0
-; CHECK-NEXT:    ands r1, r0
+; CHECK-NEXT:    muls r1, r0, r1
 ; CHECK-NEXT:    ldr r0, .LCPI0_1
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    ldr r1, .LCPI0_2
-; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    blo .LBB0_2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:    movs r0, #0
@@ -20,11 +18,9 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI0_0:
-; CHECK-NEXT:    .long 8191 @ 0x1fff
+; CHECK-NEXT:    .long 1718091776 @ 0x66680000
 ; CHECK-NEXT:  .LCPI0_1:
-; CHECK-NEXT:    .long 3435973837 @ 0xcccccccd
-; CHECK-NEXT:  .LCPI0_2:
-; CHECK-NEXT:    .long 858993460 @ 0x33333334
+; CHECK-NEXT:    .long 859308032 @ 0x33380000
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
   ret i1 %cmp
@@ -33,26 +29,31 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movs r1, #31
-; CHECK-NEXT:    lsls r1, r1, #27
-; CHECK-NEXT:    bics r0, r1
 ; CHECK-NEXT:    ldr r1, .LCPI1_0
 ; CHECK-NEXT:    muls r1, r0, r1
-; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    rors r1, r0
+; CHECK-NEXT:    lsls r0, r1, #26
 ; CHECK-NEXT:    ldr r2, .LCPI1_1
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    ands r2, r1
+; CHECK-NEXT:    lsrs r1, r2, #1
+; CHECK-NEXT:    adds r0, r1, r0
+; CHECK-NEXT:    lsls r0, r0, #5
+; CHECK-NEXT:    ldr r1, .LCPI1_2
+; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo .LBB1_2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI1_0:
-; CHECK-NEXT:    .long 3067833783 @ 0xb6db6db7
+; CHECK-NEXT:    .long 115043767 @ 0x6db6db7
 ; CHECK-NEXT:  .LCPI1_1:
-; CHECK-NEXT:    .long 306783379 @ 0x12492493
+; CHECK-NEXT:    .long 134217726 @ 0x7fffffe
+; CHECK-NEXT:  .LCPI1_2:
+; CHECK-NEXT:    .long 306783392 @ 0x124924a0
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
   ret i1 %cmp
@@ -61,12 +62,11 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_setne:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movs r1, #15
-; CHECK-NEXT:    ands r1, r0
-; CHECK-NEXT:    ldr r0, .LCPI2_0
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    ldr r1, .LCPI2_1
-; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    movs r1, #13
+; CHECK-NEXT:    muls r1, r0, r1
+; CHECK-NEXT:    movs r0, #15
+; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    cmp r0, #3
 ; CHECK-NEXT:    bhi .LBB2_2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:    movs r0, #0
@@ -74,12 +74,6 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.3:
-; CHECK-NEXT:  .LCPI2_0:
-; CHECK-NEXT:    .long 3435973837 @ 0xcccccccd
-; CHECK-NEXT:  .LCPI2_1:
-; CHECK-NEXT:    .long 858993459 @ 0x33333333
   %urem = urem i4 %X, 5
   %cmp = icmp ne i4 %urem, 0
   ret i1 %cmp
@@ -88,12 +82,12 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    ldr r1, .LCPI3_0
-; CHECK-NEXT:    ands r1, r0
-; CHECK-NEXT:    ldr r0, .LCPI3_1
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    ldr r1, .LCPI3_2
-; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    movs r1, #255
+; CHECK-NEXT:    adds r1, #52
+; CHECK-NEXT:    muls r1, r0, r1
+; CHECK-NEXT:    ldr r0, .LCPI3_0
+; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    cmp r0, #1
 ; CHECK-NEXT:    bhi .LBB3_2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:    movs r0, #0
@@ -105,10 +99,6 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI3_0:
 ; CHECK-NEXT:    .long 511 @ 0x1ff
-; CHECK-NEXT:  .LCPI3_1:
-; CHECK-NEXT:    .long 2837897523 @ 0xa926e133
-; CHECK-NEXT:  .LCPI3_2:
-; CHECK-NEXT:    .long 8471335 @ 0x814327
   %urem = urem i9 %X, -5
   %cmp = icmp ne i9 %urem, 0
   ret i1 %cmp
@@ -117,71 +107,73 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    movs r3, r2
-; CHECK-NEXT:    ldr r5, .LCPI4_0
-; CHECK-NEXT:    ands r0, r5
-; CHECK-NEXT:    ldr r6, .LCPI4_1
-; CHECK-NEXT:    muls r6, r0, r6
-; CHECK-NEXT:    movs r2, #1
-; CHECK-NEXT:    rors r6, r2
-; CHECK-NEXT:    ldr r0, .LCPI4_2
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    ldr r3, .LCPI4_0
+; CHECK-NEXT:    muls r3, r0, r3
+; CHECK-NEXT:    lsls r0, r3, #10
+; CHECK-NEXT:    ldr r4, .LCPI4_1
+; CHECK-NEXT:    ands r4, r3
+; CHECK-NEXT:    lsrs r3, r4, #1
+; CHECK-NEXT:    adds r0, r3, r0
+; CHECK-NEXT:    ldr r3, .LCPI4_2
+; CHECK-NEXT:    ands r3, r0
+; CHECK-NEXT:    lsrs r0, r3, #1
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    cmp r6, r0
-; CHECK-NEXT:    push {r2}
+; CHECK-NEXT:    cmp r0, #170
+; CHECK-NEXT:    push {r3}
 ; CHECK-NEXT:    pop {r0}
 ; CHECK-NEXT:    bhi .LBB4_2
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:    movs r0, r4
 ; CHECK-NEXT:  .LBB4_2:
-; CHECK-NEXT:    ands r1, r5
-; CHECK-NEXT:    ldr r6, .LCPI4_3
-; CHECK-NEXT:    muls r6, r1, r6
+; CHECK-NEXT:    ldr r5, .LCPI4_3
+; CHECK-NEXT:    muls r5, r1, r5
 ; CHECK-NEXT:    ldr r1, .LCPI4_4
-; CHECK-NEXT:    adds r1, r6, r1
-; CHECK-NEXT:    ldr r6, .LCPI4_5
-; CHECK-NEXT:    cmp r1, r6
-; CHECK-NEXT:    push {r2}
+; CHECK-NEXT:    adds r1, r5, r1
+; CHECK-NEXT:    movs r5, #73
+; CHECK-NEXT:    lsls r5, r5, #23
+; CHECK-NEXT:    cmp r1, r5
+; CHECK-NEXT:    push {r3}
 ; CHECK-NEXT:    pop {r1}
 ; CHECK-NEXT:    bhi .LBB4_4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:    movs r1, r4
 ; CHECK-NEXT:  .LBB4_4:
-; CHECK-NEXT:    ands r3, r5
-; CHECK-NEXT:    ldr r5, .LCPI4_6
-; CHECK-NEXT:    muls r5, r3, r5
-; CHECK-NEXT:    ldr r3, .LCPI4_7
-; CHECK-NEXT:    adds r3, r5, r3
-; CHECK-NEXT:    ldr r5, .LCPI4_8
-; CHECK-NEXT:    cmp r3, r5
+; CHECK-NEXT:    ldr r5, .LCPI4_5
+; CHECK-NEXT:    muls r5, r2, r5
+; CHECK-NEXT:    ldr r2, .LCPI4_6
+; CHECK-NEXT:    adds r2, r5, r2
+; CHECK-NEXT:    ldr r5, .LCPI4_7
+; CHECK-NEXT:    ands r5, r2
+; CHECK-NEXT:    cmp r5, #1
 ; CHECK-NEXT:    bhi .LBB4_6
 ; CHECK-NEXT:  @ %bb.5:
-; CHECK-NEXT:    movs r2, r4
+; CHECK-NEXT:    movs r3, r4
 ; CHECK-NEXT:  .LBB4_6:
-; CHECK-NEXT:    pop {r4, r5, r6}
+; CHECK-NEXT:    movs r2, r3
+; CHECK-NEXT:    pop {r4, r5, r7}
 ; CHECK-NEXT:    pop {r3}
 ; CHECK-NEXT:    bx r3
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.7:
 ; CHECK-NEXT:  .LCPI4_0:
-; CHECK-NEXT:    .long 2047 @ 0x7ff
+; CHECK-NEXT:    .long 683 @ 0x2ab
 ; CHECK-NEXT:  .LCPI4_1:
-; CHECK-NEXT:    .long 2863311531 @ 0xaaaaaaab
+; CHECK-NEXT:    .long 2044 @ 0x7fc
 ; CHECK-NEXT:  .LCPI4_2:
-; CHECK-NEXT:    .long 715827882 @ 0x2aaaaaaa
+; CHECK-NEXT:    .long 2046 @ 0x7fe
 ; CHECK-NEXT:  .LCPI4_3:
-; CHECK-NEXT:    .long 3067833783 @ 0xb6db6db7
+; CHECK-NEXT:    .long 3068133376 @ 0xb6e00000
 ; CHECK-NEXT:  .LCPI4_4:
-; CHECK-NEXT:    .long 1227133513 @ 0x49249249
+; CHECK-NEXT:    .long 1226833920 @ 0x49200000
 ; CHECK-NEXT:  .LCPI4_5:
-; CHECK-NEXT:    .long 613566756 @ 0x24924924
+; CHECK-NEXT:    .long 819 @ 0x333
 ; CHECK-NEXT:  .LCPI4_6:
-; CHECK-NEXT:    .long 2198989619 @ 0x8311eb33
+; CHECK-NEXT:    .long 4294965658 @ 0xfffff99a
 ; CHECK-NEXT:  .LCPI4_7:
-; CHECK-NEXT:    .long 4191955354 @ 0xf9dc299a
-; CHECK-NEXT:  .LCPI4_8:
-; CHECK-NEXT:    .long 2102284 @ 0x20140c
+; CHECK-NEXT:    .long 2047 @ 0x7ff
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
   %cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
   ret <3 x i1> %cmp

diff  --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
index 61734ae53c23..a0c3125cd7f8 100644
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -4,15 +4,15 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movw r1, #64874
+; CHECK-NEXT:    movw r1, #24493
 ; CHECK-NEXT:    movw r2, #33099
-; CHECK-NEXT:    sbfx r0, r0, #0, #29
-; CHECK-NEXT:    movt r1, #330
-; CHECK-NEXT:    movt r2, #48986
-; CHECK-NEXT:    mla r1, r0, r2, r1
-; CHECK-NEXT:    movw r2, #64213
+; CHECK-NEXT:    movt r1, #41
+; CHECK-NEXT:    movt r2, #8026
+; CHECK-NEXT:    mla r0, r0, r2, r1
+; CHECK-NEXT:    movw r2, #48987
+; CHECK-NEXT:    movt r2, #82
+; CHECK-NEXT:    bic r1, r0, #-536870912
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    movt r2, #661
 ; CHECK-NEXT:    cmp r1, r2
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1

diff  --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
index feb88de65054..a0247c29f257 100644
--- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
@@ -4,12 +4,12 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movw r1, #52429
-; CHECK-NEXT:    bfc r0, #13, #19
-; CHECK-NEXT:    movt r1, #52428
+; CHECK-NEXT:    movw r1, #3277
+; CHECK-NEXT:    movw r2, #1639
 ; CHECK-NEXT:    muls r1, r0, r1
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    cmn.w r1, #-858993460
+; CHECK-NEXT:    bfc r1, #13, #19
+; CHECK-NEXT:    cmp r1, r2
 ; CHECK-NEXT:    it lo
 ; CHECK-NEXT:    movlo r0, #1
 ; CHECK-NEXT:    bx lr
@@ -22,12 +22,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    movw r1, #28087
-; CHECK-NEXT:    bic r0, r0, #-134217728
-; CHECK-NEXT:    movt r1, #46811
-; CHECK-NEXT:    movw r2, #9363
+; CHECK-NEXT:    movw r2, #18725
+; CHECK-NEXT:    movt r1, #1755
+; CHECK-NEXT:    movt r2, #146
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    movt r2, #4681
-; CHECK-NEXT:    ror.w r1, r0, #1
+; CHECK-NEXT:    ubfx r1, r0, #1, #26
+; CHECK-NEXT:    orr.w r0, r1, r0, lsl #26
+; CHECK-NEXT:    bic r1, r0, #-134217728
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    cmp r1, r2
 ; CHECK-NEXT:    it lo
@@ -41,12 +42,11 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_setne:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movw r1, #52429
-; CHECK-NEXT:    and r0, r0, #15
-; CHECK-NEXT:    movt r1, #52428
-; CHECK-NEXT:    muls r1, r0, r1
+; CHECK-NEXT:    movs r1, #13
+; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    and r1, r0, #15
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    cmp.w r1, #858993459
+; CHECK-NEXT:    cmp r1, #3
 ; CHECK-NEXT:    it hi
 ; CHECK-NEXT:    movhi r0, #1
 ; CHECK-NEXT:    bx lr
@@ -58,14 +58,11 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    movw r1, #57651
-; CHECK-NEXT:    bfc r0, #9, #23
-; CHECK-NEXT:    movt r1, #43302
-; CHECK-NEXT:    movw r2, #17191
+; CHECK-NEXT:    movw r1, #307
 ; CHECK-NEXT:    muls r1, r0, r1
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    movt r2, #129
-; CHECK-NEXT:    cmp r1, r2
+; CHECK-NEXT:    bfc r1, #9, #23
+; CHECK-NEXT:    cmp r1, #1
 ; CHECK-NEXT:    it hi
 ; CHECK-NEXT:    movhi r0, #1
 ; CHECK-NEXT:    bx lr
@@ -77,50 +74,54 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    movw r3, #18725
-; CHECK-NEXT:    bfc r1, #11, #21
-; CHECK-NEXT:    movt r3, #9362
-; CHECK-NEXT:    bfc r2, #11, #21
-; CHECK-NEXT:    umull r3, r12, r1, r3
-; CHECK-NEXT:    bfc r0, #11, #21
-; CHECK-NEXT:    movw r3, #25663
-; CHECK-NEXT:    movt r3, #160
-; CHECK-NEXT:    umull r3, lr, r2, r3
-; CHECK-NEXT:    vldr d17, .LCPI4_0
-; CHECK-NEXT:    movw r3, #43691
-; CHECK-NEXT:    movt r3, #43690
-; CHECK-NEXT:    umull r3, r4, r0, r3
-; CHECK-NEXT:    sub.w r3, r1, r12
-; CHECK-NEXT:    add.w r3, r12, r3, lsr #1
-; CHECK-NEXT:    lsr.w r12, r3, #2
-; CHECK-NEXT:    sub.w r3, r2, lr
-; CHECK-NEXT:    lsrs r4, r4, #2
-; CHECK-NEXT:    add.w r4, r4, r4, lsl #1
-; CHECK-NEXT:    add.w r3, lr, r3, lsr #1
-; CHECK-NEXT:    sub.w r0, r0, r4, lsl #1
-; CHECK-NEXT:    lsr.w lr, r3, #10
-; CHECK-NEXT:    movw r3, #2043
 ; CHECK-NEXT:    vmov.16 d16[0], r0
-; CHECK-NEXT:    sub.w r0, r12, r12, lsl #3
-; CHECK-NEXT:    mls r2, lr, r3, r2
-; CHECK-NEXT:    add r0, r1
-; CHECK-NEXT:    vmov.16 d16[1], r0
+; CHECK-NEXT:    vldr d17, .LCPI4_0
+; CHECK-NEXT:    vmov.16 d16[1], r1
+; CHECK-NEXT:    vldr d19, .LCPI4_3
 ; CHECK-NEXT:    vmov.16 d16[2], r2
+; CHECK-NEXT:    vsub.i16 d16, d16, d17
+; CHECK-NEXT:    vldr d17, .LCPI4_1
+; CHECK-NEXT:    vmul.i16 d16, d16, d17
+; CHECK-NEXT:    vldr d17, .LCPI4_2
+; CHECK-NEXT:    vneg.s16 d17, d17
+; CHECK-NEXT:    vshl.i16 d18, d16, #1
+; CHECK-NEXT:    vbic.i16 d16, #0xf800
+; CHECK-NEXT:    vshl.u16 d16, d16, d17
+; CHECK-NEXT:    vshl.u16 d17, d18, d19
+; CHECK-NEXT:    vorr d16, d16, d17
+; CHECK-NEXT:    vldr d17, .LCPI4_4
 ; CHECK-NEXT:    vbic.i16 d16, #0xf800
-; CHECK-NEXT:    vceq.i16 d16, d16, d17
-; CHECK-NEXT:    vmvn d16, d16
+; CHECK-NEXT:    vcgt.u16 d16, d16, d17
 ; CHECK-NEXT:    vmov.u16 r0, d16[0]
 ; CHECK-NEXT:    vmov.u16 r1, d16[1]
 ; CHECK-NEXT:    vmov.u16 r2, d16[2]
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 3
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI4_0:
 ; CHECK-NEXT:    .short 0 @ 0x0
 ; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 2 @ 0x2
+; CHECK-NEXT:    .zero 2
+; CHECK-NEXT:  .LCPI4_1:
+; CHECK-NEXT:    .short 683 @ 0x2ab
+; CHECK-NEXT:    .short 1463 @ 0x5b7
+; CHECK-NEXT:    .short 819 @ 0x333
+; CHECK-NEXT:    .zero 2
+; CHECK-NEXT:  .LCPI4_2:
+; CHECK-NEXT:    .short 1 @ 0x1
+; CHECK-NEXT:    .short 0 @ 0x0
+; CHECK-NEXT:    .short 0 @ 0x0
+; CHECK-NEXT:    .short 0 @ 0x0
+; CHECK-NEXT:  .LCPI4_3:
+; CHECK-NEXT:    .short 9 @ 0x9
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:    .short 10 @ 0xa
+; CHECK-NEXT:  .LCPI4_4:
+; CHECK-NEXT:    .short 341 @ 0x155
+; CHECK-NEXT:    .short 292 @ 0x124
+; CHECK-NEXT:    .short 1 @ 0x1
 ; CHECK-NEXT:    .short 0 @ 0x0
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
   %cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>

diff  --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 534c7121ffcb..637814b756c8 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -83,33 +83,29 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psrld $2, %xmm2
-; SSE2-NEXT:    pmaddwd {{.*}}(%rip), %xmm2
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: p4_vector_urem_by_const__splat:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; SSE4-NEXT:    pmuludq %xmm2, %xmm1
-; SSE4-NEXT:    pmuludq %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT:    psrld $2, %xmm2
-; SSE4-NEXT:    pmaddwd {{.*}}(%rip), %xmm2
-; SSE4-NEXT:    psubd %xmm2, %xmm0
-; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; SSE4-NEXT:    pminud %xmm0, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -117,16 +113,11 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %t0 = and <4 x i32> %x, <i32 128, i32 128, i32 128, i32 128> ; clearly a power-of-two or zero
@@ -140,59 +131,50 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrld $2, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; SSE2-NEXT:    pmaddwd {{.*}}(%rip), %xmm2
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: p5_vector_urem_by_const__nonsplat:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE4-NEXT:    pmuludq %xmm2, %xmm3
-; SSE4-NEXT:    pmuludq %xmm0, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE4-NEXT:    movdqa %xmm1, %xmm2
-; SSE4-NEXT:    psrld $2, %xmm2
-; SSE4-NEXT:    psrld $1, %xmm1
-; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
-; SSE4-NEXT:    pmaddwd {{.*}}(%rip), %xmm1
-; SSE4-NEXT:    psubd %xmm1, %xmm0
-; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE4-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; SSE4-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT:    por %xmm2, %xmm1
+; SSE4-NEXT:    movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588]
+; SSE4-NEXT:    pminud %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX2-LABEL: p5_vector_urem_by_const__nonsplat:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %t0 = and <4 x i32> %x, <i32 128, i32 2, i32 4, i32 8>
@@ -206,39 +188,32 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psrld $2, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    pslld $31, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; SSE4-NEXT:    pmuludq %xmm2, %xmm1
-; SSE4-NEXT:    pmuludq %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT:    psrld $2, %xmm2
-; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; SSE4-NEXT:    psubd %xmm2, %xmm0
-; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    pslld $31, %xmm0
+; SSE4-NEXT:    por %xmm1, %xmm0
+; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; SSE4-NEXT:    pminud %xmm0, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -246,17 +221,13 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %t0 = and <4 x i32> %x, <i32 128, i32 128, i32 undef, i32 128>

diff  --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index abb310d3a518..166bf345d89b 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -10,22 +10,19 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; X86-LABEL: test_srem_odd:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    sarl $3, %eax
-; X86-NEXT:    imull $-1084587701, %eax, %eax # imm = 0xBF5A814B
-; X86-NEXT:    addl $21691754, %eax # imm = 0x14AFD6A
-; X86-NEXT:    cmpl $43383509, %eax # imm = 0x295FAD5
+; X86-NEXT:    imull $526025035, {{[0-9]+}}(%esp), %eax # imm = 0x1F5A814B
+; X86-NEXT:    addl $2711469, %eax # imm = 0x295FAD
+; X86-NEXT:    andl $536870911, %eax # imm = 0x1FFFFFFF
+; X86-NEXT:    cmpl $5422939, %eax # imm = 0x52BF5B
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_srem_odd:
 ; X64:       # %bb.0:
-; X64-NEXT:    shll $3, %edi
-; X64-NEXT:    sarl $3, %edi
-; X64-NEXT:    imull $-1084587701, %edi, %eax # imm = 0xBF5A814B
-; X64-NEXT:    addl $21691754, %eax # imm = 0x14AFD6A
-; X64-NEXT:    cmpl $43383509, %eax # imm = 0x295FAD5
+; X64-NEXT:    imull $526025035, %edi, %eax # imm = 0x1F5A814B
+; X64-NEXT:    addl $2711469, %eax # imm = 0x295FAD
+; X64-NEXT:    andl $536870911, %eax # imm = 0x1FFFFFFF
+; X64-NEXT:    cmpl $5422939, %eax # imm = 0x52BF5B
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %srem = srem i29 %X, 99

diff  --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index d19bd1a52288..1a15175fa7a6 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -9,122 +9,80 @@
 define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,25,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm5
-; CHECK-SSE41-NEXT:    psrad $1, %xmm5
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    paddd %xmm5, %xmm3
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm5
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2,3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -270,112 +228,60 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $1, %xmm1
+; CHECK-SSE2-NEXT:    pslld $31, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_allones_eq:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    psrad $3, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_allones_eq:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_allones_eq:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -396,113 +302,60 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm6, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm6 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    movdqa %xmm6, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm6
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm6
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pandn %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $1, %xmm1
+; CHECK-SSE2-NEXT:    pslld $31, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_allones_ne:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    psrad $3, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_allones_ne:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_allones_ne:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
 ; CHECK-AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -526,135 +379,80 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -675,135 +473,80 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
 ; CHECK-AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -829,103 +572,73 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919]
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    psrad $1, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm4
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    movdqa %xmm4, %xmm2
-; CHECK-SSE41-NEXT:    psrad $3, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm4, %xmm3
-; CHECK-SSE41-NEXT:    psrad $1, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm4
-; CHECK-SSE41-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm4
-; CHECK-SSE41-NEXT:    psubd %xmm4, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm4
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm2, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm4, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm4
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -949,88 +662,73 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
+; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpsrad $3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1054,122 +752,80 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,16,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm5
-; CHECK-SSE41-NEXT:    psrad $1, %xmm5
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    paddd %xmm5, %xmm3
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm5
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1255,101 +911,60 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $1, %xmm1
+; CHECK-SSE2-NEXT:    pslld $31, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1372,129 +987,80 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1603,111 +1169,99 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
+; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    psrad $3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    psrad $30, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    psrld $31, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    psrad $30, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    psrad $3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    paddd %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783]
+; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-SSE41-NEXT:    paddd %xmm3, %xmm2
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT:    por %xmm5, %xmm4
+; CHECK-SSE41-NEXT:    pminud %xmm4, %xmm3
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm4, %xmm3
+; CHECK-SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $30, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm5, %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm2, %xmm4
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1735,135 +1289,103 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $30, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,14,2147483648,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm5, %xmm3
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT:    psrld $31, %xmm1
+; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $30, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    psrad $1, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm4
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm4
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm3, %xmm5
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm3, %xmm2
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
+; CHECK-SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $30, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm4
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm3
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1892,130 +1414,76 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,0,1,3435973837]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $1, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,4294967295,16,5]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm4, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2038,121 +1506,80 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm6, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm5
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,4294967295,16,14]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,0,1,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2175,134 +1602,80 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,0,1,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $1, %xmm3
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,4294967295,16,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $3, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    psrad $1, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2388,120 +1761,60 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_allones_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm6, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    movdqa %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm5
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $1, %xmm1
+; CHECK-SSE2-NEXT:    pslld $31, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_allones_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_allones_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2524,127 +1837,80 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,0,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $1, %xmm3
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,4294967295,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $1, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2669,124 +1935,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $1, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $3, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,16,1,5]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2809,109 +2031,80 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE2-NEXT:    pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm5
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,16,1,14]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    psrad $3, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -2934,129 +2127,80 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT:    psrad $1, %xmm5
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,16,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT:    psrad $3, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    psrad $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -3080,108 +2224,60 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0]
-; CHECK-SSE2-NEXT:    pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    psrlq $32, %xmm2
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrad $3, %xmm3
-; CHECK-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    psrad $1, %xmm4
-; CHECK-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    psrlq $32, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrad $1, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    psrlq $32, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -3203,102 +2299,60 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm4
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm5, %xmm5
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0]
-; CHECK-SSE2-NEXT:    pand %xmm6, %xmm5
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm6
-; CHECK-SSE2-NEXT:    psrlq $32, %xmm6
-; CHECK-SSE2-NEXT:    psubd %xmm5, %xmm6
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm6, %xmm4
-; CHECK-SSE2-NEXT:    psrad $3, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm6
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm6
-; CHECK-SSE2-NEXT:    paddd %xmm4, %xmm6
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,4294967295,16,1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm6
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT:    pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    psrlq $32, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrad $3, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm2
-; CHECK-SSE41-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    psrlq $32, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;

diff  --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index e1e841d921c2..e8c0b947baa1 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -71,87 +71,60 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_100:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm3
-; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    pand %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $31, %xmm1
-; CHECK-SSE2-NEXT:    psrad $5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    pslld $30, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_100:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE41-NEXT:    pmuldq %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $2, %xmm1
+; CHECK-SSE41-NEXT:    pslld $30, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_100:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_100:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
-; CHECK-AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -239,87 +212,60 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_neg100:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535]
-; CHECK-SSE2-NEXT:    pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT:    pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    psrad $5, %xmm2
-; CHECK-SSE2-NEXT:    paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    pslld $30, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_srem_even_neg100:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u>
-; CHECK-SSE41-NEXT:    pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    psrad $5, %xmm2
-; CHECK-SSE41-NEXT:    paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $2, %xmm1
+; CHECK-SSE41-NEXT:    pslld $30, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_srem_even_neg100:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_srem_even_neg100:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761]
-; CHECK-AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT:    vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344]
+; CHECK-AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index aad3157b9ebd..073161189f97 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -10,20 +10,17 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; X86-LABEL: test_urem_odd:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $3277, {{[0-9]+}}(%esp), %eax # imm = 0xCCD
 ; X86-NEXT:    andl $8191, %eax # imm = 0x1FFF
-; X86-NEXT:    imull $-13107, %eax, %eax # imm = 0xCCCD
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    cmpl $13108, %eax # imm = 0x3334
+; X86-NEXT:    cmpl $1639, %eax # imm = 0x667
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_urem_odd:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $8191, %edi # imm = 0x1FFF
-; X64-NEXT:    imull $-13107, %edi, %eax # imm = 0xCCCD
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    cmpl $13108, %eax # imm = 0x3334
+; X64-NEXT:    imull $3277, %edi, %eax # imm = 0xCCD
+; X64-NEXT:    andl $8191, %eax # imm = 0x1FFF
+; X64-NEXT:    cmpl $1639, %eax # imm = 0x667
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %urem = urem i13 %X, 5
@@ -34,20 +31,27 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; X86-LABEL: test_urem_even:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $134217727, %eax # imm = 0x7FFFFFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $-1227133513, %eax, %eax # imm = 0xB6DB6DB7
-; X86-NEXT:    rorl %eax
-; X86-NEXT:    cmpl $306783379, %eax # imm = 0x12492493
+; X86-NEXT:    imull $115043767, {{[0-9]+}}(%esp), %eax # imm = 0x6DB6DB7
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $26, %ecx
+; X86-NEXT:    andl $134217726, %eax # imm = 0x7FFFFFE
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF
+; X86-NEXT:    cmpl $9586981, %eax # imm = 0x924925
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_urem_even:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $134217727, %edi # imm = 0x7FFFFFF
-; X64-NEXT:    imull $-1227133513, %edi, %eax # imm = 0xB6DB6DB7
-; X64-NEXT:    rorl %eax
-; X64-NEXT:    cmpl $306783379, %eax # imm = 0x12492493
+; X64-NEXT:    imull $115043767, %edi, %eax # imm = 0x6DB6DB7
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shll $26, %ecx
+; X64-NEXT:    andl $134217726, %eax # imm = 0x7FFFFFE
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF
+; X64-NEXT:    cmpl $9586981, %eax # imm = 0x924925
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %urem = urem i27 %X, 14
@@ -58,20 +62,21 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; X86-LABEL: test_urem_odd_setne:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    andb $15, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    imull $-51, %eax, %eax
-; X86-NEXT:    cmpb $51, %al
+; X86-NEXT:    cmpb $3, %al
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_urem_odd_setne:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $15, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $-51, %eax, %eax
-; X64-NEXT:    cmpb $51, %al
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    andb $15, %al
+; X64-NEXT:    cmpb $3, %al
 ; X64-NEXT:    seta %al
 ; X64-NEXT:    retq
   %urem = urem i4 %X, 5
@@ -82,20 +87,17 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; X86-LABEL: test_urem_negative_odd:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $307, {{[0-9]+}}(%esp), %eax # imm = 0x133
 ; X86-NEXT:    andl $511, %eax # imm = 0x1FF
-; X86-NEXT:    imull $-7885, %eax, %eax # imm = 0xE133
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    cmpl $129, %eax
+; X86-NEXT:    cmpw $1, %ax
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_urem_negative_odd:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $511, %edi # imm = 0x1FF
-; X64-NEXT:    imull $-7885, %edi, %eax # imm = 0xE133
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    cmpl $129, %eax
+; X64-NEXT:    imull $307, %edi, %eax # imm = 0x133
+; X64-NEXT:    andl $511, %eax # imm = 0x1FF
+; X64-NEXT:    cmpw $1, %ax
 ; X64-NEXT:    seta %al
 ; X64-NEXT:    retq
   %urem = urem i9 %X, -5
@@ -106,67 +108,55 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; X86-LABEL: test_urem_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $2047, %edx # imm = 0x7FF
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $683, {{[0-9]+}}(%esp), %eax # imm = 0x2AB
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $10, %ecx
+; X86-NEXT:    andl $2046, %eax # imm = 0x7FE
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    andl $2047, %eax # imm = 0x7FF
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $2047, %ecx # imm = 0x7FF
-; X86-NEXT:    imull $-5325, %ecx, %ecx # imm = 0xEB33
-; X86-NEXT:    addl $10650, %ecx # imm = 0x299A
-; X86-NEXT:    cmpw $32, %cx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    imull $-21845, %eax, %eax # imm = 0xAAAB
-; X86-NEXT:    rorw %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    cmpl $10922, %eax # imm = 0x2AAA
+; X86-NEXT:    cmpl $341, %eax # imm = 0x155
 ; X86-NEXT:    seta %al
-; X86-NEXT:    imull $28087, %edx, %edx # imm = 0x6DB7
-; X86-NEXT:    addl $-28087, %edx # imm = 0x9249
-; X86-NEXT:    movzwl %dx, %edx
-; X86-NEXT:    cmpl $9362, %edx # imm = 0x2492
+; X86-NEXT:    imull $1463, {{[0-9]+}}(%esp), %ecx # imm = 0x5B7
+; X86-NEXT:    addl $-1463, %ecx # imm = 0xFA49
+; X86-NEXT:    andl $2047, %ecx # imm = 0x7FF
+; X86-NEXT:    cmpl $292, %ecx # imm = 0x124
 ; X86-NEXT:    seta %dl
+; X86-NEXT:    imull $819, {{[0-9]+}}(%esp), %ecx # imm = 0x333
+; X86-NEXT:    addl $-1638, %ecx # imm = 0xF99A
+; X86-NEXT:    andl $2047, %ecx # imm = 0x7FF
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    seta %cl
 ; X86-NEXT:    retl
 ;
 ; SSE2-LABEL: test_urem_vec:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl %esi, %eax
-; SSE2-NEXT:    andl $2047, %eax # imm = 0x7FF
-; SSE2-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
-; SSE2-NEXT:    shrl $16, %ecx
-; SSE2-NEXT:    subl %ecx, %eax
-; SSE2-NEXT:    movzwl %ax, %eax
-; SSE2-NEXT:    shrl %eax
-; SSE2-NEXT:    addl %ecx, %eax
-; SSE2-NEXT:    shrl $2, %eax
-; SSE2-NEXT:    leal (,%rax,8), %ecx
-; SSE2-NEXT:    subl %ecx, %eax
-; SSE2-NEXT:    addl %esi, %eax
-; SSE2-NEXT:    andl $2047, %edi # imm = 0x7FF
-; SSE2-NEXT:    imull $43691, %edi, %ecx # imm = 0xAAAB
-; SSE2-NEXT:    shrl $17, %ecx
-; SSE2-NEXT:    andl $-2, %ecx
-; SSE2-NEXT:    leal (%rcx,%rcx,2), %ecx
-; SSE2-NEXT:    subl %ecx, %edi
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE2-NEXT:    movl %edx, %eax
-; SSE2-NEXT:    andl $2047, %eax # imm = 0x7FF
-; SSE2-NEXT:    imull $161, %eax, %ecx
-; SSE2-NEXT:    shrl $16, %ecx
-; SSE2-NEXT:    subl %ecx, %eax
-; SSE2-NEXT:    movzwl %ax, %eax
-; SSE2-NEXT:    shrl %eax
-; SSE2-NEXT:    addl %ecx, %eax
-; SSE2-NEXT:    shrl $10, %eax
-; SSE2-NEXT:    imull $2043, %eax, %eax # imm = 0x7FB
-; SSE2-NEXT:    subl %eax, %edx
-; SSE2-NEXT:    pinsrw $4, %edx, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    psubd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = <683,1463,819,u>
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2047,2047,2047,2047]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
+; SSE2-NEXT:    pslld $10, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    orps %xmm3, %xmm2
+; SSE2-NEXT:    andps %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %dl
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
@@ -174,45 +164,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ;
 ; SSE41-LABEL: test_urem_vec:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movl %esi, %eax
-; SSE41-NEXT:    andl $2047, %eax # imm = 0x7FF
-; SSE41-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
-; SSE41-NEXT:    shrl $16, %ecx
-; SSE41-NEXT:    subl %ecx, %eax
-; SSE41-NEXT:    movzwl %ax, %eax
-; SSE41-NEXT:    shrl %eax
-; SSE41-NEXT:    addl %ecx, %eax
-; SSE41-NEXT:    shrl $2, %eax
-; SSE41-NEXT:    leal (,%rax,8), %ecx
-; SSE41-NEXT:    subl %ecx, %eax
-; SSE41-NEXT:    addl %esi, %eax
-; SSE41-NEXT:    andl $2047, %edi # imm = 0x7FF
-; SSE41-NEXT:    imull $43691, %edi, %ecx # imm = 0xAAAB
-; SSE41-NEXT:    shrl $17, %ecx
-; SSE41-NEXT:    andl $-2, %ecx
-; SSE41-NEXT:    leal (%rcx,%rcx,2), %ecx
-; SSE41-NEXT:    subl %ecx, %edi
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE41-NEXT:    movl %edx, %eax
-; SSE41-NEXT:    andl $2047, %eax # imm = 0x7FF
-; SSE41-NEXT:    imull $161, %eax, %ecx
-; SSE41-NEXT:    shrl $16, %ecx
-; SSE41-NEXT:    subl %ecx, %eax
-; SSE41-NEXT:    movzwl %ax, %eax
-; SSE41-NEXT:    shrl %eax
-; SSE41-NEXT:    addl %ecx, %eax
-; SSE41-NEXT:    shrl $10, %eax
-; SSE41-NEXT:    imull $2043, %eax, %eax # imm = 0x7FB
-; SSE41-NEXT:    subl %eax, %edx
-; SSE41-NEXT:    pinsrw $4, %edx, %xmm0
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    pextrb $4, %xmm1, %edx
-; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
+; SSE41-NEXT:    pinsrd $1, %esi, %xmm0
+; SSE41-NEXT:    pinsrd $2, %edx, %xmm0
+; SSE41-NEXT:    psubd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    psrld $1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    pslld $10, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT:    por %xmm2, %xmm3
+; SSE41-NEXT:    pand %xmm1, %xmm3
+; SSE41-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; SSE41-NEXT:    movd %xmm3, %eax
+; SSE41-NEXT:    pextrb $4, %xmm3, %edx
+; SSE41-NEXT:    pextrb $8, %xmm3, %ecx
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE41-NEXT:    # kill: def $dl killed $dl killed $edx
 ; SSE41-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -220,42 +190,21 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ;
 ; AVX1-LABEL: test_urem_vec:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movl %esi, %eax
-; AVX1-NEXT:    andl $2047, %eax # imm = 0x7FF
-; AVX1-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
-; AVX1-NEXT:    shrl $16, %ecx
-; AVX1-NEXT:    subl %ecx, %eax
-; AVX1-NEXT:    movzwl %ax, %eax
-; AVX1-NEXT:    shrl %eax
-; AVX1-NEXT:    addl %ecx, %eax
-; AVX1-NEXT:    shrl $2, %eax
-; AVX1-NEXT:    leal (,%rax,8), %ecx
-; AVX1-NEXT:    subl %ecx, %eax
-; AVX1-NEXT:    addl %esi, %eax
-; AVX1-NEXT:    andl $2047, %edi # imm = 0x7FF
-; AVX1-NEXT:    imull $43691, %edi, %ecx # imm = 0xAAAB
-; AVX1-NEXT:    shrl $17, %ecx
-; AVX1-NEXT:    andl $-2, %ecx
-; AVX1-NEXT:    leal (%rcx,%rcx,2), %ecx
-; AVX1-NEXT:    subl %ecx, %edi
 ; AVX1-NEXT:    vmovd %edi, %xmm0
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    movl %edx, %eax
-; AVX1-NEXT:    andl $2047, %eax # imm = 0x7FF
-; AVX1-NEXT:    imull $161, %eax, %ecx
-; AVX1-NEXT:    shrl $16, %ecx
-; AVX1-NEXT:    subl %ecx, %eax
-; AVX1-NEXT:    movzwl %ax, %eax
-; AVX1-NEXT:    shrl %eax
-; AVX1-NEXT:    addl %ecx, %eax
-; AVX1-NEXT:    shrl $10, %eax
-; AVX1-NEXT:    imull $2043, %eax, %eax # imm = 0x7FB
-; AVX1-NEXT:    subl %eax, %edx
-; AVX1-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpslld $10, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
@@ -266,43 +215,18 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ;
 ; AVX2-LABEL: test_urem_vec:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    andl $2047, %esi # imm = 0x7FF
-; AVX2-NEXT:    imull $9363, %esi, %eax # imm = 0x2493
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    subl %eax, %ecx
-; AVX2-NEXT:    movzwl %cx, %ecx
-; AVX2-NEXT:    shrl %ecx
-; AVX2-NEXT:    addl %eax, %ecx
-; AVX2-NEXT:    shrl $2, %ecx
-; AVX2-NEXT:    leal (,%rcx,8), %eax
-; AVX2-NEXT:    subl %eax, %ecx
-; AVX2-NEXT:    addl %esi, %ecx
-; AVX2-NEXT:    andl $2047, %edi # imm = 0x7FF
-; AVX2-NEXT:    imull $43691, %edi, %eax # imm = 0xAAAB
-; AVX2-NEXT:    shrl $17, %eax
-; AVX2-NEXT:    andl $-2, %eax
-; AVX2-NEXT:    leal (%rax,%rax,2), %eax
-; AVX2-NEXT:    subl %eax, %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT:    andl $2047, %edx # imm = 0x7FF
-; AVX2-NEXT:    imull $161, %edx, %eax
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    subl %eax, %ecx
-; AVX2-NEXT:    movzwl %cx, %ecx
-; AVX2-NEXT:    shrl %ecx
-; AVX2-NEXT:    addl %eax, %ecx
-; AVX2-NEXT:    shrl $10, %ecx
-; AVX2-NEXT:    imull $2043, %ecx, %eax # imm = 0x7FB
-; AVX2-NEXT:    subl %eax, %edx
-; AVX2-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
@@ -313,40 +237,17 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ;
 ; AVX512VL-LABEL: test_urem_vec:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    andl $2047, %esi # imm = 0x7FF
-; AVX512VL-NEXT:    imull $9363, %esi, %eax # imm = 0x2493
-; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    subl %eax, %ecx
-; AVX512VL-NEXT:    movzwl %cx, %ecx
-; AVX512VL-NEXT:    shrl %ecx
-; AVX512VL-NEXT:    addl %eax, %ecx
-; AVX512VL-NEXT:    shrl $2, %ecx
-; AVX512VL-NEXT:    leal (,%rcx,8), %eax
-; AVX512VL-NEXT:    subl %eax, %ecx
-; AVX512VL-NEXT:    addl %esi, %ecx
-; AVX512VL-NEXT:    andl $2047, %edi # imm = 0x7FF
-; AVX512VL-NEXT:    imull $43691, %edi, %eax # imm = 0xAAAB
-; AVX512VL-NEXT:    shrl $17, %eax
-; AVX512VL-NEXT:    andl $-2, %eax
-; AVX512VL-NEXT:    leal (%rax,%rax,2), %eax
-; AVX512VL-NEXT:    subl %eax, %edi
 ; AVX512VL-NEXT:    vmovd %edi, %xmm0
-; AVX512VL-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT:    andl $2047, %edx # imm = 0x7FF
-; AVX512VL-NEXT:    imull $161, %edx, %eax
-; AVX512VL-NEXT:    shrl $16, %eax
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    subl %eax, %ecx
-; AVX512VL-NEXT:    movzwl %cx, %ecx
-; AVX512VL-NEXT:    shrl %ecx
-; AVX512VL-NEXT:    addl %eax, %ecx
-; AVX512VL-NEXT:    shrl $10, %ecx
-; AVX512VL-NEXT:    imull $2043, %ecx, %eax # imm = 0x7FB
-; AVX512VL-NEXT:    subl %eax, %edx
-; AVX512VL-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpcmpneqd {{.*}}(%rip), %xmm0, %k0
+; AVX512VL-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpternlogd $200, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpcmpnleud {{.*}}(%rip), %xmm0, %k0
 ; AVX512VL-NEXT:    kshiftrw $1, %k0, %k1
 ; AVX512VL-NEXT:    kmovw %k1, %edx
 ; AVX512VL-NEXT:    kshiftrw $2, %k0, %k1

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
index 5099b5605d70..671d7c21013d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -295,17 +295,19 @@ define i1 @t8_3_2(i8 %X) nounwind {
 define i1 @t64_3_2(i64 %X) nounwind {
 ; X86-LABEL: t64_3_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __umoddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    xorl $2, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    adcl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT:    cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    sbbl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t64_3_2:

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 091a351239d8..8f9c45ae3b27 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -9,98 +9,71 @@
 define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $3, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,14,25,100]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    psrld $3, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrld $3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -197,84 +170,64 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_eq:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_allones_eq:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_allones_eq:
 ; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -295,87 +248,66 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_ne:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379]
+; CHECK-SSE41-NEXT:    pmaxud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_allones_ne:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_allones_ne:
 ; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; CHECK-AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne:
@@ -396,98 +328,71 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -508,101 +413,73 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $31, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    psrld $31, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673]
+; CHECK-SSE41-NEXT:    pmaxud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpmaxud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; CHECK-AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne:
@@ -625,73 +502,64 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u>
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u>
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -714,81 +582,64 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -811,95 +662,71 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $2, %xmm3
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,14,16,100]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -979,81 +806,54 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_one:
 ; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pslld $31, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_one:
 ; CHECK-SSE41:       # %bb.0:
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_one:
 ; CHECK-AVX1:       # %bb.0:
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1076,93 +876,71 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [5,14,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1187,73 +965,64 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <3435973837,u,2,u>
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <3435973837,u,2,u>
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1276,81 +1045,64 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_INT_MIN:
 ; CHECK-AVX2:       # %bb.0:
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1373,95 +1125,71 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    psrld $2, %xmm3
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,14,2147483648,100]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1486,89 +1214,66 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3435973837]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,4294967295,16,5]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1591,98 +1296,71 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [14,4294967295,16,14]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,1,268435456,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    psrld $31, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1705,93 +1383,71 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,268435456,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [5,4294967295,16,100]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    psrld $5, %xmm4
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm2
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1855,95 +1511,71 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $31, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,4294967295,0,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,1,1,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $31, %xmm4
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm4
-; CHECK-SSE41-NEXT:    psubd %xmm4, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_allones_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_allones_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1966,88 +1598,71 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    psrld $5, %xmm2
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [5,4294967295,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,4294967295,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm2
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -2072,83 +1687,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,268435456,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [5,16,1,5]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -2171,92 +1774,71 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    psrld $1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [14,16,1,14]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,268435456,1,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT:    pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -2279,85 +1861,71 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    psrld $5, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [5,16,1,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,268435456,1,1073741824]
+; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT:    psrld $5, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm2
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -2381,90 +1949,66 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,0]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $2, %xmm1
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [5,4294967295,16,1]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -2486,97 +2030,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    psrld $1, %xmm4
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [14,4294967295,16,1]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $31, %xmm3
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,0]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    por %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT:    psrld $1, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm1
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    psrld $31, %xmm3
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT:    por %xmm2, %xmm1
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295]
+; CHECK-SSE41-NEXT:    pminud %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index 0d2c8062aa8b..f98f7428164f 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -119,67 +119,59 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
 define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t32_6_part0:
 ; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    psubd {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    psrld $1, %xmm0
+; CHECK-SSE2-NEXT:    pslld $31, %xmm3
+; CHECK-SSE2-NEXT:    por %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: t32_6_part0:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: t32_6_part0:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: t32_6_part0:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; CHECK-AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; CHECK-AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512VL-LABEL: t32_6_part0:
@@ -198,67 +190,58 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
 define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t32_6_part1:
 ; CHECK-SSE2:       # %bb.0:
+; CHECK-SSE2-NEXT:    psubd {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; CHECK-SSE2-NEXT:    psrld $1, %xmm0
+; CHECK-SSE2-NEXT:    pslld $31, %xmm3
+; CHECK-SSE2-NEXT:    por %xmm0, %xmm3
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: t32_6_part1:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $1, %xmm1
+; CHECK-SSE41-NEXT:    pslld $31, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: t32_6_part1:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: t32_6_part1:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; CHECK-AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512VL-LABEL: t32_6_part1:
@@ -277,71 +260,49 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
 define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: t32_tautological:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT:    psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    psrld $1, %xmm3
-; CHECK-SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,2,3]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: t32_tautological:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT:    psrld $1, %xmm3
-; CHECK-SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    pxor %xmm0, %xmm0
+; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: t32_tautological:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: t32_tautological:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512VL-LABEL: t32_tautological:

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 1eea18758907..ef7a07092b77 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -65,73 +65,55 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_100:
 ; CHECK-SSE2:       # %bb.0:
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    pslld $30, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_100:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    psrld $5, %xmm2
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $2, %xmm1
+; CHECK-SSE41-NEXT:    pslld $30, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_100:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_100:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
-; CHECK-AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -196,74 +178,51 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_neg100:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    psrld $5, %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    psrld $2, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    psrld $27, %xmm2
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrld $31, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT:    psrld $2, %xmm1
+; CHECK-SSE2-NEXT:    pslld $30, %xmm0
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pandn {{.*}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
 ;
 ; CHECK-SSE41-LABEL: test_urem_even_neg100:
 ; CHECK-SSE41:       # %bb.0:
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT:    psrld $5, %xmm1
-; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT:    psrld $2, %xmm2
-; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    psrld $27, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT:    pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    psrld $2, %xmm1
+; CHECK-SSE41-NEXT:    pslld $30, %xmm0
+; CHECK-SSE41-NEXT:    por %xmm1, %xmm0
+; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672]
+; CHECK-SSE41-NEXT:    pminud %xmm0, %xmm1
 ; CHECK-SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE41-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE41-NEXT:    retq
 ;
 ; CHECK-AVX1-LABEL: test_urem_even_neg100:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpsrld $27, %xmm2, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_urem_even_neg100:
 ; CHECK-AVX2:       # %bb.0:
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925]
-; CHECK-AVX2-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT:    vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vpminud {{.*}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:    retq


        


More information about the llvm-commits mailing list