[llvm] 777a58e - Support {S,U}REMEqFold before legalization
Simonas Kazlauskas via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 31 15:35:49 PDT 2021
Author: Simonas Kazlauskas
Date: 2021-04-01T01:35:41+03:00
New Revision: 777a58e05b22973d902e78091a2e06b99c71b65c
URL: https://github.com/llvm/llvm-project/commit/777a58e05b22973d902e78091a2e06b99c71b65c
DIFF: https://github.com/llvm/llvm-project/commit/777a58e05b22973d902e78091a2e06b99c71b65c.diff
LOG: Support {S,U}REMEqFold before legalization
This allows these optimisations to apply to e.g. `urem i16` directly,
before `urem` is promoted to i32 on architectures where i16 operations
are not intrinsically legal (such as on AArch64). Legalization can then
happen more directly, and the generated code gets a chance to avoid
wasting time computing results in types wider than necessary.
This seems to be mostly an improvement in terms of results, at least as far as x86_64 and AArch64 are concerned, with a few regressions here and there. It also helps prevent regressions in changes like D87976.
Reviewed By: lebedev.ri
Differential Revision: https://reviews.llvm.org/D88785
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
llvm/test/CodeGen/AArch64/srem-seteq.ll
llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
llvm/test/CodeGen/AArch64/urem-seteq.ll
llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 04067ffd2303..4f14c7e72409 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5453,7 +5453,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
EVT ShSVT = ShVT.getScalarType();
// If MUL is unavailable, we cannot proceed in any case.
- if (!isOperationLegalOrCustom(ISD::MUL, VT))
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
return SDValue();
bool ComparingWithAllZeros = true;
@@ -5583,7 +5583,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
}
if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
- if (!isOperationLegalOrCustom(ISD::SUB, VT))
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::SUB, VT))
return SDValue(); // FIXME: Could/should use `ISD::ADD`?
assert(CompTargetNode.getValueType() == N.getValueType() &&
"Expecting that the types on LHS and RHS of comparisons match.");
@@ -5598,7 +5598,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
// divisors as a performance improvement, since rotating by 0 is a no-op.
if (HadEvenDivisor) {
// We need ROTR to do this.
- if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
return SDValue();
SDNodeFlags Flags;
Flags.setExact(true);
@@ -5628,6 +5628,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
DAG.getSetCC(DL, SETCCVT, D, CompTargetNode, ISD::SETULE);
Created.push_back(TautologicalInvertedChannels.getNode());
+ // NOTE: we avoid letting illegal types through even if we're before legalize
+ // ops – legalization has a hard time producing good code for this.
if (isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) {
// If we have a vector select, let's replace the comparison results in the
// affected lanes with the correct tautological result.
@@ -5638,6 +5640,8 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
}
// Else, we can just invert the comparison result in the appropriate lanes.
+ //
+ // NOTE: see the note above VSELECT above.
if (isOperationLegalOrCustom(ISD::XOR, SETCCVT))
return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC,
TautologicalInvertedChannels);
@@ -5692,8 +5696,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
EVT ShSVT = ShVT.getScalarType();
- // If MUL is unavailable, we cannot proceed in any case.
- if (!isOperationLegalOrCustom(ISD::MUL, VT))
+ // If we are after ops legalization, and MUL is unavailable, we can not
+ // proceed.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
return SDValue();
// TODO: Could support comparing with non-zero too.
@@ -5848,7 +5853,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
if (NeedToApplyOffset) {
// We need ADD to do this.
- if (!isOperationLegalOrCustom(ISD::ADD, VT))
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ADD, VT))
return SDValue();
// (add (mul N, P), A)
@@ -5860,7 +5865,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// divisors as a performance improvement, since rotating by 0 is a no-op.
if (HadEvenDivisor) {
// We need ROTR to do this.
- if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
return SDValue();
SDNodeFlags Flags;
Flags.setExact(true);
@@ -5883,6 +5888,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// we must fix-up results for said lanes.
assert(VT.isVector() && "Can/should only get here for vectors.");
+ // NOTE: we avoid letting illegal types through even if we're before legalize
+ // ops – legalization has a hard time producing good code for the code that
+ // follows.
if (!isOperationLegalOrCustom(ISD::SETEQ, VT) ||
!isOperationLegalOrCustom(ISD::AND, VT) ||
!isOperationLegalOrCustom(Cond, VT) ||
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 80b8c2ade9b3..f4c86c082332 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -4,14 +4,14 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-LABEL: test_srem_odd:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #33099
-; CHECK-NEXT: mov w10, #64874
-; CHECK-NEXT: sbfx w8, w0, #0, #29
-; CHECK-NEXT: movk w9, #48986, lsl #16
-; CHECK-NEXT: movk w10, #330, lsl #16
-; CHECK-NEXT: madd w8, w8, w9, w10
-; CHECK-NEXT: mov w9, #64213
-; CHECK-NEXT: movk w9, #661, lsl #16
+; CHECK-NEXT: mov w8, #33099
+; CHECK-NEXT: mov w9, #24493
+; CHECK-NEXT: movk w8, #8026, lsl #16
+; CHECK-NEXT: movk w9, #41, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #48987
+; CHECK-NEXT: and w8, w8, #0x1fffffff
+; CHECK-NEXT: movk w9, #82, lsl #16
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
index 3fb4fa03d421..cc6cf4064f59 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -9,20 +9,18 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: adrp x8, .LCPI0_3
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: adrp x8, .LCPI0_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: adrp x8, .LCPI0_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -82,27 +80,19 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_allones_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: adrp x8, .LCPI3_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: adrp x8, .LCPI3_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #9362
+; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: adrp x10, .LCPI3_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI3_0]
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v0.4s, v2.4s, #31
+; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -114,29 +104,21 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_allones_ne:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI4_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: adrp x8, .LCPI4_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT: adrp x8, .LCPI4_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: adrp x8, .LCPI4_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: adrp x8, .LCPI4_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #9362
+; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: adrp x10, .LCPI4_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI4_0]
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v0.4s, v2.4s, #31
+; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v3.4s
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -152,23 +134,18 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: adrp x8, .LCPI5_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
-; CHECK-NEXT: adrp x8, .LCPI5_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
; CHECK-NEXT: adrp x8, .LCPI5_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT: adrp x8, .LCPI5_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2]
; CHECK-NEXT: adrp x8, .LCPI5_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -184,25 +161,20 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
; CHECK-NEXT: adrp x8, .LCPI6_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1]
-; CHECK-NEXT: adrp x8, .LCPI6_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2]
; CHECK-NEXT: adrp x8, .LCPI6_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT: adrp x8, .LCPI6_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2]
; CHECK-NEXT: adrp x8, .LCPI6_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -220,20 +192,18 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
; CHECK-NEXT: adrp x8, .LCPI7_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1]
-; CHECK-NEXT: adrp x8, .LCPI7_2
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2]
; CHECK-NEXT: adrp x8, .LCPI7_3
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT: adrp x8, .LCPI7_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2]
+; CHECK-NEXT: adrp x8, .LCPI7_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -251,14 +221,18 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: adrp x8, .LCPI8_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1]
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT: sshr v3.4s, v1.4s, #3
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: adrp x8, .LCPI8_3
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_3]
+; CHECK-NEXT: adrp x8, .LCPI8_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2]
+; CHECK-NEXT: adrp x8, .LCPI8_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -276,20 +250,18 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
; CHECK-NEXT: adrp x8, .LCPI9_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
-; CHECK-NEXT: adrp x8, .LCPI9_2
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
; CHECK-NEXT: adrp x8, .LCPI9_3
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT: adrp x8, .LCPI9_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2]
+; CHECK-NEXT: adrp x8, .LCPI9_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -328,25 +300,19 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI11_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT: adrp x8, .LCPI11_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1]
-; CHECK-NEXT: adrp x8, .LCPI11_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2]
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: adrp x8, .LCPI11_3
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #9362
+; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: adrp x10, .LCPI11_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI11_0]
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v0.4s, v2.4s, #31
+; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -364,24 +330,18 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: adrp x8, .LCPI12_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
-; CHECK-NEXT: adrp x8, .LCPI12_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2]
; CHECK-NEXT: adrp x8, .LCPI12_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: adrp x8, .LCPI12_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2]
; CHECK-NEXT: adrp x8, .LCPI12_4
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -493,23 +453,18 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: adrp x8, .LCPI16_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
-; CHECK-NEXT: adrp x8, .LCPI16_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2]
; CHECK-NEXT: adrp x8, .LCPI16_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: adrp x8, .LCPI16_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2]
; CHECK-NEXT: adrp x8, .LCPI16_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -527,23 +482,18 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: adrp x8, .LCPI17_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1]
-; CHECK-NEXT: adrp x8, .LCPI17_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2]
; CHECK-NEXT: adrp x8, .LCPI17_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT: adrp x8, .LCPI17_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2]
; CHECK-NEXT: adrp x8, .LCPI17_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -561,23 +511,18 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: adrp x8, .LCPI18_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1]
-; CHECK-NEXT: adrp x8, .LCPI18_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2]
; CHECK-NEXT: adrp x8, .LCPI18_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: adrp x8, .LCPI18_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2]
; CHECK-NEXT: adrp x8, .LCPI18_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -616,27 +561,19 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_allones_and_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI20_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT: adrp x8, .LCPI20_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1]
-; CHECK-NEXT: adrp x8, .LCPI20_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT: adrp x8, .LCPI20_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3]
-; CHECK-NEXT: adrp x8, .LCPI20_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #9362
+; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: adrp x10, .LCPI20_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_0]
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v0.4s, v2.4s, #31
+; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -654,23 +591,18 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
; CHECK-NEXT: adrp x8, .LCPI21_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1]
-; CHECK-NEXT: adrp x8, .LCPI21_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2]
; CHECK-NEXT: adrp x8, .LCPI21_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: adrp x8, .LCPI21_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2]
; CHECK-NEXT: adrp x8, .LCPI21_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -690,24 +622,18 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
; CHECK-NEXT: adrp x8, .LCPI22_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1]
-; CHECK-NEXT: adrp x8, .LCPI22_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2]
; CHECK-NEXT: adrp x8, .LCPI22_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: adrp x8, .LCPI22_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2]
; CHECK-NEXT: adrp x8, .LCPI22_4
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -725,21 +651,18 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
; CHECK-NEXT: adrp x8, .LCPI23_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
-; CHECK-NEXT: adrp x8, .LCPI23_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: adrp x8, .LCPI23_3
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
-; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT: adrp x8, .LCPI23_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT: adrp x8, .LCPI23_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -757,24 +680,18 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: adrp x8, .LCPI24_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1]
-; CHECK-NEXT: adrp x8, .LCPI24_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2]
; CHECK-NEXT: adrp x8, .LCPI24_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT: adrp x8, .LCPI24_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2]
; CHECK-NEXT: adrp x8, .LCPI24_4
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -793,22 +710,18 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0]
; CHECK-NEXT: adrp x8, .LCPI25_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1]
-; CHECK-NEXT: adrp x8, .LCPI25_2
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_2]
; CHECK-NEXT: adrp x8, .LCPI25_3
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT: neg v4.4s, v4.4s
-; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
-; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT: adrp x8, .LCPI25_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2]
+; CHECK-NEXT: adrp x8, .LCPI25_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -825,22 +738,18 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0]
; CHECK-NEXT: adrp x8, .LCPI26_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1]
-; CHECK-NEXT: adrp x8, .LCPI26_2
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_2]
; CHECK-NEXT: adrp x8, .LCPI26_3
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT: neg v4.4s, v4.4s
-; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
-; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_3]
+; CHECK-NEXT: adrp x8, .LCPI26_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2]
+; CHECK-NEXT: adrp x8, .LCPI26_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index 328815b60138..34f7c6db5c39 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -29,17 +29,20 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_100:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079
-; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: sshr v3.4s, v2.4s, #5
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: usra v3.4s, v2.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #23593
+; CHECK-NEXT: mov w9, #47184
+; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: movk w9, #1310, lsl #16
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mov w10, #23592
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: movk w10, #655, lsl #16
+; CHECK-NEXT: shl v0.4s, v2.4s, #30
+; CHECK-NEXT: ushr v1.4s, v2.4s, #2
+; CHECK-NEXT: dup v3.4s, w10
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -79,17 +82,20 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_even_neg100:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sshr v3.4s, v1.4s, #5
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #23593
+; CHECK-NEXT: mov w9, #47184
+; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: movk w9, #1310, lsl #16
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mov w10, #23592
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: movk w10, #655, lsl #16
+; CHECK-NEXT: shl v0.4s, v2.4s, #30
+; CHECK-NEXT: ushr v1.4s, v2.4s, #2
+; CHECK-NEXT: dup v3.4s, w10
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll
index 943b138124b5..65ffb6db414e 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll
@@ -83,15 +83,13 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind {
define i16 @test_srem_even(i16 %X) nounwind {
; CHECK-LABEL: test_srem_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: mov w9, #18725
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: asr w9, w8, #18
-; CHECK-NEXT: add w8, w9, w8, lsr #31
-; CHECK-NEXT: mov w9, #14
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: tst w8, #0xffff
-; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #4680
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: lsl w10, w8, #15
+; CHECK-NEXT: bfxil w10, w8, #1, #15
+; CHECK-NEXT: cmp w9, w10, uxth
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%srem = srem i16 %X, 14
%cmp = icmp ne i16 %srem, 0
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
index b626ce2f598a..a7642088fa26 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
@@ -4,13 +4,10 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-LABEL: test_urem_odd:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #52429
-; CHECK-NEXT: and w8, w0, #0x1fff
-; CHECK-NEXT: movk w9, #52428, lsl #16
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: mov w9, #13108
-; CHECK-NEXT: movk w9, #13107, lsl #16
-; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w8, #3277
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: and w8, w8, #0x1fff
+; CHECK-NEXT: cmp w8, #1639 // =1639
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i13 %X, 5
@@ -21,13 +18,14 @@ define i1 @test_urem_odd(i13 %X) nounwind {
define i1 @test_urem_even(i27 %X) nounwind {
; CHECK-LABEL: test_urem_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #28087
-; CHECK-NEXT: and w8, w0, #0x7ffffff
-; CHECK-NEXT: movk w9, #46811, lsl #16
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: mov w9, #9363
-; CHECK-NEXT: ror w8, w8, #1
-; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: movk w8, #1755, lsl #16
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: lsl w9, w8, #26
+; CHECK-NEXT: bfxil w9, w8, #1, #26
+; CHECK-NEXT: and w8, w9, #0x7ffffff
+; CHECK-NEXT: mov w9, #18725
+; CHECK-NEXT: movk w9, #146, lsl #16
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
@@ -39,12 +37,10 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-LABEL: test_urem_odd_setne:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #52429
-; CHECK-NEXT: and w8, w0, #0xf
-; CHECK-NEXT: movk w9, #52428, lsl #16
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: mov w9, #858993459
-; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w8, #13
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: cmp w8, #3 // =3
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%urem = urem i4 %X, 5
@@ -55,13 +51,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-LABEL: test_urem_negative_odd:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #57651
-; CHECK-NEXT: and w8, w0, #0x1ff
-; CHECK-NEXT: movk w9, #43302, lsl #16
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: mov w9, #17191
-; CHECK-NEXT: movk w9, #129, lsl #16
-; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w8, #307
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: and w8, w8, #0x1ff
+; CHECK-NEXT: cmp w8, #1 // =1
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%urem = urem i9 %X, -5
@@ -72,41 +65,29 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; CHECK-LABEL: test_urem_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w12, #43691
-; CHECK-NEXT: and w8, w0, #0x7ff
-; CHECK-NEXT: movk w12, #43690, lsl #16
-; CHECK-NEXT: umull x12, w8, w12
-; CHECK-NEXT: mov w11, #25663
-; CHECK-NEXT: mov w13, #6
-; CHECK-NEXT: lsr x12, x12, #34
-; CHECK-NEXT: and w10, w2, #0x7ff
-; CHECK-NEXT: movk w11, #160, lsl #16
-; CHECK-NEXT: msub w8, w12, w13, w8
-; CHECK-NEXT: mov w12, #18725
-; CHECK-NEXT: and w9, w1, #0x7ff
-; CHECK-NEXT: movk w12, #9362, lsl #16
-; CHECK-NEXT: umull x11, w10, w11
-; CHECK-NEXT: adrp x13, .LCPI4_0
-; CHECK-NEXT: umull x12, w9, w12
-; CHECK-NEXT: lsr x11, x11, #32
-; CHECK-NEXT: ldr d0, [x13, :lo12:.LCPI4_0]
-; CHECK-NEXT: lsr x12, x12, #32
-; CHECK-NEXT: sub w13, w10, w11
-; CHECK-NEXT: add w11, w11, w13, lsr #1
-; CHECK-NEXT: sub w13, w9, w12
-; CHECK-NEXT: add w12, w12, w13, lsr #1
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov w8, #2043
-; CHECK-NEXT: lsr w11, w11, #10
-; CHECK-NEXT: lsr w12, w12, #2
-; CHECK-NEXT: msub w8, w11, w8, w10
-; CHECK-NEXT: sub w10, w12, w12, lsl #3
-; CHECK-NEXT: add w9, w9, w10
-; CHECK-NEXT: mov v1.h[1], w9
-; CHECK-NEXT: mov v1.h[2], w8
-; CHECK-NEXT: bic v1.4h, #248, lsl #8
-; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: adrp x9, .LCPI4_1
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT: adrp x8, .LCPI4_2
+; CHECK-NEXT: adrp x9, .LCPI4_3
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI4_3]
+; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: adrp x8, .LCPI4_4
+; CHECK-NEXT: shl v2.4h, v0.4h, #1
+; CHECK-NEXT: ushl v2.4h, v2.4h, v3.4h
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_4]
+; CHECK-NEXT: neg v1.4h, v1.4h
+; CHECK-NEXT: bic v0.4h, #248, lsl #8
+; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: bic v0.4h, #248, lsl #8
+; CHECK-NEXT: cmhi v0.4h, v0.4h, v3.4h
; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: umov w1, v0.h[1]
; CHECK-NEXT: umov w2, v0.h[2]
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
index a2ea19dd2314..a3d5c7da7b6d 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
@@ -195,15 +195,12 @@ define i1 @t32_6_5(i32 %X) nounwind {
define i1 @t16_3_2(i16 %X) nounwind {
; CHECK-LABEL: t16_3_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: mov w9, #43691
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: lsr w8, w8, #17
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w8, #-21845
+; CHECK-NEXT: mov w9, #-21846
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #21845
+; CHECK-NEXT: cmp w9, w8, uxth
+; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%urem = urem i16 %X, 3
%cmp = icmp eq i16 %urem, 2
@@ -213,15 +210,12 @@ define i1 @t16_3_2(i16 %X) nounwind {
define i1 @t8_3_2(i8 %X) nounwind {
; CHECK-LABEL: t8_3_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov w9, #171
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: lsr w8, w8, #9
-; CHECK-NEXT: add w8, w8, w8, lsl #1
-; CHECK-NEXT: sub w8, w0, w8
+; CHECK-NEXT: mov w8, #-85
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: sub w8, w8, #86 // =86
; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: cmp w8, #2 // =2
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: cmp w8, #85 // =85
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i8 %X, 3
%cmp = icmp eq i8 %urem, 2
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
index 0faeebbacacc..bfdbd2b91d92 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -11,17 +11,14 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
; CHECK-NEXT: adrp x8, .LCPI0_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI0_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -79,17 +76,14 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
; CHECK-NEXT: adrp x8, .LCPI3_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI3_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -107,19 +101,16 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI4_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -137,17 +128,14 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
; CHECK-NEXT: adrp x8, .LCPI5_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI5_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -165,19 +153,16 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1]
; CHECK-NEXT: adrp x8, .LCPI6_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI6_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -197,13 +182,14 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1]
; CHECK-NEXT: adrp x8, .LCPI7_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: adrp x8, .LCPI7_3
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -223,17 +209,14 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1]
; CHECK-NEXT: adrp x8, .LCPI8_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI8_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -253,17 +236,14 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
; CHECK-NEXT: adrp x8, .LCPI9_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI9_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -299,26 +279,16 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI11_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT: adrp x8, .LCPI11_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1]
-; CHECK-NEXT: adrp x8, .LCPI11_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: adrp x8, .LCPI11_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_3]
-; CHECK-NEXT: adrp x8, .LCPI11_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: adrp x9, .LCPI11_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v1.4s, v0.4s, #31
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -338,20 +308,14 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
; CHECK-NEXT: adrp x8, .LCPI12_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI12_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3]
-; CHECK-NEXT: adrp x8, .LCPI12_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -373,13 +337,14 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
; CHECK-NEXT: adrp x8, .LCPI13_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: adrp x8, .LCPI13_3
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -399,17 +364,14 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
; CHECK-NEXT: adrp x8, .LCPI14_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI14_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -429,17 +391,14 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
; CHECK-NEXT: adrp x8, .LCPI15_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI15_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -461,13 +420,14 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: adrp x8, .LCPI16_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: adrp x8, .LCPI16_3
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -487,17 +447,14 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: adrp x8, .LCPI17_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI17_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -517,13 +474,14 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1]
; CHECK-NEXT: adrp x8, .LCPI18_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: adrp x8, .LCPI18_3
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -564,20 +522,14 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1]
; CHECK-NEXT: adrp x8, .LCPI20_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI20_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3]
-; CHECK-NEXT: adrp x8, .LCPI20_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -598,15 +550,13 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI21_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2]
; CHECK-NEXT: adrp x8, .LCPI21_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -629,15 +579,13 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI22_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2]
; CHECK-NEXT: adrp x8, .LCPI22_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -657,20 +605,14 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
; CHECK-NEXT: adrp x8, .LCPI23_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI23_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT: adrp x8, .LCPI23_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -691,15 +633,13 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI24_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2]
; CHECK-NEXT: adrp x8, .LCPI24_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -721,15 +661,13 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-NEXT: adrp x8, .LCPI25_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2]
; CHECK-NEXT: adrp x8, .LCPI25_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -748,20 +686,14 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1]
; CHECK-NEXT: adrp x8, .LCPI26_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI26_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT: adrp x8, .LCPI26_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_3]
; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
index 85abb4d7f830..3cf09fb2d5cd 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -45,18 +45,20 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_6_part0:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI2_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #6
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: shl v1.4s, v0.4s, #31
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.4s, w9
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
@@ -67,18 +69,19 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_6_part1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI3_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #6
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: adrp x8, .LCPI3_1
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: shl v1.4s, v0.4s, #31
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
@@ -92,22 +95,16 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
+; CHECK-NEXT: mov w9, #43691
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT: adrp x8, .LCPI4_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: adrp x8, .LCPI4_3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: adrp x8, .LCPI4_4
-; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4]
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: dup v3.4s, w9
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: movi d1, #0x00ffffffff0000
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 3>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 1, i32 2, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index ae51708e02ab..5a6add7eb49c 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -26,16 +26,17 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_100:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079
-; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: ushr v2.4s, v2.4s, #5
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #23593
+; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: mov w9, #23592
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: movk w9, #655, lsl #16
+; CHECK-NEXT: shl v1.4s, v0.4s, #30
+; CHECK-NEXT: ushr v0.4s, v0.4s, #2
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -74,19 +75,11 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: adrp x8, .LCPI3_1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: adrp x8, .LCPI3_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v1.4s, v0.4s, #30
+; CHECK-NEXT: ushr v0.4s, v0.4s, #2
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll
index 551d2591ba48..74659f44808b 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -78,14 +78,14 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind {
define i16 @test_urem_even(i16 %X) nounwind {
; CHECK-LABEL: test_urem_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w8, w0, #1, #15
-; CHECK-NEXT: mov w9, #18725
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: lsr w8, w8, #17
-; CHECK-NEXT: mov w9, #14
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: tst w8, #0xffff
-; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: and w9, w8, #0xfffc
+; CHECK-NEXT: lsr w9, w9, #1
+; CHECK-NEXT: bfi w9, w8, #15, #17
+; CHECK-NEXT: ubfx w8, w9, #1, #15
+; CHECK-NEXT: cmp w8, #2340 // =2340
+; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%urem = urem i16 %X, 14
%cmp = icmp ne i16 %urem, 0
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 02e1186a4984..9bafe57b786e 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -5,17 +5,12 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-LABEL: test_srem_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 29
-; CHECK-NEXT: s_mov_b32 s5, 0xa57eb503
-; CHECK-NEXT: s_movk_i32 s4, 0x63
-; CHECK-NEXT: v_mul_hi_i32 v1, v0, s5
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 6, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_mov_b32 s4, 0x1f5a814b
+; CHECK-NEXT: s_mov_b32 s5, 0x52bf5b
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 0x295fad, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%srem = srem i29 %X, 99
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index eee9a4e69738..96b1c77d0849 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -5,13 +5,12 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-LABEL: test_urem_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x1fff
+; CHECK-NEXT: s_movk_i32 s5, 0x667
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0xccd, v0
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i13 %X, 5
@@ -23,14 +22,14 @@ define i1 @test_urem_even(i27 %X) nounwind {
; CHECK-LABEL: test_urem_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 0x7ffffff, v0
+; CHECK-NEXT: s_mov_b32 s4, 0x6db6db7
+; CHECK-NEXT: s_mov_b32 s5, 0x924925
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 26, v0
; CHECK-NEXT: v_bfe_u32 v0, v0, 1, 26
-; CHECK-NEXT: s_mov_b32 s4, 0x92492493
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: v_mul_u32_u24_e32 v0, 14, v0
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i27 %X, 14
@@ -43,12 +42,9 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 13, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 3, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i4 %X, 5
@@ -60,13 +56,11 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-LABEL: test_urem_negative_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0x2050c9f9
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 6, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x1fb, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x1ff
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0x133, v0
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%urem = urem i9 %X, -5
diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
index 1fa7c5ca450c..5ba72fcb427c 100644
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -10,32 +10,30 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; ARM5-LABEL: test_srem_odd:
; ARM5: @ %bb.0:
; ARM5-NEXT: ldr r2, .LCPI0_1
-; ARM5-NEXT: lsl r0, r0, #3
-; ARM5-NEXT: asr r0, r0, #3
; ARM5-NEXT: ldr r1, .LCPI0_0
; ARM5-NEXT: mla r3, r0, r2, r1
-; ARM5-NEXT: ldr r1, .LCPI0_2
+; ARM5-NEXT: ldr r2, .LCPI0_2
; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r3, r1
+; ARM5-NEXT: bic r1, r3, #-536870912
+; ARM5-NEXT: cmp r1, r2
; ARM5-NEXT: movlo r0, #1
; ARM5-NEXT: bx lr
; ARM5-NEXT: .p2align 2
; ARM5-NEXT: @ %bb.1:
; ARM5-NEXT: .LCPI0_0:
-; ARM5-NEXT: .long 21691754 @ 0x14afd6a
+; ARM5-NEXT: .long 2711469 @ 0x295fad
; ARM5-NEXT: .LCPI0_1:
-; ARM5-NEXT: .long 3210379595 @ 0xbf5a814b
+; ARM5-NEXT: .long 526025035 @ 0x1f5a814b
; ARM5-NEXT: .LCPI0_2:
-; ARM5-NEXT: .long 43383509 @ 0x295fad5
+; ARM5-NEXT: .long 5422939 @ 0x52bf5b
;
; ARM6-LABEL: test_srem_odd:
; ARM6: @ %bb.0:
; ARM6-NEXT: ldr r2, .LCPI0_1
-; ARM6-NEXT: lsl r0, r0, #3
-; ARM6-NEXT: asr r0, r0, #3
; ARM6-NEXT: ldr r1, .LCPI0_0
-; ARM6-NEXT: mla r1, r0, r2, r1
+; ARM6-NEXT: mla r0, r0, r2, r1
; ARM6-NEXT: ldr r2, .LCPI0_2
+; ARM6-NEXT: bic r1, r0, #-536870912
; ARM6-NEXT: mov r0, #0
; ARM6-NEXT: cmp r1, r2
; ARM6-NEXT: movlo r0, #1
@@ -43,22 +41,22 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; ARM6-NEXT: .p2align 2
; ARM6-NEXT: @ %bb.1:
; ARM6-NEXT: .LCPI0_0:
-; ARM6-NEXT: .long 21691754 @ 0x14afd6a
+; ARM6-NEXT: .long 2711469 @ 0x295fad
; ARM6-NEXT: .LCPI0_1:
-; ARM6-NEXT: .long 3210379595 @ 0xbf5a814b
+; ARM6-NEXT: .long 526025035 @ 0x1f5a814b
; ARM6-NEXT: .LCPI0_2:
-; ARM6-NEXT: .long 43383509 @ 0x295fad5
+; ARM6-NEXT: .long 5422939 @ 0x52bf5b
;
; ARM7-LABEL: test_srem_odd:
; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #64874
+; ARM7-NEXT: movw r1, #24493
; ARM7-NEXT: movw r2, #33099
-; ARM7-NEXT: sbfx r0, r0, #0, #29
-; ARM7-NEXT: movt r1, #330
-; ARM7-NEXT: movt r2, #48986
-; ARM7-NEXT: mla r1, r0, r2, r1
-; ARM7-NEXT: movw r2, #64213
-; ARM7-NEXT: movt r2, #661
+; ARM7-NEXT: movt r1, #41
+; ARM7-NEXT: movt r2, #8026
+; ARM7-NEXT: mla r0, r0, r2, r1
+; ARM7-NEXT: movw r2, #48987
+; ARM7-NEXT: movt r2, #82
+; ARM7-NEXT: bic r1, r0, #-536870912
; ARM7-NEXT: mov r0, #0
; ARM7-NEXT: cmp r1, r2
; ARM7-NEXT: movwlo r0, #1
@@ -66,14 +64,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
;
; ARM8-LABEL: test_srem_odd:
; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #64874
+; ARM8-NEXT: movw r1, #24493
; ARM8-NEXT: movw r2, #33099
-; ARM8-NEXT: sbfx r0, r0, #0, #29
-; ARM8-NEXT: movt r1, #330
-; ARM8-NEXT: movt r2, #48986
-; ARM8-NEXT: mla r1, r0, r2, r1
-; ARM8-NEXT: movw r2, #64213
-; ARM8-NEXT: movt r2, #661
+; ARM8-NEXT: movt r1, #41
+; ARM8-NEXT: movt r2, #8026
+; ARM8-NEXT: mla r0, r0, r2, r1
+; ARM8-NEXT: movw r2, #48987
+; ARM8-NEXT: movt r2, #82
+; ARM8-NEXT: bic r1, r0, #-536870912
; ARM8-NEXT: mov r0, #0
; ARM8-NEXT: cmp r1, r2
; ARM8-NEXT: movwlo r0, #1
@@ -81,14 +79,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
;
; NEON7-LABEL: test_srem_odd:
; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #64874
+; NEON7-NEXT: movw r1, #24493
; NEON7-NEXT: movw r2, #33099
-; NEON7-NEXT: sbfx r0, r0, #0, #29
-; NEON7-NEXT: movt r1, #330
-; NEON7-NEXT: movt r2, #48986
-; NEON7-NEXT: mla r1, r0, r2, r1
-; NEON7-NEXT: movw r2, #64213
-; NEON7-NEXT: movt r2, #661
+; NEON7-NEXT: movt r1, #41
+; NEON7-NEXT: movt r2, #8026
+; NEON7-NEXT: mla r0, r0, r2, r1
+; NEON7-NEXT: movw r2, #48987
+; NEON7-NEXT: movt r2, #82
+; NEON7-NEXT: bic r1, r0, #-536870912
; NEON7-NEXT: mov r0, #0
; NEON7-NEXT: cmp r1, r2
; NEON7-NEXT: movwlo r0, #1
@@ -96,14 +94,14 @@ define i1 @test_srem_odd(i29 %X) nounwind {
;
; NEON8-LABEL: test_srem_odd:
; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #64874
+; NEON8-NEXT: movw r1, #24493
; NEON8-NEXT: movw r2, #33099
-; NEON8-NEXT: sbfx r0, r0, #0, #29
-; NEON8-NEXT: movt r1, #330
-; NEON8-NEXT: movt r2, #48986
-; NEON8-NEXT: mla r1, r0, r2, r1
-; NEON8-NEXT: movw r2, #64213
-; NEON8-NEXT: movt r2, #661
+; NEON8-NEXT: movt r1, #41
+; NEON8-NEXT: movt r2, #8026
+; NEON8-NEXT: mla r0, r0, r2, r1
+; NEON8-NEXT: movw r2, #48987
+; NEON8-NEXT: movt r2, #82
+; NEON8-NEXT: bic r1, r0, #-536870912
; NEON8-NEXT: mov r0, #0
; NEON8-NEXT: cmp r1, r2
; NEON8-NEXT: movwlo r0, #1
diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
index ec1781130b5e..dd20fd07dff0 100644
--- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
@@ -9,90 +9,74 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; ARM5-LABEL: test_urem_odd:
; ARM5: @ %bb.0:
-; ARM5-NEXT: mov r1, #255
-; ARM5-NEXT: orr r1, r1, #7936
-; ARM5-NEXT: and r0, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI0_0
+; ARM5-NEXT: mov r1, #205
+; ARM5-NEXT: orr r1, r1, #3072
; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI0_1
+; ARM5-NEXT: mov r0, #255
+; ARM5-NEXT: orr r0, r0, #7936
+; ARM5-NEXT: and r1, r2, r0
+; ARM5-NEXT: mov r2, #103
+; ARM5-NEXT: orr r2, r2, #1536
; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: cmp r1, r2
; ARM5-NEXT: movlo r0, #1
; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI0_0:
-; ARM5-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT: .LCPI0_1:
-; ARM5-NEXT: .long 858993460 @ 0x33333334
;
; ARM6-LABEL: test_urem_odd:
; ARM6: @ %bb.0:
+; ARM6-NEXT: mov r1, #205
+; ARM6-NEXT: mov r2, #103
+; ARM6-NEXT: orr r1, r1, #3072
+; ARM6-NEXT: orr r2, r2, #1536
+; ARM6-NEXT: mul r0, r0, r1
; ARM6-NEXT: mov r1, #255
-; ARM6-NEXT: ldr r2, .LCPI0_1
; ARM6-NEXT: orr r1, r1, #7936
-; ARM6-NEXT: and r0, r0, r1
-; ARM6-NEXT: ldr r1, .LCPI0_0
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: and r1, r0, r1
; ARM6-NEXT: mov r0, #0
; ARM6-NEXT: cmp r1, r2
; ARM6-NEXT: movlo r0, #1
; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI0_0:
-; ARM6-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT: .LCPI0_1:
-; ARM6-NEXT: .long 858993460 @ 0x33333334
;
; ARM7-LABEL: test_urem_odd:
; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #52429
-; ARM7-NEXT: bfc r0, #13, #19
-; ARM7-NEXT: movt r1, #52428
-; ARM7-NEXT: movw r2, #13108
+; ARM7-NEXT: movw r1, #3277
+; ARM7-NEXT: movw r2, #1639
; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #13107
; ARM7-NEXT: mov r0, #0
+; ARM7-NEXT: bfc r1, #13, #19
; ARM7-NEXT: cmp r1, r2
; ARM7-NEXT: movwlo r0, #1
; ARM7-NEXT: bx lr
;
; ARM8-LABEL: test_urem_odd:
; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #52429
-; ARM8-NEXT: bfc r0, #13, #19
-; ARM8-NEXT: movt r1, #52428
-; ARM8-NEXT: movw r2, #13108
+; ARM8-NEXT: movw r1, #3277
+; ARM8-NEXT: movw r2, #1639
; ARM8-NEXT: mul r1, r0, r1
-; ARM8-NEXT: movt r2, #13107
; ARM8-NEXT: mov r0, #0
+; ARM8-NEXT: bfc r1, #13, #19
; ARM8-NEXT: cmp r1, r2
; ARM8-NEXT: movwlo r0, #1
; ARM8-NEXT: bx lr
;
; NEON7-LABEL: test_urem_odd:
; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #52429
-; NEON7-NEXT: bfc r0, #13, #19
-; NEON7-NEXT: movt r1, #52428
-; NEON7-NEXT: movw r2, #13108
+; NEON7-NEXT: movw r1, #3277
+; NEON7-NEXT: movw r2, #1639
; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #13107
; NEON7-NEXT: mov r0, #0
+; NEON7-NEXT: bfc r1, #13, #19
; NEON7-NEXT: cmp r1, r2
; NEON7-NEXT: movwlo r0, #1
; NEON7-NEXT: bx lr
;
; NEON8-LABEL: test_urem_odd:
; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #52429
-; NEON8-NEXT: bfc r0, #13, #19
-; NEON8-NEXT: movt r1, #52428
-; NEON8-NEXT: movw r2, #13108
+; NEON8-NEXT: movw r1, #3277
+; NEON8-NEXT: movw r2, #1639
; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #13107
; NEON8-NEXT: mov r0, #0
+; NEON8-NEXT: bfc r1, #13, #19
; NEON8-NEXT: cmp r1, r2
; NEON8-NEXT: movwlo r0, #1
; NEON8-NEXT: bx lr
@@ -105,28 +89,32 @@ define i1 @test_urem_even(i27 %X) nounwind {
; ARM5-LABEL: test_urem_even:
; ARM5: @ %bb.0:
; ARM5-NEXT: ldr r1, .LCPI1_0
-; ARM5-NEXT: bic r0, r0, #-134217728
; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: ror r1, r2, #1
+; ARM5-NEXT: bic r0, r2, #-134217727
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: orr r0, r0, r2, lsl #26
; ARM5-NEXT: ldr r2, .LCPI1_1
+; ARM5-NEXT: bic r1, r0, #-134217728
+; ARM5-NEXT: mov r0, #0
; ARM5-NEXT: cmp r1, r2
; ARM5-NEXT: movlo r0, #1
; ARM5-NEXT: bx lr
; ARM5-NEXT: .p2align 2
; ARM5-NEXT: @ %bb.1:
; ARM5-NEXT: .LCPI1_0:
-; ARM5-NEXT: .long 3067833783 @ 0xb6db6db7
+; ARM5-NEXT: .long 115043767 @ 0x6db6db7
; ARM5-NEXT: .LCPI1_1:
-; ARM5-NEXT: .long 306783379 @ 0x12492493
+; ARM5-NEXT: .long 9586981 @ 0x924925
;
; ARM6-LABEL: test_urem_even:
; ARM6: @ %bb.0:
; ARM6-NEXT: ldr r1, .LCPI1_0
-; ARM6-NEXT: bic r0, r0, #-134217728
; ARM6-NEXT: ldr r2, .LCPI1_1
; ARM6-NEXT: mul r0, r0, r1
-; ARM6-NEXT: ror r1, r0, #1
+; ARM6-NEXT: bic r1, r0, #-134217727
+; ARM6-NEXT: lsr r1, r1, #1
+; ARM6-NEXT: orr r0, r1, r0, lsl #26
+; ARM6-NEXT: bic r1, r0, #-134217728
; ARM6-NEXT: mov r0, #0
; ARM6-NEXT: cmp r1, r2
; ARM6-NEXT: movlo r0, #1
@@ -134,19 +122,20 @@ define i1 @test_urem_even(i27 %X) nounwind {
; ARM6-NEXT: .p2align 2
; ARM6-NEXT: @ %bb.1:
; ARM6-NEXT: .LCPI1_0:
-; ARM6-NEXT: .long 3067833783 @ 0xb6db6db7
+; ARM6-NEXT: .long 115043767 @ 0x6db6db7
; ARM6-NEXT: .LCPI1_1:
-; ARM6-NEXT: .long 306783379 @ 0x12492493
+; ARM6-NEXT: .long 9586981 @ 0x924925
;
; ARM7-LABEL: test_urem_even:
; ARM7: @ %bb.0:
; ARM7-NEXT: movw r1, #28087
-; ARM7-NEXT: bic r0, r0, #-134217728
-; ARM7-NEXT: movt r1, #46811
-; ARM7-NEXT: movw r2, #9363
+; ARM7-NEXT: movw r2, #18725
+; ARM7-NEXT: movt r1, #1755
+; ARM7-NEXT: movt r2, #146
; ARM7-NEXT: mul r0, r0, r1
-; ARM7-NEXT: movt r2, #4681
-; ARM7-NEXT: ror r1, r0, #1
+; ARM7-NEXT: ubfx r1, r0, #1, #26
+; ARM7-NEXT: orr r0, r1, r0, lsl #26
+; ARM7-NEXT: bic r1, r0, #-134217728
; ARM7-NEXT: mov r0, #0
; ARM7-NEXT: cmp r1, r2
; ARM7-NEXT: movwlo r0, #1
@@ -155,12 +144,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
; ARM8-LABEL: test_urem_even:
; ARM8: @ %bb.0:
; ARM8-NEXT: movw r1, #28087
-; ARM8-NEXT: bic r0, r0, #-134217728
-; ARM8-NEXT: movt r1, #46811
-; ARM8-NEXT: movw r2, #9363
+; ARM8-NEXT: movw r2, #18725
+; ARM8-NEXT: movt r1, #1755
+; ARM8-NEXT: movt r2, #146
; ARM8-NEXT: mul r0, r0, r1
-; ARM8-NEXT: movt r2, #4681
-; ARM8-NEXT: ror r1, r0, #1
+; ARM8-NEXT: ubfx r1, r0, #1, #26
+; ARM8-NEXT: orr r0, r1, r0, lsl #26
+; ARM8-NEXT: bic r1, r0, #-134217728
; ARM8-NEXT: mov r0, #0
; ARM8-NEXT: cmp r1, r2
; ARM8-NEXT: movwlo r0, #1
@@ -169,12 +159,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
; NEON7-LABEL: test_urem_even:
; NEON7: @ %bb.0:
; NEON7-NEXT: movw r1, #28087
-; NEON7-NEXT: bic r0, r0, #-134217728
-; NEON7-NEXT: movt r1, #46811
-; NEON7-NEXT: movw r2, #9363
+; NEON7-NEXT: movw r2, #18725
+; NEON7-NEXT: movt r1, #1755
+; NEON7-NEXT: movt r2, #146
; NEON7-NEXT: mul r0, r0, r1
-; NEON7-NEXT: movt r2, #4681
-; NEON7-NEXT: ror r1, r0, #1
+; NEON7-NEXT: ubfx r1, r0, #1, #26
+; NEON7-NEXT: orr r0, r1, r0, lsl #26
+; NEON7-NEXT: bic r1, r0, #-134217728
; NEON7-NEXT: mov r0, #0
; NEON7-NEXT: cmp r1, r2
; NEON7-NEXT: movwlo r0, #1
@@ -183,12 +174,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
; NEON8-LABEL: test_urem_even:
; NEON8: @ %bb.0:
; NEON8-NEXT: movw r1, #28087
-; NEON8-NEXT: bic r0, r0, #-134217728
-; NEON8-NEXT: movt r1, #46811
-; NEON8-NEXT: movw r2, #9363
+; NEON8-NEXT: movw r2, #18725
+; NEON8-NEXT: movt r1, #1755
+; NEON8-NEXT: movt r2, #146
; NEON8-NEXT: mul r0, r0, r1
-; NEON8-NEXT: movt r2, #4681
-; NEON8-NEXT: ror r1, r0, #1
+; NEON8-NEXT: ubfx r1, r0, #1, #26
+; NEON8-NEXT: orr r0, r1, r0, lsl #26
+; NEON8-NEXT: bic r1, r0, #-134217728
; NEON8-NEXT: mov r0, #0
; NEON8-NEXT: cmp r1, r2
; NEON8-NEXT: movwlo r0, #1
@@ -201,87 +193,61 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; ARM5-LABEL: test_urem_odd_setne:
; ARM5: @ %bb.0:
-; ARM5-NEXT: ldr r1, .LCPI2_0
-; ARM5-NEXT: and r0, r0, #15
+; ARM5-NEXT: mov r1, #13
; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI2_1
; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: and r1, r2, #15
+; ARM5-NEXT: cmp r1, #3
; ARM5-NEXT: movhi r0, #1
; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI2_0:
-; ARM5-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT: .LCPI2_1:
-; ARM5-NEXT: .long 858993459 @ 0x33333333
;
; ARM6-LABEL: test_urem_odd_setne:
; ARM6: @ %bb.0:
-; ARM6-NEXT: ldr r1, .LCPI2_0
-; ARM6-NEXT: and r0, r0, #15
-; ARM6-NEXT: ldr r2, .LCPI2_1
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: mov r1, #13
+; ARM6-NEXT: mul r0, r0, r1
+; ARM6-NEXT: and r1, r0, #15
; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: cmp r1, r2
+; ARM6-NEXT: cmp r1, #3
; ARM6-NEXT: movhi r0, #1
; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI2_0:
-; ARM6-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT: .LCPI2_1:
-; ARM6-NEXT: .long 858993459 @ 0x33333333
;
; ARM7-LABEL: test_urem_odd_setne:
; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #52429
-; ARM7-NEXT: and r0, r0, #15
-; ARM7-NEXT: movt r1, #52428
-; ARM7-NEXT: movw r2, #13107
-; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #13107
+; ARM7-NEXT: mov r1, #13
+; ARM7-NEXT: mul r0, r0, r1
+; ARM7-NEXT: and r1, r0, #15
; ARM7-NEXT: mov r0, #0
-; ARM7-NEXT: cmp r1, r2
+; ARM7-NEXT: cmp r1, #3
; ARM7-NEXT: movwhi r0, #1
; ARM7-NEXT: bx lr
;
; ARM8-LABEL: test_urem_odd_setne:
; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #52429
-; ARM8-NEXT: and r0, r0, #15
-; ARM8-NEXT: movt r1, #52428
-; ARM8-NEXT: movw r2, #13107
-; ARM8-NEXT: mul r1, r0, r1
-; ARM8-NEXT: movt r2, #13107
+; ARM8-NEXT: mov r1, #13
+; ARM8-NEXT: mul r0, r0, r1
+; ARM8-NEXT: and r1, r0, #15
; ARM8-NEXT: mov r0, #0
-; ARM8-NEXT: cmp r1, r2
+; ARM8-NEXT: cmp r1, #3
; ARM8-NEXT: movwhi r0, #1
; ARM8-NEXT: bx lr
;
; NEON7-LABEL: test_urem_odd_setne:
; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #52429
-; NEON7-NEXT: and r0, r0, #15
-; NEON7-NEXT: movt r1, #52428
-; NEON7-NEXT: movw r2, #13107
-; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #13107
+; NEON7-NEXT: mov r1, #13
+; NEON7-NEXT: mul r0, r0, r1
+; NEON7-NEXT: and r1, r0, #15
; NEON7-NEXT: mov r0, #0
-; NEON7-NEXT: cmp r1, r2
+; NEON7-NEXT: cmp r1, #3
; NEON7-NEXT: movwhi r0, #1
; NEON7-NEXT: bx lr
;
; NEON8-LABEL: test_urem_odd_setne:
; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #52429
-; NEON8-NEXT: and r0, r0, #15
-; NEON8-NEXT: movt r1, #52428
-; NEON8-NEXT: movw r2, #13107
-; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #13107
+; NEON8-NEXT: mov r1, #13
+; NEON8-NEXT: mul r0, r0, r1
+; NEON8-NEXT: and r1, r0, #15
; NEON8-NEXT: mov r0, #0
-; NEON8-NEXT: cmp r1, r2
+; NEON8-NEXT: cmp r1, #3
; NEON8-NEXT: movwhi r0, #1
; NEON8-NEXT: bx lr
%urem = urem i4 %X, 5
@@ -292,91 +258,67 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; ARM5-LABEL: test_urem_negative_odd:
; ARM5: @ %bb.0:
-; ARM5-NEXT: mov r1, #255
+; ARM5-NEXT: mov r1, #51
; ARM5-NEXT: orr r1, r1, #256
-; ARM5-NEXT: and r0, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI3_0
; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI3_1
+; ARM5-NEXT: mov r0, #255
+; ARM5-NEXT: orr r0, r0, #256
+; ARM5-NEXT: and r1, r2, r0
; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: cmp r1, #1
; ARM5-NEXT: movhi r0, #1
; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI3_0:
-; ARM5-NEXT: .long 2837897523 @ 0xa926e133
-; ARM5-NEXT: .LCPI3_1:
-; ARM5-NEXT: .long 8471335 @ 0x814327
;
; ARM6-LABEL: test_urem_negative_odd:
; ARM6: @ %bb.0:
+; ARM6-NEXT: mov r1, #51
+; ARM6-NEXT: orr r1, r1, #256
+; ARM6-NEXT: mul r0, r0, r1
; ARM6-NEXT: mov r1, #255
-; ARM6-NEXT: ldr r2, .LCPI3_1
; ARM6-NEXT: orr r1, r1, #256
-; ARM6-NEXT: and r0, r0, r1
-; ARM6-NEXT: ldr r1, .LCPI3_0
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: and r1, r0, r1
; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: cmp r1, r2
+; ARM6-NEXT: cmp r1, #1
; ARM6-NEXT: movhi r0, #1
; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI3_0:
-; ARM6-NEXT: .long 2837897523 @ 0xa926e133
-; ARM6-NEXT: .LCPI3_1:
-; ARM6-NEXT: .long 8471335 @ 0x814327
;
; ARM7-LABEL: test_urem_negative_odd:
; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #57651
-; ARM7-NEXT: bfc r0, #9, #23
-; ARM7-NEXT: movt r1, #43302
-; ARM7-NEXT: movw r2, #17191
+; ARM7-NEXT: movw r1, #307
; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #129
; ARM7-NEXT: mov r0, #0
-; ARM7-NEXT: cmp r1, r2
+; ARM7-NEXT: bfc r1, #9, #23
+; ARM7-NEXT: cmp r1, #1
; ARM7-NEXT: movwhi r0, #1
; ARM7-NEXT: bx lr
;
; ARM8-LABEL: test_urem_negative_odd:
; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #57651
-; ARM8-NEXT: bfc r0, #9, #23
-; ARM8-NEXT: movt r1, #43302
-; ARM8-NEXT: movw r2, #17191
+; ARM8-NEXT: movw r1, #307
; ARM8-NEXT: mul r1, r0, r1
-; ARM8-NEXT: movt r2, #129
; ARM8-NEXT: mov r0, #0
-; ARM8-NEXT: cmp r1, r2
+; ARM8-NEXT: bfc r1, #9, #23
+; ARM8-NEXT: cmp r1, #1
; ARM8-NEXT: movwhi r0, #1
; ARM8-NEXT: bx lr
;
; NEON7-LABEL: test_urem_negative_odd:
; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #57651
-; NEON7-NEXT: bfc r0, #9, #23
-; NEON7-NEXT: movt r1, #43302
-; NEON7-NEXT: movw r2, #17191
+; NEON7-NEXT: movw r1, #307
; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #129
; NEON7-NEXT: mov r0, #0
-; NEON7-NEXT: cmp r1, r2
+; NEON7-NEXT: bfc r1, #9, #23
+; NEON7-NEXT: cmp r1, #1
; NEON7-NEXT: movwhi r0, #1
; NEON7-NEXT: bx lr
;
; NEON8-LABEL: test_urem_negative_odd:
; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #57651
-; NEON8-NEXT: bfc r0, #9, #23
-; NEON8-NEXT: movt r1, #43302
-; NEON8-NEXT: movw r2, #17191
+; NEON8-NEXT: movw r1, #307
; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #129
; NEON8-NEXT: mov r0, #0
-; NEON8-NEXT: cmp r1, r2
+; NEON8-NEXT: bfc r1, #9, #23
+; NEON8-NEXT: cmp r1, #1
; NEON8-NEXT: movwhi r0, #1
; NEON8-NEXT: bx lr
%urem = urem i9 %X, -5
@@ -388,289 +330,291 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; ARM5-LABEL: test_urem_vec:
; ARM5: @ %bb.0:
; ARM5-NEXT: push {r4, r5, r11, lr}
+; ARM5-NEXT: mov r3, #183
+; ARM5-NEXT: mvn r12, #182
+; ARM5-NEXT: orr r3, r3, #1280
+; ARM5-NEXT: sub r12, r12, #1280
+; ARM5-NEXT: mov r4, #51
+; ARM5-NEXT: mla lr, r1, r3, r12
; ARM5-NEXT: mov r12, #255
-; ARM5-NEXT: ldr r3, .LCPI4_1
; ARM5-NEXT: orr r12, r12, #1792
-; ARM5-NEXT: ldr lr, .LCPI4_0
-; ARM5-NEXT: and r1, r1, r12
-; ARM5-NEXT: and r2, r2, r12
-; ARM5-NEXT: and r0, r0, r12
-; ARM5-NEXT: mla r4, r1, r3, lr
-; ARM5-NEXT: ldr r1, .LCPI4_2
-; ARM5-NEXT: ldr lr, .LCPI4_3
+; ARM5-NEXT: orr r4, r4, #768
; ARM5-NEXT: mov r3, #0
-; ARM5-NEXT: cmp r4, r1
-; ARM5-NEXT: ldr r4, .LCPI4_4
+; ARM5-NEXT: and r1, lr, r12
+; ARM5-NEXT: mvn lr, #101
+; ARM5-NEXT: sub lr, lr, #1536
+; ARM5-NEXT: cmp r1, #292
+; ARM5-NEXT: mla r5, r2, r4, lr
; ARM5-NEXT: mov r1, #0
; ARM5-NEXT: movhi r1, #1
-; ARM5-NEXT: mla r5, r2, r4, lr
-; ARM5-NEXT: ldr r2, .LCPI4_5
-; ARM5-NEXT: cmp r5, r2
-; ARM5-NEXT: ldr r5, .LCPI4_6
+; ARM5-NEXT: and r2, r5, r12
+; ARM5-NEXT: mov r5, #171
+; ARM5-NEXT: orr r5, r5, #512
+; ARM5-NEXT: cmp r2, #1
; ARM5-NEXT: mov r2, #0
-; ARM5-NEXT: movhi r2, #1
; ARM5-NEXT: mul r4, r0, r5
-; ARM5-NEXT: ldr r5, .LCPI4_7
-; ARM5-NEXT: ror r0, r4, #1
-; ARM5-NEXT: cmp r0, r5
+; ARM5-NEXT: mov r0, #1020
+; ARM5-NEXT: orr r0, r0, #1024
+; ARM5-NEXT: mov r5, #254
+; ARM5-NEXT: movhi r2, #1
+; ARM5-NEXT: orr r5, r5, #1792
+; ARM5-NEXT: and r0, r4, r0
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: orr r0, r0, r4, lsl #10
+; ARM5-NEXT: and r0, r0, r5
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: cmp r0, #170
; ARM5-NEXT: movhi r3, #1
; ARM5-NEXT: mov r0, r3
; ARM5-NEXT: pop {r4, r5, r11, pc}
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI4_0:
-; ARM5-NEXT: .long 1227133513 @ 0x49249249
-; ARM5-NEXT: .LCPI4_1:
-; ARM5-NEXT: .long 3067833783 @ 0xb6db6db7
-; ARM5-NEXT: .LCPI4_2:
-; ARM5-NEXT: .long 613566756 @ 0x24924924
-; ARM5-NEXT: .LCPI4_3:
-; ARM5-NEXT: .long 4191955354 @ 0xf9dc299a
-; ARM5-NEXT: .LCPI4_4:
-; ARM5-NEXT: .long 2198989619 @ 0x8311eb33
-; ARM5-NEXT: .LCPI4_5:
-; ARM5-NEXT: .long 2102284 @ 0x20140c
-; ARM5-NEXT: .LCPI4_6:
-; ARM5-NEXT: .long 2863311531 @ 0xaaaaaaab
-; ARM5-NEXT: .LCPI4_7:
-; ARM5-NEXT: .long 715827882 @ 0x2aaaaaaa
;
; ARM6-LABEL: test_urem_vec:
; ARM6: @ %bb.0:
; ARM6-NEXT: push {r4, lr}
+; ARM6-NEXT: mov r4, #51
+; ARM6-NEXT: mvn lr, #101
+; ARM6-NEXT: orr r4, r4, #768
+; ARM6-NEXT: sub lr, lr, #1536
+; ARM6-NEXT: mov r3, #183
+; ARM6-NEXT: mvn r12, #182
+; ARM6-NEXT: mla r2, r2, r4, lr
+; ARM6-NEXT: mov r4, #171
+; ARM6-NEXT: orr r4, r4, #512
+; ARM6-NEXT: orr r3, r3, #1280
+; ARM6-NEXT: sub r12, r12, #1280
+; ARM6-NEXT: mul r0, r0, r4
+; ARM6-NEXT: mov r4, #1020
+; ARM6-NEXT: orr r4, r4, #1024
+; ARM6-NEXT: mla r1, r1, r3, r12
; ARM6-NEXT: mov r12, #255
-; ARM6-NEXT: ldr r3, .LCPI4_1
; ARM6-NEXT: orr r12, r12, #1792
-; ARM6-NEXT: ldr lr, .LCPI4_0
-; ARM6-NEXT: and r1, r1, r12
-; ARM6-NEXT: ldr r4, .LCPI4_4
; ARM6-NEXT: and r2, r2, r12
-; ARM6-NEXT: and r0, r0, r12
-; ARM6-NEXT: mla r1, r1, r3, lr
-; ARM6-NEXT: ldr lr, .LCPI4_2
; ARM6-NEXT: mov r3, #0
-; ARM6-NEXT: cmp r1, lr
-; ARM6-NEXT: ldr lr, .LCPI4_3
-; ARM6-NEXT: mla r2, r2, r4, lr
-; ARM6-NEXT: ldr r4, .LCPI4_5
+; ARM6-NEXT: and r4, r0, r4
+; ARM6-NEXT: lsr r4, r4, #1
+; ARM6-NEXT: orr r0, r4, r0, lsl #10
+; ARM6-NEXT: mov r4, #254
+; ARM6-NEXT: and r1, r1, r12
+; ARM6-NEXT: orr r4, r4, #1792
+; ARM6-NEXT: cmp r1, #292
; ARM6-NEXT: mov r1, #0
+; ARM6-NEXT: and r0, r0, r4
; ARM6-NEXT: movhi r1, #1
-; ARM6-NEXT: cmp r2, r4
-; ARM6-NEXT: ldr r4, .LCPI4_6
+; ARM6-NEXT: cmp r2, #1
; ARM6-NEXT: mov r2, #0
+; ARM6-NEXT: lsr r0, r0, #1
; ARM6-NEXT: movhi r2, #1
-; ARM6-NEXT: mul r0, r0, r4
-; ARM6-NEXT: ldr r4, .LCPI4_7
-; ARM6-NEXT: ror r0, r0, #1
-; ARM6-NEXT: cmp r0, r4
+; ARM6-NEXT: cmp r0, #170
; ARM6-NEXT: movhi r3, #1
; ARM6-NEXT: mov r0, r3
; ARM6-NEXT: pop {r4, pc}
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI4_0:
-; ARM6-NEXT: .long 1227133513 @ 0x49249249
-; ARM6-NEXT: .LCPI4_1:
-; ARM6-NEXT: .long 3067833783 @ 0xb6db6db7
-; ARM6-NEXT: .LCPI4_2:
-; ARM6-NEXT: .long 613566756 @ 0x24924924
-; ARM6-NEXT: .LCPI4_3:
-; ARM6-NEXT: .long 4191955354 @ 0xf9dc299a
-; ARM6-NEXT: .LCPI4_4:
-; ARM6-NEXT: .long 2198989619 @ 0x8311eb33
-; ARM6-NEXT: .LCPI4_5:
-; ARM6-NEXT: .long 2102284 @ 0x20140c
-; ARM6-NEXT: .LCPI4_6:
-; ARM6-NEXT: .long 2863311531 @ 0xaaaaaaab
-; ARM6-NEXT: .LCPI4_7:
-; ARM6-NEXT: .long 715827882 @ 0x2aaaaaaa
;
; ARM7-LABEL: test_urem_vec:
; ARM7: @ %bb.0:
-; ARM7-NEXT: push {r4, lr}
-; ARM7-NEXT: movw r3, #18725
-; ARM7-NEXT: bfc r1, #11, #21
-; ARM7-NEXT: movt r3, #9362
-; ARM7-NEXT: bfc r2, #11, #21
-; ARM7-NEXT: umull r3, r12, r1, r3
-; ARM7-NEXT: bfc r0, #11, #21
-; ARM7-NEXT: movw r3, #25663
-; ARM7-NEXT: movt r3, #160
-; ARM7-NEXT: umull r3, lr, r2, r3
-; ARM7-NEXT: vldr d17, .LCPI4_0
-; ARM7-NEXT: movw r3, #43691
-; ARM7-NEXT: movt r3, #43690
-; ARM7-NEXT: umull r3, r4, r0, r3
-; ARM7-NEXT: sub r3, r1, r12
-; ARM7-NEXT: add r3, r12, r3, lsr #1
-; ARM7-NEXT: lsr r12, r3, #2
-; ARM7-NEXT: sub r3, r2, lr
-; ARM7-NEXT: lsr r4, r4, #2
-; ARM7-NEXT: add r4, r4, r4, lsl #1
-; ARM7-NEXT: add r3, lr, r3, lsr #1
-; ARM7-NEXT: sub r0, r0, r4, lsl #1
-; ARM7-NEXT: lsr lr, r3, #10
-; ARM7-NEXT: movw r3, #2043
; ARM7-NEXT: vmov.16 d16[0], r0
-; ARM7-NEXT: sub r0, r12, r12, lsl #3
-; ARM7-NEXT: mls r2, lr, r3, r2
-; ARM7-NEXT: add r0, r1, r0
-; ARM7-NEXT: vmov.16 d16[1], r0
+; ARM7-NEXT: vldr d17, .LCPI4_0
+; ARM7-NEXT: vmov.16 d16[1], r1
+; ARM7-NEXT: vldr d19, .LCPI4_3
; ARM7-NEXT: vmov.16 d16[2], r2
+; ARM7-NEXT: vsub.i16 d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_1
+; ARM7-NEXT: vmul.i16 d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_2
+; ARM7-NEXT: vneg.s16 d17, d17
+; ARM7-NEXT: vshl.i16 d18, d16, #1
; ARM7-NEXT: vbic.i16 d16, #0xf800
-; ARM7-NEXT: vceq.i16 d16, d16, d17
-; ARM7-NEXT: vmvn d16, d16
+; ARM7-NEXT: vshl.u16 d16, d16, d17
+; ARM7-NEXT: vshl.u16 d17, d18, d19
+; ARM7-NEXT: vorr d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_4
+; ARM7-NEXT: vbic.i16 d16, #0xf800
+; ARM7-NEXT: vcgt.u16 d16, d16, d17
; ARM7-NEXT: vmov.u16 r0, d16[0]
; ARM7-NEXT: vmov.u16 r1, d16[1]
; ARM7-NEXT: vmov.u16 r2, d16[2]
-; ARM7-NEXT: pop {r4, pc}
+; ARM7-NEXT: bx lr
; ARM7-NEXT: .p2align 3
; ARM7-NEXT: @ %bb.1:
; ARM7-NEXT: .LCPI4_0:
; ARM7-NEXT: .short 0 @ 0x0
; ARM7-NEXT: .short 1 @ 0x1
; ARM7-NEXT: .short 2 @ 0x2
+; ARM7-NEXT: .zero 2
+; ARM7-NEXT: .LCPI4_1:
+; ARM7-NEXT: .short 683 @ 0x2ab
+; ARM7-NEXT: .short 1463 @ 0x5b7
+; ARM7-NEXT: .short 819 @ 0x333
+; ARM7-NEXT: .zero 2
+; ARM7-NEXT: .LCPI4_2:
+; ARM7-NEXT: .short 1 @ 0x1
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .LCPI4_3:
+; ARM7-NEXT: .short 9 @ 0x9
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .LCPI4_4:
+; ARM7-NEXT: .short 341 @ 0x155
+; ARM7-NEXT: .short 292 @ 0x124
+; ARM7-NEXT: .short 1 @ 0x1
; ARM7-NEXT: .short 0 @ 0x0
;
; ARM8-LABEL: test_urem_vec:
; ARM8: @ %bb.0:
-; ARM8-NEXT: push {r4, lr}
-; ARM8-NEXT: movw r3, #18725
-; ARM8-NEXT: bfc r1, #11, #21
-; ARM8-NEXT: movt r3, #9362
-; ARM8-NEXT: bfc r2, #11, #21
-; ARM8-NEXT: umull r3, r12, r1, r3
-; ARM8-NEXT: bfc r0, #11, #21
-; ARM8-NEXT: movw r3, #25663
-; ARM8-NEXT: movt r3, #160
-; ARM8-NEXT: umull r3, lr, r2, r3
-; ARM8-NEXT: vldr d17, .LCPI4_0
-; ARM8-NEXT: movw r3, #43691
-; ARM8-NEXT: movt r3, #43690
-; ARM8-NEXT: umull r3, r4, r0, r3
-; ARM8-NEXT: sub r3, r1, r12
-; ARM8-NEXT: add r3, r12, r3, lsr #1
-; ARM8-NEXT: lsr r12, r3, #2
-; ARM8-NEXT: sub r3, r2, lr
-; ARM8-NEXT: lsr r4, r4, #2
-; ARM8-NEXT: add r4, r4, r4, lsl #1
-; ARM8-NEXT: add r3, lr, r3, lsr #1
-; ARM8-NEXT: sub r0, r0, r4, lsl #1
-; ARM8-NEXT: lsr lr, r3, #10
-; ARM8-NEXT: movw r3, #2043
; ARM8-NEXT: vmov.16 d16[0], r0
-; ARM8-NEXT: sub r0, r12, r12, lsl #3
-; ARM8-NEXT: mls r2, lr, r3, r2
-; ARM8-NEXT: add r0, r1, r0
-; ARM8-NEXT: vmov.16 d16[1], r0
+; ARM8-NEXT: vldr d17, .LCPI4_0
+; ARM8-NEXT: vmov.16 d16[1], r1
+; ARM8-NEXT: vldr d19, .LCPI4_3
; ARM8-NEXT: vmov.16 d16[2], r2
+; ARM8-NEXT: vsub.i16 d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_1
+; ARM8-NEXT: vmul.i16 d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_2
+; ARM8-NEXT: vneg.s16 d17, d17
+; ARM8-NEXT: vshl.i16 d18, d16, #1
+; ARM8-NEXT: vbic.i16 d16, #0xf800
+; ARM8-NEXT: vshl.u16 d16, d16, d17
+; ARM8-NEXT: vshl.u16 d17, d18, d19
+; ARM8-NEXT: vorr d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_4
; ARM8-NEXT: vbic.i16 d16, #0xf800
-; ARM8-NEXT: vceq.i16 d16, d16, d17
-; ARM8-NEXT: vmvn d16, d16
+; ARM8-NEXT: vcgt.u16 d16, d16, d17
; ARM8-NEXT: vmov.u16 r0, d16[0]
; ARM8-NEXT: vmov.u16 r1, d16[1]
; ARM8-NEXT: vmov.u16 r2, d16[2]
-; ARM8-NEXT: pop {r4, pc}
+; ARM8-NEXT: bx lr
; ARM8-NEXT: .p2align 3
; ARM8-NEXT: @ %bb.1:
; ARM8-NEXT: .LCPI4_0:
; ARM8-NEXT: .short 0 @ 0x0
; ARM8-NEXT: .short 1 @ 0x1
; ARM8-NEXT: .short 2 @ 0x2
+; ARM8-NEXT: .zero 2
+; ARM8-NEXT: .LCPI4_1:
+; ARM8-NEXT: .short 683 @ 0x2ab
+; ARM8-NEXT: .short 1463 @ 0x5b7
+; ARM8-NEXT: .short 819 @ 0x333
+; ARM8-NEXT: .zero 2
+; ARM8-NEXT: .LCPI4_2:
+; ARM8-NEXT: .short 1 @ 0x1
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .LCPI4_3:
+; ARM8-NEXT: .short 9 @ 0x9
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .LCPI4_4:
+; ARM8-NEXT: .short 341 @ 0x155
+; ARM8-NEXT: .short 292 @ 0x124
+; ARM8-NEXT: .short 1 @ 0x1
; ARM8-NEXT: .short 0 @ 0x0
;
; NEON7-LABEL: test_urem_vec:
; NEON7: @ %bb.0:
-; NEON7-NEXT: push {r4, lr}
-; NEON7-NEXT: movw r3, #18725
-; NEON7-NEXT: bfc r1, #11, #21
-; NEON7-NEXT: movt r3, #9362
-; NEON7-NEXT: bfc r2, #11, #21
-; NEON7-NEXT: umull r3, r12, r1, r3
-; NEON7-NEXT: bfc r0, #11, #21
-; NEON7-NEXT: movw r3, #25663
-; NEON7-NEXT: movt r3, #160
-; NEON7-NEXT: umull r3, lr, r2, r3
-; NEON7-NEXT: vldr d17, .LCPI4_0
-; NEON7-NEXT: movw r3, #43691
-; NEON7-NEXT: movt r3, #43690
-; NEON7-NEXT: umull r3, r4, r0, r3
-; NEON7-NEXT: sub r3, r1, r12
-; NEON7-NEXT: add r3, r12, r3, lsr #1
-; NEON7-NEXT: lsr r12, r3, #2
-; NEON7-NEXT: sub r3, r2, lr
-; NEON7-NEXT: lsr r4, r4, #2
-; NEON7-NEXT: add r4, r4, r4, lsl #1
-; NEON7-NEXT: add r3, lr, r3, lsr #1
-; NEON7-NEXT: sub r0, r0, r4, lsl #1
-; NEON7-NEXT: lsr lr, r3, #10
-; NEON7-NEXT: movw r3, #2043
; NEON7-NEXT: vmov.16 d16[0], r0
-; NEON7-NEXT: sub r0, r12, r12, lsl #3
-; NEON7-NEXT: mls r2, lr, r3, r2
-; NEON7-NEXT: add r0, r1, r0
-; NEON7-NEXT: vmov.16 d16[1], r0
+; NEON7-NEXT: vldr d17, .LCPI4_0
+; NEON7-NEXT: vmov.16 d16[1], r1
+; NEON7-NEXT: vldr d19, .LCPI4_3
; NEON7-NEXT: vmov.16 d16[2], r2
+; NEON7-NEXT: vsub.i16 d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_1
+; NEON7-NEXT: vmul.i16 d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_2
+; NEON7-NEXT: vneg.s16 d17, d17
+; NEON7-NEXT: vshl.i16 d18, d16, #1
+; NEON7-NEXT: vbic.i16 d16, #0xf800
+; NEON7-NEXT: vshl.u16 d16, d16, d17
+; NEON7-NEXT: vshl.u16 d17, d18, d19
+; NEON7-NEXT: vorr d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_4
; NEON7-NEXT: vbic.i16 d16, #0xf800
-; NEON7-NEXT: vceq.i16 d16, d16, d17
-; NEON7-NEXT: vmvn d16, d16
+; NEON7-NEXT: vcgt.u16 d16, d16, d17
; NEON7-NEXT: vmov.u16 r0, d16[0]
; NEON7-NEXT: vmov.u16 r1, d16[1]
; NEON7-NEXT: vmov.u16 r2, d16[2]
-; NEON7-NEXT: pop {r4, pc}
+; NEON7-NEXT: bx lr
; NEON7-NEXT: .p2align 3
; NEON7-NEXT: @ %bb.1:
; NEON7-NEXT: .LCPI4_0:
; NEON7-NEXT: .short 0 @ 0x0
; NEON7-NEXT: .short 1 @ 0x1
; NEON7-NEXT: .short 2 @ 0x2
+; NEON7-NEXT: .zero 2
+; NEON7-NEXT: .LCPI4_1:
+; NEON7-NEXT: .short 683 @ 0x2ab
+; NEON7-NEXT: .short 1463 @ 0x5b7
+; NEON7-NEXT: .short 819 @ 0x333
+; NEON7-NEXT: .zero 2
+; NEON7-NEXT: .LCPI4_2:
+; NEON7-NEXT: .short 1 @ 0x1
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .LCPI4_3:
+; NEON7-NEXT: .short 9 @ 0x9
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .LCPI4_4:
+; NEON7-NEXT: .short 341 @ 0x155
+; NEON7-NEXT: .short 292 @ 0x124
+; NEON7-NEXT: .short 1 @ 0x1
; NEON7-NEXT: .short 0 @ 0x0
;
; NEON8-LABEL: test_urem_vec:
; NEON8: @ %bb.0:
-; NEON8-NEXT: push {r4, lr}
-; NEON8-NEXT: movw r3, #18725
-; NEON8-NEXT: bfc r1, #11, #21
-; NEON8-NEXT: movt r3, #9362
-; NEON8-NEXT: bfc r2, #11, #21
-; NEON8-NEXT: umull r3, r12, r1, r3
-; NEON8-NEXT: bfc r0, #11, #21
-; NEON8-NEXT: movw r3, #25663
-; NEON8-NEXT: movt r3, #160
-; NEON8-NEXT: umull r3, lr, r2, r3
-; NEON8-NEXT: vldr d17, .LCPI4_0
-; NEON8-NEXT: movw r3, #43691
-; NEON8-NEXT: movt r3, #43690
-; NEON8-NEXT: umull r3, r4, r0, r3
-; NEON8-NEXT: sub r3, r1, r12
-; NEON8-NEXT: add r3, r12, r3, lsr #1
-; NEON8-NEXT: lsr r12, r3, #2
-; NEON8-NEXT: sub r3, r2, lr
-; NEON8-NEXT: lsr r4, r4, #2
-; NEON8-NEXT: add r4, r4, r4, lsl #1
-; NEON8-NEXT: add r3, lr, r3, lsr #1
-; NEON8-NEXT: sub r0, r0, r4, lsl #1
-; NEON8-NEXT: lsr lr, r3, #10
-; NEON8-NEXT: movw r3, #2043
; NEON8-NEXT: vmov.16 d16[0], r0
-; NEON8-NEXT: sub r0, r12, r12, lsl #3
-; NEON8-NEXT: mls r2, lr, r3, r2
-; NEON8-NEXT: add r0, r1, r0
-; NEON8-NEXT: vmov.16 d16[1], r0
+; NEON8-NEXT: vldr d17, .LCPI4_0
+; NEON8-NEXT: vmov.16 d16[1], r1
+; NEON8-NEXT: vldr d19, .LCPI4_3
; NEON8-NEXT: vmov.16 d16[2], r2
+; NEON8-NEXT: vsub.i16 d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_1
+; NEON8-NEXT: vmul.i16 d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_2
+; NEON8-NEXT: vneg.s16 d17, d17
+; NEON8-NEXT: vshl.i16 d18, d16, #1
; NEON8-NEXT: vbic.i16 d16, #0xf800
-; NEON8-NEXT: vceq.i16 d16, d16, d17
-; NEON8-NEXT: vmvn d16, d16
+; NEON8-NEXT: vshl.u16 d16, d16, d17
+; NEON8-NEXT: vshl.u16 d17, d18, d19
+; NEON8-NEXT: vorr d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_4
+; NEON8-NEXT: vbic.i16 d16, #0xf800
+; NEON8-NEXT: vcgt.u16 d16, d16, d17
; NEON8-NEXT: vmov.u16 r0, d16[0]
; NEON8-NEXT: vmov.u16 r1, d16[1]
; NEON8-NEXT: vmov.u16 r2, d16[2]
-; NEON8-NEXT: pop {r4, pc}
+; NEON8-NEXT: bx lr
; NEON8-NEXT: .p2align 3
; NEON8-NEXT: @ %bb.1:
; NEON8-NEXT: .LCPI4_0:
; NEON8-NEXT: .short 0 @ 0x0
; NEON8-NEXT: .short 1 @ 0x1
; NEON8-NEXT: .short 2 @ 0x2
+; NEON8-NEXT: .zero 2
+; NEON8-NEXT: .LCPI4_1:
+; NEON8-NEXT: .short 683 @ 0x2ab
+; NEON8-NEXT: .short 1463 @ 0x5b7
+; NEON8-NEXT: .short 819 @ 0x333
+; NEON8-NEXT: .zero 2
+; NEON8-NEXT: .LCPI4_2:
+; NEON8-NEXT: .short 1 @ 0x1
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .LCPI4_3:
+; NEON8-NEXT: .short 9 @ 0x9
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .LCPI4_4:
+; NEON8-NEXT: .short 341 @ 0x155
+; NEON8-NEXT: .short 292 @ 0x124
+; NEON8-NEXT: .short 1 @ 0x1
; NEON8-NEXT: .short 0 @ 0x0
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
@@ -680,86 +624,150 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
define i1 @test_urem_larger(i63 %X) nounwind {
; ARM5-LABEL: test_urem_larger:
; ARM5: @ %bb.0:
-; ARM5-NEXT: push {r11, lr}
-; ARM5-NEXT: ldr r2, .LCPI5_0
-; ARM5-NEXT: bic r1, r1, #-2147483648
-; ARM5-NEXT: mov r3, #0
-; ARM5-NEXT: bl __umoddi3
-; ARM5-NEXT: orr r0, r0, r1
-; ARM5-NEXT: clz r0, r0
-; ARM5-NEXT: lsr r0, r0, #5
-; ARM5-NEXT: pop {r11, pc}
+; ARM5-NEXT: push {r4, lr}
+; ARM5-NEXT: ldr r12, .LCPI5_0
+; ARM5-NEXT: ldr r2, .LCPI5_1
+; ARM5-NEXT: umull r3, lr, r0, r12
+; ARM5-NEXT: mla r4, r0, r2, lr
+; ARM5-NEXT: mla r0, r1, r12, r4
+; ARM5-NEXT: bic r0, r0, #-2147483648
+; ARM5-NEXT: lsrs r0, r0, #1
+; ARM5-NEXT: rrx r1, r3
+; ARM5-NEXT: orr r0, r0, r3, lsl #30
+; ARM5-NEXT: ldr r3, .LCPI5_2
+; ARM5-NEXT: bic r2, r0, #-2147483648
+; ARM5-NEXT: mov r0, #0
+; ARM5-NEXT: subs r1, r1, r3
+; ARM5-NEXT: sbcs r1, r2, #1
+; ARM5-NEXT: movlo r0, #1
+; ARM5-NEXT: pop {r4, pc}
; ARM5-NEXT: .p2align 2
; ARM5-NEXT: @ %bb.1:
; ARM5-NEXT: .LCPI5_0:
-; ARM5-NEXT: .long 1234567890 @ 0x499602d2
+; ARM5-NEXT: .long 3456474841 @ 0xce059ed9
+; ARM5-NEXT: .LCPI5_1:
+; ARM5-NEXT: .long 790204738 @ 0x2f199142
+; ARM5-NEXT: .LCPI5_2:
+; ARM5-NEXT: .long 3175964122 @ 0xbd4d5dda
;
; ARM6-LABEL: test_urem_larger:
; ARM6: @ %bb.0:
; ARM6-NEXT: push {r11, lr}
-; ARM6-NEXT: ldr r2, .LCPI5_0
-; ARM6-NEXT: bic r1, r1, #-2147483648
-; ARM6-NEXT: mov r3, #0
-; ARM6-NEXT: bl __umoddi3
-; ARM6-NEXT: orr r0, r0, r1
-; ARM6-NEXT: clz r0, r0
-; ARM6-NEXT: lsr r0, r0, #5
+; ARM6-NEXT: ldr r12, .LCPI5_0
+; ARM6-NEXT: ldr r2, .LCPI5_1
+; ARM6-NEXT: umull r3, lr, r0, r12
+; ARM6-NEXT: mla r0, r0, r2, lr
+; ARM6-NEXT: mla r0, r1, r12, r0
+; ARM6-NEXT: bic r0, r0, #-2147483648
+; ARM6-NEXT: lsrs r0, r0, #1
+; ARM6-NEXT: rrx r1, r3
+; ARM6-NEXT: orr r0, r0, r3, lsl #30
+; ARM6-NEXT: ldr r3, .LCPI5_2
+; ARM6-NEXT: bic r2, r0, #-2147483648
+; ARM6-NEXT: mov r0, #0
+; ARM6-NEXT: subs r1, r1, r3
+; ARM6-NEXT: sbcs r1, r2, #1
+; ARM6-NEXT: movlo r0, #1
; ARM6-NEXT: pop {r11, pc}
; ARM6-NEXT: .p2align 2
; ARM6-NEXT: @ %bb.1:
; ARM6-NEXT: .LCPI5_0:
-; ARM6-NEXT: .long 1234567890 @ 0x499602d2
+; ARM6-NEXT: .long 3456474841 @ 0xce059ed9
+; ARM6-NEXT: .LCPI5_1:
+; ARM6-NEXT: .long 790204738 @ 0x2f199142
+; ARM6-NEXT: .LCPI5_2:
+; ARM6-NEXT: .long 3175964122 @ 0xbd4d5dda
;
; ARM7-LABEL: test_urem_larger:
; ARM7: @ %bb.0:
; ARM7-NEXT: push {r11, lr}
-; ARM7-NEXT: movw r2, #722
-; ARM7-NEXT: bic r1, r1, #-2147483648
-; ARM7-NEXT: movt r2, #18838
-; ARM7-NEXT: mov r3, #0
-; ARM7-NEXT: bl __umoddi3
-; ARM7-NEXT: orr r0, r0, r1
-; ARM7-NEXT: clz r0, r0
-; ARM7-NEXT: lsr r0, r0, #5
+; ARM7-NEXT: movw r12, #40665
+; ARM7-NEXT: movw r2, #37186
+; ARM7-NEXT: movt r12, #52741
+; ARM7-NEXT: movt r2, #12057
+; ARM7-NEXT: umull r3, lr, r0, r12
+; ARM7-NEXT: mla r0, r0, r2, lr
+; ARM7-NEXT: mla r0, r1, r12, r0
+; ARM7-NEXT: bic r0, r0, #-2147483648
+; ARM7-NEXT: lsrs r0, r0, #1
+; ARM7-NEXT: rrx r1, r3
+; ARM7-NEXT: orr r0, r0, r3, lsl #30
+; ARM7-NEXT: movw r3, #24026
+; ARM7-NEXT: bic r2, r0, #-2147483648
+; ARM7-NEXT: movt r3, #48461
+; ARM7-NEXT: subs r1, r1, r3
+; ARM7-NEXT: mov r0, #0
+; ARM7-NEXT: sbcs r1, r2, #1
+; ARM7-NEXT: movwlo r0, #1
; ARM7-NEXT: pop {r11, pc}
;
; ARM8-LABEL: test_urem_larger:
; ARM8: @ %bb.0:
; ARM8-NEXT: push {r11, lr}
-; ARM8-NEXT: movw r2, #722
-; ARM8-NEXT: bic r1, r1, #-2147483648
-; ARM8-NEXT: movt r2, #18838
-; ARM8-NEXT: mov r3, #0
-; ARM8-NEXT: bl __umoddi3
-; ARM8-NEXT: orr r0, r0, r1
-; ARM8-NEXT: clz r0, r0
-; ARM8-NEXT: lsr r0, r0, #5
+; ARM8-NEXT: movw r12, #40665
+; ARM8-NEXT: movw r2, #37186
+; ARM8-NEXT: movt r12, #52741
+; ARM8-NEXT: movt r2, #12057
+; ARM8-NEXT: umull r3, lr, r0, r12
+; ARM8-NEXT: mla r0, r0, r2, lr
+; ARM8-NEXT: mla r0, r1, r12, r0
+; ARM8-NEXT: bic r0, r0, #-2147483648
+; ARM8-NEXT: lsrs r0, r0, #1
+; ARM8-NEXT: rrx r1, r3
+; ARM8-NEXT: orr r0, r0, r3, lsl #30
+; ARM8-NEXT: movw r3, #24026
+; ARM8-NEXT: bic r2, r0, #-2147483648
+; ARM8-NEXT: movt r3, #48461
+; ARM8-NEXT: subs r1, r1, r3
+; ARM8-NEXT: mov r0, #0
+; ARM8-NEXT: sbcs r1, r2, #1
+; ARM8-NEXT: movwlo r0, #1
; ARM8-NEXT: pop {r11, pc}
;
; NEON7-LABEL: test_urem_larger:
; NEON7: @ %bb.0:
; NEON7-NEXT: push {r11, lr}
-; NEON7-NEXT: movw r2, #722
-; NEON7-NEXT: bic r1, r1, #-2147483648
-; NEON7-NEXT: movt r2, #18838
-; NEON7-NEXT: mov r3, #0
-; NEON7-NEXT: bl __umoddi3
-; NEON7-NEXT: orr r0, r0, r1
-; NEON7-NEXT: clz r0, r0
-; NEON7-NEXT: lsr r0, r0, #5
+; NEON7-NEXT: movw r12, #40665
+; NEON7-NEXT: movw r2, #37186
+; NEON7-NEXT: movt r12, #52741
+; NEON7-NEXT: movt r2, #12057
+; NEON7-NEXT: umull r3, lr, r0, r12
+; NEON7-NEXT: mla r0, r0, r2, lr
+; NEON7-NEXT: mla r0, r1, r12, r0
+; NEON7-NEXT: bic r0, r0, #-2147483648
+; NEON7-NEXT: lsrs r0, r0, #1
+; NEON7-NEXT: rrx r1, r3
+; NEON7-NEXT: orr r0, r0, r3, lsl #30
+; NEON7-NEXT: movw r3, #24026
+; NEON7-NEXT: bic r2, r0, #-2147483648
+; NEON7-NEXT: movt r3, #48461
+; NEON7-NEXT: subs r1, r1, r3
+; NEON7-NEXT: mov r0, #0
+; NEON7-NEXT: sbcs r1, r2, #1
+; NEON7-NEXT: movwlo r0, #1
; NEON7-NEXT: pop {r11, pc}
;
; NEON8-LABEL: test_urem_larger:
; NEON8: @ %bb.0:
; NEON8-NEXT: push {r11, lr}
-; NEON8-NEXT: movw r2, #722
-; NEON8-NEXT: bic r1, r1, #-2147483648
-; NEON8-NEXT: movt r2, #18838
-; NEON8-NEXT: mov r3, #0
-; NEON8-NEXT: bl __umoddi3
-; NEON8-NEXT: orr r0, r0, r1
-; NEON8-NEXT: clz r0, r0
-; NEON8-NEXT: lsr r0, r0, #5
+; NEON8-NEXT: movw r12, #40665
+; NEON8-NEXT: movw r2, #37186
+; NEON8-NEXT: movt r12, #52741
+; NEON8-NEXT: movt r2, #12057
+; NEON8-NEXT: umull r3, lr, r0, r12
+; NEON8-NEXT: mla r0, r0, r2, lr
+; NEON8-NEXT: mla r0, r1, r12, r0
+; NEON8-NEXT: bic r0, r0, #-2147483648
+; NEON8-NEXT: lsrs r0, r0, #1
+; NEON8-NEXT: rrx r1, r3
+; NEON8-NEXT: orr r0, r0, r3, lsl #30
+; NEON8-NEXT: movw r3, #24026
+; NEON8-NEXT: bic r2, r0, #-2147483648
+; NEON8-NEXT: movt r3, #48461
+; NEON8-NEXT: subs r1, r1, r3
+; NEON8-NEXT: mov r0, #0
+; NEON8-NEXT: sbcs r1, r2, #1
+; NEON8-NEXT: movwlo r0, #1
; NEON8-NEXT: pop {r11, pc}
%urem = urem i63 %X, 1234567890
%cmp = icmp eq i63 %urem, 0
diff --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
index 7c211b7f934c..adebd8f8a1d8 100644
--- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
@@ -5,32 +5,34 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; MIPSEL-LABEL: test_srem_odd:
; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 48986
+; MIPSEL-NEXT: lui $1, 8026
; MIPSEL-NEXT: ori $1, $1, 33099
-; MIPSEL-NEXT: sll $2, $4, 3
-; MIPSEL-NEXT: sra $2, $2, 3
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 330
-; MIPSEL-NEXT: ori $2, $2, 64874
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: lui $2, 41
+; MIPSEL-NEXT: ori $2, $2, 24493
; MIPSEL-NEXT: addu $1, $1, $2
-; MIPSEL-NEXT: lui $2, 661
-; MIPSEL-NEXT: ori $2, $2, 64213
+; MIPSEL-NEXT: lui $2, 8191
+; MIPSEL-NEXT: ori $2, $2, 65535
+; MIPSEL-NEXT: and $1, $1, $2
+; MIPSEL-NEXT: lui $2, 82
+; MIPSEL-NEXT: ori $2, $2, 48987
; MIPSEL-NEXT: jr $ra
; MIPSEL-NEXT: sltu $2, $1, $2
;
; MIPS64EL-LABEL: test_srem_odd:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 48986
+; MIPS64EL-NEXT: lui $1, 8026
; MIPS64EL-NEXT: ori $1, $1, 33099
; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: sll $2, $2, 3
-; MIPS64EL-NEXT: sra $2, $2, 3
; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 330
-; MIPS64EL-NEXT: ori $2, $2, 64874
+; MIPS64EL-NEXT: lui $2, 41
+; MIPS64EL-NEXT: ori $2, $2, 24493
; MIPS64EL-NEXT: addu $1, $1, $2
-; MIPS64EL-NEXT: lui $2, 661
-; MIPS64EL-NEXT: ori $2, $2, 64213
+; MIPS64EL-NEXT: lui $2, 8191
+; MIPS64EL-NEXT: ori $2, $2, 65535
+; MIPS64EL-NEXT: and $1, $1, $2
+; MIPS64EL-NEXT: lui $2, 82
+; MIPS64EL-NEXT: ori $2, $2, 48987
; MIPS64EL-NEXT: jr $ra
; MIPS64EL-NEXT: sltu $2, $1, $2
%srem = srem i29 %X, 99
diff --git a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
index d1f23523f339..e23351ee581a 100644
--- a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
@@ -5,26 +5,30 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; MIPSEL-LABEL: test_urem_odd:
; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 52428
-; MIPSEL-NEXT: ori $1, $1, 52429
-; MIPSEL-NEXT: andi $2, $4, 8191
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 13107
-; MIPSEL-NEXT: ori $2, $2, 13108
+; MIPSEL-NEXT: addiu $1, $zero, 3277
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: andi $1, $1, 8191
; MIPSEL-NEXT: jr $ra
-; MIPSEL-NEXT: sltu $2, $1, $2
+; MIPSEL-NEXT: sltiu $2, $1, 1639
;
; MIPS64EL-LABEL: test_urem_odd:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 52428
-; MIPS64EL-NEXT: ori $1, $1, 52429
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 8191
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 13107
-; MIPS64EL-NEXT: ori $2, $2, 13108
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $2, $2, $1
+; MIPS64EL-NEXT: sll $3, $1, 4
+; MIPS64EL-NEXT: subu $2, $3, $2
+; MIPS64EL-NEXT: sll $3, $1, 6
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $3, $1, 8
+; MIPS64EL-NEXT: addu $2, $3, $2
+; MIPS64EL-NEXT: sll $3, $1, 10
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $1, $1, 12
+; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: andi $1, $1, 8191
; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltu $2, $1, $2
+; MIPS64EL-NEXT: sltiu $2, $1, 1639
%urem = urem i13 %X, 5
%cmp = icmp eq i13 %urem, 0
ret i1 %cmp
@@ -33,40 +37,40 @@ define i1 @test_urem_odd(i13 %X) nounwind {
define i1 @test_urem_even(i27 %X) nounwind {
; MIPSEL-LABEL: test_urem_even:
; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 2047
-; MIPSEL-NEXT: ori $1, $1, 65535
-; MIPSEL-NEXT: and $1, $4, $1
-; MIPSEL-NEXT: srl $2, $1, 1
-; MIPSEL-NEXT: lui $3, 37449
-; MIPSEL-NEXT: ori $3, $3, 9363
-; MIPSEL-NEXT: multu $2, $3
-; MIPSEL-NEXT: mfhi $2
-; MIPSEL-NEXT: srl $2, $2, 2
-; MIPSEL-NEXT: sll $3, $2, 4
-; MIPSEL-NEXT: sll $2, $2, 1
-; MIPSEL-NEXT: subu $2, $2, $3
-; MIPSEL-NEXT: addu $1, $1, $2
+; MIPSEL-NEXT: lui $1, 1755
+; MIPSEL-NEXT: ori $1, $1, 28087
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: sll $2, $1, 26
+; MIPSEL-NEXT: lui $3, 2047
+; MIPSEL-NEXT: ori $4, $3, 65534
+; MIPSEL-NEXT: and $1, $1, $4
+; MIPSEL-NEXT: srl $1, $1, 1
+; MIPSEL-NEXT: or $1, $1, $2
+; MIPSEL-NEXT: ori $2, $3, 65535
+; MIPSEL-NEXT: and $1, $1, $2
+; MIPSEL-NEXT: lui $2, 146
+; MIPSEL-NEXT: ori $2, $2, 18725
; MIPSEL-NEXT: jr $ra
-; MIPSEL-NEXT: sltiu $2, $1, 1
+; MIPSEL-NEXT: sltu $2, $1, $2
;
; MIPS64EL-LABEL: test_urem_even:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 2047
-; MIPS64EL-NEXT: ori $1, $1, 65535
+; MIPS64EL-NEXT: lui $1, 1755
+; MIPS64EL-NEXT: ori $1, $1, 28087
; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: and $1, $2, $1
-; MIPS64EL-NEXT: srl $2, $1, 1
-; MIPS64EL-NEXT: lui $3, 37449
-; MIPS64EL-NEXT: ori $3, $3, 9363
-; MIPS64EL-NEXT: multu $2, $3
-; MIPS64EL-NEXT: mfhi $2
-; MIPS64EL-NEXT: srl $2, $2, 2
-; MIPS64EL-NEXT: sll $3, $2, 4
-; MIPS64EL-NEXT: sll $2, $2, 1
-; MIPS64EL-NEXT: subu $2, $2, $3
-; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: mul $1, $2, $1
+; MIPS64EL-NEXT: sll $2, $1, 26
+; MIPS64EL-NEXT: lui $3, 2047
+; MIPS64EL-NEXT: ori $4, $3, 65534
+; MIPS64EL-NEXT: and $1, $1, $4
+; MIPS64EL-NEXT: srl $1, $1, 1
+; MIPS64EL-NEXT: or $1, $1, $2
+; MIPS64EL-NEXT: ori $2, $3, 65535
+; MIPS64EL-NEXT: lui $3, 146
+; MIPS64EL-NEXT: and $1, $1, $2
+; MIPS64EL-NEXT: ori $2, $3, 18725
; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltiu $2, $1, 1
+; MIPS64EL-NEXT: sltu $2, $1, $2
%urem = urem i27 %X, 14
%cmp = icmp eq i27 %urem, 0
ret i1 %cmp
@@ -75,24 +79,22 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; MIPSEL-LABEL: test_urem_odd_setne:
; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 52428
-; MIPSEL-NEXT: ori $1, $1, 52429
-; MIPSEL-NEXT: andi $2, $4, 15
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 13107
-; MIPSEL-NEXT: ori $2, $2, 13107
+; MIPSEL-NEXT: sll $1, $4, 1
+; MIPSEL-NEXT: addu $1, $1, $4
+; MIPSEL-NEXT: negu $1, $1
+; MIPSEL-NEXT: andi $1, $1, 15
+; MIPSEL-NEXT: addiu $2, $zero, 3
; MIPSEL-NEXT: jr $ra
; MIPSEL-NEXT: sltu $2, $2, $1
;
; MIPS64EL-LABEL: test_urem_odd_setne:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 52428
-; MIPS64EL-NEXT: ori $1, $1, 52429
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 15
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 13107
-; MIPS64EL-NEXT: ori $2, $2, 13107
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $1, $2, $1
+; MIPS64EL-NEXT: negu $1, $1
+; MIPS64EL-NEXT: andi $1, $1, 15
+; MIPS64EL-NEXT: addiu $2, $zero, 3
; MIPS64EL-NEXT: jr $ra
; MIPS64EL-NEXT: sltu $2, $2, $1
%urem = urem i4 %X, 5
@@ -103,26 +105,34 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; MIPSEL-LABEL: test_urem_negative_odd:
; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 43302
-; MIPSEL-NEXT: ori $1, $1, 57651
-; MIPSEL-NEXT: andi $2, $4, 511
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 129
-; MIPSEL-NEXT: ori $2, $2, 17191
+; MIPSEL-NEXT: sll $1, $4, 1
+; MIPSEL-NEXT: addu $1, $1, $4
+; MIPSEL-NEXT: sll $2, $4, 4
+; MIPSEL-NEXT: subu $1, $1, $2
+; MIPSEL-NEXT: sll $2, $4, 6
+; MIPSEL-NEXT: addu $1, $2, $1
+; MIPSEL-NEXT: sll $2, $4, 8
+; MIPSEL-NEXT: addu $1, $2, $1
+; MIPSEL-NEXT: andi $1, $1, 511
+; MIPSEL-NEXT: addiu $2, $zero, 1
; MIPSEL-NEXT: jr $ra
; MIPSEL-NEXT: sltu $2, $2, $1
;
; MIPS64EL-LABEL: test_urem_negative_odd:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 43302
-; MIPS64EL-NEXT: ori $1, $1, 57651
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 511
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 129
-; MIPS64EL-NEXT: ori $2, $2, 17191
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $2, $2, $1
+; MIPS64EL-NEXT: sll $3, $1, 4
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $3, $1, 6
+; MIPS64EL-NEXT: addu $2, $3, $2
+; MIPS64EL-NEXT: sll $1, $1, 8
+; MIPS64EL-NEXT: addiu $3, $zero, 1
+; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: andi $1, $1, 511
; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltu $2, $2, $1
+; MIPS64EL-NEXT: sltu $2, $3, $1
%urem = urem i9 %X, -5
%cmp = icmp ne i9 %urem, 0
ret i1 %cmp
@@ -142,37 +152,71 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
; MIPSEL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
; MIPSEL-NEXT: move $7, $6
; MIPSEL-NEXT: move $6, $5
-; MIPSEL-NEXT: lui $1, 18838
-; MIPSEL-NEXT: ori $1, $1, 722
-; MIPSEL-NEXT: sw $1, 28($sp)
-; MIPSEL-NEXT: sw $zero, 24($sp)
-; MIPSEL-NEXT: sw $zero, 20($sp)
+; MIPSEL-NEXT: move $5, $4
+; MIPSEL-NEXT: lui $1, 12057
+; MIPSEL-NEXT: ori $1, $1, 37186
+; MIPSEL-NEXT: lui $2, 52741
+; MIPSEL-NEXT: ori $2, $2, 40665
+; MIPSEL-NEXT: sw $2, 28($sp)
+; MIPSEL-NEXT: sw $1, 24($sp)
+; MIPSEL-NEXT: addiu $1, $zero, 2
+; MIPSEL-NEXT: sw $1, 20($sp)
; MIPSEL-NEXT: sw $zero, 16($sp)
-; MIPSEL-NEXT: andi $5, $4, 3
-; MIPSEL-NEXT: jal __umodti3
+; MIPSEL-NEXT: jal __multi3
; MIPSEL-NEXT: addiu $4, $zero, 0
-; MIPSEL-NEXT: or $1, $4, $2
-; MIPSEL-NEXT: or $2, $5, $3
+; MIPSEL-NEXT: sll $1, $4, 31
+; MIPSEL-NEXT: srl $2, $5, 1
; MIPSEL-NEXT: or $1, $2, $1
-; MIPSEL-NEXT: sltiu $2, $1, 1
+; MIPSEL-NEXT: lui $2, 60010
+; MIPSEL-NEXT: ori $2, $2, 61135
+; MIPSEL-NEXT: sltu $1, $1, $2
+; MIPSEL-NEXT: srl $2, $4, 1
+; MIPSEL-NEXT: andi $3, $3, 3
+; MIPSEL-NEXT: sll $4, $3, 31
+; MIPSEL-NEXT: or $4, $2, $4
+; MIPSEL-NEXT: sltiu $2, $4, 13
+; MIPSEL-NEXT: xori $4, $4, 13
+; MIPSEL-NEXT: movz $2, $1, $4
+; MIPSEL-NEXT: sll $1, $5, 1
+; MIPSEL-NEXT: srl $3, $3, 1
+; MIPSEL-NEXT: or $1, $3, $1
+; MIPSEL-NEXT: andi $1, $1, 3
+; MIPSEL-NEXT: movn $2, $zero, $1
; MIPSEL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
; MIPSEL-NEXT: jr $ra
; MIPSEL-NEXT: addiu $sp, $sp, 40
;
; MIPS64EL-LABEL: test_urem_oversized:
; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: daddiu $sp, $sp, -16
-; MIPS64EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64EL-NEXT: andi $5, $5, 3
-; MIPS64EL-NEXT: lui $1, 18838
-; MIPS64EL-NEXT: ori $6, $1, 722
-; MIPS64EL-NEXT: jal __umodti3
-; MIPS64EL-NEXT: daddiu $7, $zero, 0
-; MIPS64EL-NEXT: or $1, $2, $3
-; MIPS64EL-NEXT: sltiu $2, $1, 1
-; MIPS64EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64EL-NEXT: lui $1, 6029
+; MIPS64EL-NEXT: daddiu $1, $1, -14175
+; MIPS64EL-NEXT: dsll $1, $1, 16
+; MIPS64EL-NEXT: daddiu $1, $1, 26371
+; MIPS64EL-NEXT: dsll $1, $1, 17
+; MIPS64EL-NEXT: daddiu $1, $1, -24871
+; MIPS64EL-NEXT: dmult $5, $1
+; MIPS64EL-NEXT: mflo $2
+; MIPS64EL-NEXT: dmultu $4, $1
+; MIPS64EL-NEXT: mflo $1
+; MIPS64EL-NEXT: mfhi $3
+; MIPS64EL-NEXT: lui $5, 14
+; MIPS64EL-NEXT: daddiu $5, $5, -5525
+; MIPS64EL-NEXT: dsll $5, $5, 16
+; MIPS64EL-NEXT: daddiu $5, $5, -4401
+; MIPS64EL-NEXT: dsll $4, $4, 1
+; MIPS64EL-NEXT: daddu $3, $3, $4
+; MIPS64EL-NEXT: daddu $2, $3, $2
+; MIPS64EL-NEXT: andi $3, $2, 3
+; MIPS64EL-NEXT: dsll $2, $3, 63
+; MIPS64EL-NEXT: dsrl $4, $1, 1
+; MIPS64EL-NEXT: or $2, $4, $2
+; MIPS64EL-NEXT: sltu $2, $2, $5
+; MIPS64EL-NEXT: dsrl $3, $3, 1
+; MIPS64EL-NEXT: dsll $1, $1, 1
+; MIPS64EL-NEXT: or $1, $3, $1
+; MIPS64EL-NEXT: andi $1, $1, 3
; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: daddiu $sp, $sp, 16
+; MIPS64EL-NEXT: movn $2, $zero, $1
%urem = urem i66 %X, 1234567890
%cmp = icmp eq i66 %urem, 0
ret i1 %cmp
diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
index b2de90ed3344..fc655df6030b 100644
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -5,36 +5,37 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; PPC-LABEL: test_srem_odd:
; PPC: # %bb.0:
-; PPC-NEXT: lis 4, -23170
-; PPC-NEXT: slwi 3, 3, 3
-; PPC-NEXT: ori 4, 4, 46339
-; PPC-NEXT: srawi 3, 3, 3
-; PPC-NEXT: mulhw 4, 3, 4
-; PPC-NEXT: add 4, 4, 3
-; PPC-NEXT: srwi 5, 4, 31
-; PPC-NEXT: srawi 4, 4, 6
-; PPC-NEXT: add 4, 4, 5
-; PPC-NEXT: mulli 4, 4, 99
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: lis 4, 8026
+; PPC-NEXT: ori 4, 4, 33099
+; PPC-NEXT: mullw 3, 3, 4
+; PPC-NEXT: addi 3, 3, 24493
+; PPC-NEXT: lis 4, 82
+; PPC-NEXT: addis 3, 3, 41
+; PPC-NEXT: ori 4, 4, 48987
+; PPC-NEXT: clrlwi 3, 3, 3
+; PPC-NEXT: cmplw 3, 4
+; PPC-NEXT: li 3, 0
+; PPC-NEXT: li 4, 1
+; PPC-NEXT: bc 12, 0, .LBB0_1
+; PPC-NEXT: blr
+; PPC-NEXT: .LBB0_1:
+; PPC-NEXT: addi 3, 4, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_srem_odd:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, -23170
-; PPC64LE-NEXT: slwi 3, 3, 3
-; PPC64LE-NEXT: srawi 3, 3, 3
-; PPC64LE-NEXT: ori 4, 4, 46339
-; PPC64LE-NEXT: mulhw 4, 3, 4
-; PPC64LE-NEXT: add 4, 4, 3
-; PPC64LE-NEXT: srwi 5, 4, 31
-; PPC64LE-NEXT: srawi 4, 4, 6
-; PPC64LE-NEXT: add 4, 4, 5
-; PPC64LE-NEXT: mulli 4, 4, 99
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: lis 4, 8026
+; PPC64LE-NEXT: ori 4, 4, 33099
+; PPC64LE-NEXT: mullw 3, 3, 4
+; PPC64LE-NEXT: lis 4, 82
+; PPC64LE-NEXT: ori 4, 4, 48987
+; PPC64LE-NEXT: addi 3, 3, 24493
+; PPC64LE-NEXT: addis 3, 3, 41
+; PPC64LE-NEXT: clrlwi 3, 3, 3
+; PPC64LE-NEXT: cmplw 3, 4
+; PPC64LE-NEXT: li 3, 0
+; PPC64LE-NEXT: li 4, 1
+; PPC64LE-NEXT: isellt 3, 4, 3
; PPC64LE-NEXT: blr
%srem = srem i29 %X, 99
%cmp = icmp eq i29 %srem, 0
diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
index 40d402d424e6..ef73fa686b3c 100644
--- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
@@ -5,29 +5,24 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; PPC-LABEL: test_urem_odd:
; PPC: # %bb.0:
-; PPC-NEXT: lis 4, -13108
+; PPC-NEXT: mulli 3, 3, 3277
; PPC-NEXT: clrlwi 3, 3, 19
-; PPC-NEXT: ori 4, 4, 52429
-; PPC-NEXT: mulhwu 4, 3, 4
-; PPC-NEXT: srwi 4, 4, 2
-; PPC-NEXT: mulli 4, 4, 5
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: li 4, 0
+; PPC-NEXT: cmplwi 3, 1639
+; PPC-NEXT: li 3, 1
+; PPC-NEXT: bclr 12, 0, 0
+; PPC-NEXT: # %bb.1:
+; PPC-NEXT: ori 3, 4, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_odd:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, -13108
+; PPC64LE-NEXT: mulli 3, 3, 3277
+; PPC64LE-NEXT: li 4, 0
; PPC64LE-NEXT: clrlwi 3, 3, 19
-; PPC64LE-NEXT: ori 4, 4, 52429
-; PPC64LE-NEXT: mulhwu 4, 3, 4
-; PPC64LE-NEXT: rlwinm 5, 4, 0, 0, 29
-; PPC64LE-NEXT: srwi 4, 4, 2
-; PPC64LE-NEXT: add 4, 4, 5
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: cmplwi 3, 1639
+; PPC64LE-NEXT: li 3, 1
+; PPC64LE-NEXT: isellt 3, 3, 4
; PPC64LE-NEXT: blr
%urem = urem i13 %X, 5
%cmp = icmp eq i13 %urem, 0
@@ -37,30 +32,35 @@ define i1 @test_urem_odd(i13 %X) nounwind {
define i1 @test_urem_even(i27 %X) nounwind {
; PPC-LABEL: test_urem_even:
; PPC: # %bb.0:
-; PPC-NEXT: lis 4, -28087
-; PPC-NEXT: rlwinm 5, 3, 31, 6, 31
-; PPC-NEXT: ori 4, 4, 9363
-; PPC-NEXT: mulhwu 4, 5, 4
-; PPC-NEXT: srwi 4, 4, 2
-; PPC-NEXT: clrlwi 3, 3, 5
-; PPC-NEXT: mulli 4, 4, 14
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: lis 4, 1755
+; PPC-NEXT: ori 4, 4, 28087
+; PPC-NEXT: mullw 3, 3, 4
+; PPC-NEXT: rlwinm 4, 3, 31, 6, 31
+; PPC-NEXT: rlwimi 4, 3, 26, 5, 5
+; PPC-NEXT: lis 3, 146
+; PPC-NEXT: ori 3, 3, 18725
+; PPC-NEXT: cmplw 4, 3
+; PPC-NEXT: li 3, 0
+; PPC-NEXT: li 4, 1
+; PPC-NEXT: bc 12, 0, .LBB1_1
+; PPC-NEXT: blr
+; PPC-NEXT: .LBB1_1:
+; PPC-NEXT: addi 3, 4, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_even:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, -28087
+; PPC64LE-NEXT: lis 4, 1755
+; PPC64LE-NEXT: ori 4, 4, 28087
+; PPC64LE-NEXT: mullw 3, 3, 4
+; PPC64LE-NEXT: lis 4, 146
; PPC64LE-NEXT: rlwinm 5, 3, 31, 6, 31
-; PPC64LE-NEXT: clrlwi 3, 3, 5
-; PPC64LE-NEXT: ori 4, 4, 9363
-; PPC64LE-NEXT: mulhwu 4, 5, 4
-; PPC64LE-NEXT: srwi 4, 4, 2
-; PPC64LE-NEXT: mulli 4, 4, 14
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: rlwimi 5, 3, 26, 5, 5
+; PPC64LE-NEXT: ori 3, 4, 18725
+; PPC64LE-NEXT: li 4, 1
+; PPC64LE-NEXT: cmplw 5, 3
+; PPC64LE-NEXT: li 3, 0
+; PPC64LE-NEXT: isellt 3, 4, 3
; PPC64LE-NEXT: blr
%urem = urem i27 %X, 14
%cmp = icmp eq i27 %urem, 0
@@ -70,30 +70,26 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; PPC-LABEL: test_urem_odd_setne:
; PPC: # %bb.0:
-; PPC-NEXT: lis 4, -13108
+; PPC-NEXT: mulli 3, 3, 13
; PPC-NEXT: clrlwi 3, 3, 28
-; PPC-NEXT: ori 4, 4, 52429
-; PPC-NEXT: mulhwu 4, 3, 4
-; PPC-NEXT: srwi 4, 4, 2
-; PPC-NEXT: mulli 4, 4, 5
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: not 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: li 4, 0
+; PPC-NEXT: cmplwi 3, 3
+; PPC-NEXT: li 3, 1
+; PPC-NEXT: bclr 12, 1, 0
+; PPC-NEXT: # %bb.1:
+; PPC-NEXT: ori 3, 4, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_odd_setne:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, -13108
+; PPC64LE-NEXT: slwi 5, 3, 1
+; PPC64LE-NEXT: li 4, 0
+; PPC64LE-NEXT: add 3, 3, 5
+; PPC64LE-NEXT: neg 3, 3
; PPC64LE-NEXT: clrlwi 3, 3, 28
-; PPC64LE-NEXT: ori 4, 4, 52429
-; PPC64LE-NEXT: mulhwu 4, 3, 4
-; PPC64LE-NEXT: srwi 4, 4, 2
-; PPC64LE-NEXT: rlwimi 4, 4, 2, 28, 29
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: not 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: cmplwi 3, 3
+; PPC64LE-NEXT: li 3, 1
+; PPC64LE-NEXT: iselgt 3, 3, 4
; PPC64LE-NEXT: blr
%urem = urem i4 %X, 5
%cmp = icmp ne i4 %urem, 0
@@ -103,30 +99,24 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; PPC-LABEL: test_urem_negative_odd:
; PPC: # %bb.0:
-; PPC-NEXT: lis 4, 8272
+; PPC-NEXT: mulli 3, 3, 307
; PPC-NEXT: clrlwi 3, 3, 23
-; PPC-NEXT: ori 4, 4, 51705
-; PPC-NEXT: mulhwu 4, 3, 4
-; PPC-NEXT: srwi 4, 4, 6
-; PPC-NEXT: mulli 4, 4, 507
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: not 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: li 4, 0
+; PPC-NEXT: cmplwi 3, 1
+; PPC-NEXT: li 3, 1
+; PPC-NEXT: bclr 12, 1, 0
+; PPC-NEXT: # %bb.1:
+; PPC-NEXT: ori 3, 4, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_negative_odd:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, 8272
+; PPC64LE-NEXT: mulli 3, 3, 307
+; PPC64LE-NEXT: li 4, 0
; PPC64LE-NEXT: clrlwi 3, 3, 23
-; PPC64LE-NEXT: ori 4, 4, 51705
-; PPC64LE-NEXT: mulhwu 4, 3, 4
-; PPC64LE-NEXT: srwi 4, 4, 6
-; PPC64LE-NEXT: mulli 4, 4, 507
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: not 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: cmplwi 3, 1
+; PPC64LE-NEXT: li 3, 1
+; PPC64LE-NEXT: iselgt 3, 3, 4
; PPC64LE-NEXT: blr
%urem = urem i9 %X, -5
%cmp = icmp ne i9 %urem, 0
@@ -136,103 +126,79 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; PPC-LABEL: test_urem_vec:
; PPC: # %bb.0:
-; PPC-NEXT: lis 6, -31983
-; PPC-NEXT: clrlwi 5, 5, 21
-; PPC-NEXT: ori 6, 6, 60211
-; PPC-NEXT: mullw 5, 5, 6
-; PPC-NEXT: lis 6, 32
-; PPC-NEXT: addi 5, 5, 10650
-; PPC-NEXT: ori 6, 6, 5132
-; PPC-NEXT: addis 5, 5, -1572
-; PPC-NEXT: cmplw 5, 6
-; PPC-NEXT: lis 6, -18725
+; PPC-NEXT: mulli 3, 3, 683
+; PPC-NEXT: rlwinm 7, 3, 31, 22, 31
+; PPC-NEXT: rlwimi 7, 3, 10, 21, 21
+; PPC-NEXT: mulli 5, 5, 819
+; PPC-NEXT: li 6, 0
+; PPC-NEXT: cmplwi 7, 341
+; PPC-NEXT: mulli 3, 4, 1463
+; PPC-NEXT: addi 4, 5, -1638
+; PPC-NEXT: addi 3, 3, -1463
; PPC-NEXT: clrlwi 4, 4, 21
-; PPC-NEXT: ori 6, 6, 28087
-; PPC-NEXT: lis 5, -21846
-; PPC-NEXT: mullw 4, 4, 6
-; PPC-NEXT: lis 6, 9362
; PPC-NEXT: clrlwi 3, 3, 21
-; PPC-NEXT: ori 5, 5, 43691
-; PPC-NEXT: addi 4, 4, -28087
-; PPC-NEXT: ori 6, 6, 18724
-; PPC-NEXT: mulhwu 5, 3, 5
-; PPC-NEXT: addis 4, 4, 18725
-; PPC-NEXT: cmplw 1, 4, 6
-; PPC-NEXT: srwi 4, 5, 2
-; PPC-NEXT: li 6, 0
-; PPC-NEXT: li 7, 1
-; PPC-NEXT: mulli 4, 4, 6
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: not 3, 3
-; PPC-NEXT: bc 12, 5, .LBB4_2
+; PPC-NEXT: cmplwi 1, 4, 1
+; PPC-NEXT: cmplwi 5, 3, 292
+; PPC-NEXT: li 3, 1
+; PPC-NEXT: bc 12, 21, .LBB4_2
; PPC-NEXT: # %bb.1:
; PPC-NEXT: ori 4, 6, 0
; PPC-NEXT: b .LBB4_3
; PPC-NEXT: .LBB4_2:
-; PPC-NEXT: addi 4, 7, 0
+; PPC-NEXT: addi 4, 3, 0
; PPC-NEXT: .LBB4_3:
-; PPC-NEXT: bc 12, 1, .LBB4_5
+; PPC-NEXT: bc 12, 5, .LBB4_5
; PPC-NEXT: # %bb.4:
; PPC-NEXT: ori 5, 6, 0
; PPC-NEXT: b .LBB4_6
; PPC-NEXT: .LBB4_5:
-; PPC-NEXT: addi 5, 7, 0
+; PPC-NEXT: addi 5, 3, 0
; PPC-NEXT: .LBB4_6:
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: bclr 12, 1, 0
+; PPC-NEXT: # %bb.7:
+; PPC-NEXT: ori 3, 6, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_vec:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 6, 9362
-; PPC64LE-NEXT: lis 7, -21846
-; PPC64LE-NEXT: clrlwi 4, 4, 21
-; PPC64LE-NEXT: clrlwi 3, 3, 21
-; PPC64LE-NEXT: lis 8, 160
-; PPC64LE-NEXT: clrlwi 5, 5, 21
-; PPC64LE-NEXT: ori 6, 6, 18725
-; PPC64LE-NEXT: ori 7, 7, 43691
-; PPC64LE-NEXT: ori 8, 8, 25663
-; PPC64LE-NEXT: vspltisw 4, -11
-; PPC64LE-NEXT: mulhwu 6, 4, 6
-; PPC64LE-NEXT: mulhwu 7, 3, 7
-; PPC64LE-NEXT: mulhwu 8, 5, 8
-; PPC64LE-NEXT: sub 9, 4, 6
-; PPC64LE-NEXT: srwi 7, 7, 2
-; PPC64LE-NEXT: srwi 9, 9, 1
-; PPC64LE-NEXT: mulli 7, 7, 6
-; PPC64LE-NEXT: add 6, 9, 6
-; PPC64LE-NEXT: srwi 9, 6, 2
-; PPC64LE-NEXT: rlwinm 6, 6, 1, 0, 28
-; PPC64LE-NEXT: sub 6, 9, 6
-; PPC64LE-NEXT: sub 9, 5, 8
-; PPC64LE-NEXT: add 4, 4, 6
-; PPC64LE-NEXT: srwi 6, 9, 1
-; PPC64LE-NEXT: sub 3, 3, 7
-; PPC64LE-NEXT: add 6, 6, 8
-; PPC64LE-NEXT: mtvsrwz 34, 4
-; PPC64LE-NEXT: srwi 4, 6, 10
-; PPC64LE-NEXT: mtvsrwz 35, 3
-; PPC64LE-NEXT: mulli 3, 4, 2043
-; PPC64LE-NEXT: addis 4, 2, .LCPI4_0 at toc@ha
-; PPC64LE-NEXT: vmrghw 2, 2, 3
-; PPC64LE-NEXT: addi 4, 4, .LCPI4_0 at toc@l
-; PPC64LE-NEXT: lvx 3, 0, 4
-; PPC64LE-NEXT: sub 3, 5, 3
-; PPC64LE-NEXT: mtvsrwz 37, 3
+; PPC64LE-NEXT: mtvsrwz 34, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_0 at toc@ha
+; PPC64LE-NEXT: mtvsrwz 35, 4
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_0 at toc@l
+; PPC64LE-NEXT: addis 4, 2, .LCPI4_2 at toc@ha
+; PPC64LE-NEXT: mtvsrwz 36, 5
+; PPC64LE-NEXT: vmrghw 2, 3, 2
+; PPC64LE-NEXT: lvx 3, 0, 3
; PPC64LE-NEXT: addis 3, 2, .LCPI4_1 at toc@ha
; PPC64LE-NEXT: addi 3, 3, .LCPI4_1 at toc@l
-; PPC64LE-NEXT: vperm 2, 5, 2, 3
-; PPC64LE-NEXT: vsrw 3, 4, 4
+; PPC64LE-NEXT: vperm 2, 4, 2, 3
+; PPC64LE-NEXT: vspltisw 3, -11
; PPC64LE-NEXT: lvx 4, 0, 3
-; PPC64LE-NEXT: xxland 34, 34, 35
-; PPC64LE-NEXT: vcmpequw 2, 2, 4
-; PPC64LE-NEXT: xxlnor 0, 34, 34
-; PPC64LE-NEXT: xxswapd 1, 0
-; PPC64LE-NEXT: xxsldwi 2, 0, 0, 1
-; PPC64LE-NEXT: mffprwz 5, 0
-; PPC64LE-NEXT: mffprwz 3, 1
-; PPC64LE-NEXT: mffprwz 4, 2
+; PPC64LE-NEXT: addi 3, 4, .LCPI4_2 at toc@l
+; PPC64LE-NEXT: addis 4, 2, .LCPI4_4 at toc@ha
+; PPC64LE-NEXT: lvx 5, 0, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_3 at toc@ha
+; PPC64LE-NEXT: addi 4, 4, .LCPI4_4 at toc@l
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_3 at toc@l
+; PPC64LE-NEXT: vsrw 3, 3, 3
+; PPC64LE-NEXT: vsubuwm 2, 2, 4
+; PPC64LE-NEXT: lvx 4, 0, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_5 at toc@ha
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_5 at toc@l
+; PPC64LE-NEXT: vmuluwm 2, 2, 5
+; PPC64LE-NEXT: lvx 5, 0, 4
+; PPC64LE-NEXT: xxland 32, 34, 35
+; PPC64LE-NEXT: vslw 2, 2, 4
+; PPC64LE-NEXT: vsrw 4, 0, 5
+; PPC64LE-NEXT: xxlor 0, 36, 34
+; PPC64LE-NEXT: lvx 2, 0, 3
+; PPC64LE-NEXT: xxland 35, 0, 35
+; PPC64LE-NEXT: vcmpgtuw 2, 3, 2
+; PPC64LE-NEXT: xxswapd 0, 34
+; PPC64LE-NEXT: xxsldwi 1, 34, 34, 1
+; PPC64LE-NEXT: mfvsrwz 5, 34
+; PPC64LE-NEXT: mffprwz 3, 0
+; PPC64LE-NEXT: mffprwz 4, 1
; PPC64LE-NEXT: blr
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
@@ -247,19 +213,35 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
; PPC-NEXT: stwu 1, -16(1)
; PPC-NEXT: mr 6, 5
; PPC-NEXT: mr 5, 4
-; PPC-NEXT: clrlwi 4, 3, 30
-; PPC-NEXT: lis 3, 18838
-; PPC-NEXT: ori 10, 3, 722
+; PPC-NEXT: mr 4, 3
+; PPC-NEXT: lis 3, 12057
+; PPC-NEXT: lis 7, -12795
+; PPC-NEXT: ori 9, 3, 37186
+; PPC-NEXT: ori 10, 7, 40665
; PPC-NEXT: li 3, 0
; PPC-NEXT: li 7, 0
-; PPC-NEXT: li 8, 0
-; PPC-NEXT: li 9, 0
-; PPC-NEXT: bl __umodti3
-; PPC-NEXT: or 3, 5, 3
-; PPC-NEXT: or 4, 6, 4
-; PPC-NEXT: or 3, 4, 3
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: li 8, 2
+; PPC-NEXT: bl __multi3
+; PPC-NEXT: rotlwi 7, 6, 31
+; PPC-NEXT: lis 3, -5526
+; PPC-NEXT: rlwimi 7, 5, 31, 0, 0
+; PPC-NEXT: rotlwi 5, 5, 31
+; PPC-NEXT: rlwimi 5, 4, 31, 0, 0
+; PPC-NEXT: ori 3, 3, 61135
+; PPC-NEXT: cmplwi 1, 5, 13
+; PPC-NEXT: cmplw 7, 3
+; PPC-NEXT: rlwinm 4, 4, 31, 31, 31
+; PPC-NEXT: crand 20, 6, 0
+; PPC-NEXT: crandc 21, 4, 6
+; PPC-NEXT: rlwimi. 4, 6, 1, 30, 30
+; PPC-NEXT: cror 20, 20, 21
+; PPC-NEXT: crnand 20, 2, 20
+; PPC-NEXT: li 3, 1
+; PPC-NEXT: bc 12, 20, .LBB5_1
+; PPC-NEXT: b .LBB5_2
+; PPC-NEXT: .LBB5_1:
+; PPC-NEXT: li 3, 0
+; PPC-NEXT: .LBB5_2:
; PPC-NEXT: lwz 0, 20(1)
; PPC-NEXT: addi 1, 1, 16
; PPC-NEXT: mtlr 0
@@ -267,21 +249,28 @@ define i1 @test_urem_oversized(i66 %X) nounwind {
;
; PPC64LE-LABEL: test_urem_oversized:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: mflr 0
-; PPC64LE-NEXT: std 0, 16(1)
-; PPC64LE-NEXT: stdu 1, -32(1)
-; PPC64LE-NEXT: lis 5, 18838
-; PPC64LE-NEXT: clrldi 4, 4, 62
-; PPC64LE-NEXT: li 6, 0
-; PPC64LE-NEXT: ori 5, 5, 722
-; PPC64LE-NEXT: bl __umodti3
-; PPC64LE-NEXT: nop
-; PPC64LE-NEXT: or 3, 3, 4
-; PPC64LE-NEXT: cntlzd 3, 3
-; PPC64LE-NEXT: rldicl 3, 3, 58, 63
-; PPC64LE-NEXT: addi 1, 1, 32
-; PPC64LE-NEXT: ld 0, 16(1)
-; PPC64LE-NEXT: mtlr 0
+; PPC64LE-NEXT: lis 5, 6028
+; PPC64LE-NEXT: ori 5, 5, 51361
+; PPC64LE-NEXT: rldic 5, 5, 33, 2
+; PPC64LE-NEXT: oris 5, 5, 52741
+; PPC64LE-NEXT: ori 5, 5, 40665
+; PPC64LE-NEXT: mulhdu 6, 3, 5
+; PPC64LE-NEXT: mulld 4, 4, 5
+; PPC64LE-NEXT: mulld 5, 3, 5
+; PPC64LE-NEXT: sldi 3, 3, 1
+; PPC64LE-NEXT: add 3, 6, 3
+; PPC64LE-NEXT: add 3, 3, 4
+; PPC64LE-NEXT: lis 4, -8538
+; PPC64LE-NEXT: rotldi 6, 5, 63
+; PPC64LE-NEXT: ori 4, 4, 44780
+; PPC64LE-NEXT: rldimi 6, 3, 63, 0
+; PPC64LE-NEXT: rlwinm 3, 3, 31, 31, 31
+; PPC64LE-NEXT: rldicl 4, 4, 4, 28
+; PPC64LE-NEXT: rlwimi. 3, 5, 1, 30, 30
+; PPC64LE-NEXT: cmpld 1, 6, 4
+; PPC64LE-NEXT: li 3, 1
+; PPC64LE-NEXT: crnand 20, 2, 4
+; PPC64LE-NEXT: isel 3, 0, 3, 20
; PPC64LE-NEXT: blr
%urem = urem i66 %X, 1234567890
%cmp = icmp eq i66 %urem, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
index cafa74af1929..2798b67fa355 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll
@@ -2,22 +2,23 @@
; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
; Test that the prepareSREMEqFold optimization doesn't crash on scalable
-; vector types. RVV doesn't have ROTR or ROTL operations so the optimization
-; itself doesn't kick in.
+; vector types.
define <vscale x 4 x i1> @srem_eq_fold_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-LABEL: srem_eq_fold_nxv4i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 43
+; CHECK-NEXT: addi a0, zero, -85
; CHECK-NEXT: vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT: vmulh.vx v25, v8, a0
-; CHECK-NEXT: vadd.vi v25, v25, 0
-; CHECK-NEXT: vsrl.vi v26, v25, 7
-; CHECK-NEXT: vand.vi v26, v26, -1
-; CHECK-NEXT: vadd.vv v25, v25, v26
-; CHECK-NEXT: addi a0, zero, 6
-; CHECK-NEXT: vmul.vx v25, v25, a0
-; CHECK-NEXT: vsub.vv v25, v8, v25
-; CHECK-NEXT: vmseq.vi v0, v25, 0
+; CHECK-NEXT: vmul.vx v25, v8, a0
+; CHECK-NEXT: addi a0, zero, 42
+; CHECK-NEXT: vadd.vx v25, v25, a0
+; CHECK-NEXT: vmv.v.i v26, 1
+; CHECK-NEXT: vrsub.vi v27, v26, 0
+; CHECK-NEXT: vand.vi v27, v27, 7
+; CHECK-NEXT: vsll.vv v27, v25, v27
+; CHECK-NEXT: vand.vi v26, v26, 7
+; CHECK-NEXT: vsrl.vv v25, v25, v26
+; CHECK-NEXT: vor.vv v25, v25, v27
+; CHECK-NEXT: vmsleu.vx v0, v25, a0
; CHECK-NEXT: ret
%head_six = insertelement <vscale x 4 x i8> undef, i8 6, i32 0
%splat_six = shufflevector <vscale x 4 x i8> %head_six, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 7a7766681995..d7344114c21a 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -11,11 +11,18 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: srai a0, a0, 3
-; RV32-NEXT: addi a1, zero, 99
-; RV32-NEXT: call __modsi3 at plt
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: lui a1, 128424
+; RV32-NEXT: addi a1, a1, 331
+; RV32-NEXT: call __mulsi3 at plt
+; RV32-NEXT: lui a1, 662
+; RV32-NEXT: addi a1, a1, -83
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: lui a1, 131072
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: lui a1, 1324
+; RV32-NEXT: addi a1, a1, -165
+; RV32-NEXT: sltu a0, a0, a1
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -24,100 +31,83 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: slli a0, a0, 35
-; RV64-NEXT: srai a0, a0, 35
-; RV64-NEXT: addi a1, zero, 99
-; RV64-NEXT: call __moddi3 at plt
-; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: lui a1, 128424
+; RV64-NEXT: addiw a1, a1, 331
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: lui a1, 662
+; RV64-NEXT: addiw a1, a1, -83
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: addiw a1, a1, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: lui a1, 1324
+; RV64-NEXT: addiw a1, a1, -165
+; RV64-NEXT: sltu a0, a0, a1
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; RV32M-LABEL: test_srem_odd:
; RV32M: # %bb.0:
-; RV32M-NEXT: slli a0, a0, 3
-; RV32M-NEXT: srai a0, a0, 3
-; RV32M-NEXT: lui a1, 783784
+; RV32M-NEXT: lui a1, 128424
; RV32M-NEXT: addi a1, a1, 331
; RV32M-NEXT: mul a0, a0, a1
-; RV32M-NEXT: lui a1, 5296
-; RV32M-NEXT: addi a1, a1, -662
+; RV32M-NEXT: lui a1, 662
+; RV32M-NEXT: addi a1, a1, -83
; RV32M-NEXT: add a0, a0, a1
-; RV32M-NEXT: lui a1, 10592
-; RV32M-NEXT: addi a1, a1, -1323
+; RV32M-NEXT: lui a1, 131072
+; RV32M-NEXT: addi a1, a1, -1
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: lui a1, 1324
+; RV32M-NEXT: addi a1, a1, -165
; RV32M-NEXT: sltu a0, a0, a1
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_srem_odd:
; RV64M: # %bb.0:
-; RV64M-NEXT: slli a0, a0, 35
-; RV64M-NEXT: srai a0, a0, 35
-; RV64M-NEXT: lui a1, 1048536
-; RV64M-NEXT: addiw a1, a1, -331
-; RV64M-NEXT: slli a1, a1, 15
-; RV64M-NEXT: addi a1, a1, 331
-; RV64M-NEXT: slli a1, a1, 15
-; RV64M-NEXT: addi a1, a1, -331
-; RV64M-NEXT: slli a1, a1, 15
-; RV64M-NEXT: addi a1, a1, 331
+; RV64M-NEXT: lui a1, 128424
+; RV64M-NEXT: addiw a1, a1, 331
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: lui a1, 331
-; RV64M-NEXT: addiw a1, a1, -41
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -1531
-; RV64M-NEXT: slli a2, a1, 12
-; RV64M-NEXT: addi a2, a2, 703
-; RV64M-NEXT: slli a2, a2, 12
-; RV64M-NEXT: addi a2, a2, 1448
-; RV64M-NEXT: add a0, a0, a2
-; RV64M-NEXT: slli a1, a1, 13
-; RV64M-NEXT: addi a1, a1, 1407
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -1199
+; RV64M-NEXT: lui a1, 662
+; RV64M-NEXT: addiw a1, a1, -83
+; RV64M-NEXT: add a0, a0, a1
+; RV64M-NEXT: lui a1, 131072
+; RV64M-NEXT: addiw a1, a1, -1
+; RV64M-NEXT: and a0, a0, a1
+; RV64M-NEXT: lui a1, 1324
+; RV64M-NEXT: addiw a1, a1, -165
; RV64M-NEXT: sltu a0, a0, a1
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_srem_odd:
; RV32MV: # %bb.0:
-; RV32MV-NEXT: slli a0, a0, 3
-; RV32MV-NEXT: srai a0, a0, 3
-; RV32MV-NEXT: lui a1, 783784
+; RV32MV-NEXT: lui a1, 128424
; RV32MV-NEXT: addi a1, a1, 331
; RV32MV-NEXT: mul a0, a0, a1
-; RV32MV-NEXT: lui a1, 5296
-; RV32MV-NEXT: addi a1, a1, -662
+; RV32MV-NEXT: lui a1, 662
+; RV32MV-NEXT: addi a1, a1, -83
; RV32MV-NEXT: add a0, a0, a1
-; RV32MV-NEXT: lui a1, 10592
-; RV32MV-NEXT: addi a1, a1, -1323
+; RV32MV-NEXT: lui a1, 131072
+; RV32MV-NEXT: addi a1, a1, -1
+; RV32MV-NEXT: and a0, a0, a1
+; RV32MV-NEXT: lui a1, 1324
+; RV32MV-NEXT: addi a1, a1, -165
; RV32MV-NEXT: sltu a0, a0, a1
; RV32MV-NEXT: ret
;
; RV64MV-LABEL: test_srem_odd:
; RV64MV: # %bb.0:
-; RV64MV-NEXT: slli a0, a0, 35
-; RV64MV-NEXT: srai a0, a0, 35
-; RV64MV-NEXT: lui a1, 1048536
-; RV64MV-NEXT: addiw a1, a1, -331
-; RV64MV-NEXT: slli a1, a1, 15
-; RV64MV-NEXT: addi a1, a1, 331
-; RV64MV-NEXT: slli a1, a1, 15
-; RV64MV-NEXT: addi a1, a1, -331
-; RV64MV-NEXT: slli a1, a1, 15
-; RV64MV-NEXT: addi a1, a1, 331
+; RV64MV-NEXT: lui a1, 128424
+; RV64MV-NEXT: addiw a1, a1, 331
; RV64MV-NEXT: mul a0, a0, a1
-; RV64MV-NEXT: lui a1, 331
-; RV64MV-NEXT: addiw a1, a1, -41
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1531
-; RV64MV-NEXT: slli a2, a1, 12
-; RV64MV-NEXT: addi a2, a2, 703
-; RV64MV-NEXT: slli a2, a2, 12
-; RV64MV-NEXT: addi a2, a2, 1448
-; RV64MV-NEXT: add a0, a0, a2
-; RV64MV-NEXT: slli a1, a1, 13
-; RV64MV-NEXT: addi a1, a1, 1407
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1199
+; RV64MV-NEXT: lui a1, 662
+; RV64MV-NEXT: addiw a1, a1, -83
+; RV64MV-NEXT: add a0, a0, a1
+; RV64MV-NEXT: lui a1, 131072
+; RV64MV-NEXT: addiw a1, a1, -1
+; RV64MV-NEXT: and a0, a0, a1
+; RV64MV-NEXT: lui a1, 1324
+; RV64MV-NEXT: addiw a1, a1, -165
; RV64MV-NEXT: sltu a0, a0, a1
; RV64MV-NEXT: ret
%srem = srem i29 %X, 99
@@ -423,46 +413,65 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: ld a1, 0(s0)
; RV64-NEXT: slli a2, a0, 29
-; RV64-NEXT: srai s2, a2, 31
+; RV64-NEXT: srai s1, a2, 31
; RV64-NEXT: slli a0, a0, 31
; RV64-NEXT: srli a2, a1, 33
; RV64-NEXT: or a0, a2, a0
; RV64-NEXT: slli a0, a0, 31
-; RV64-NEXT: srai s1, a0, 31
-; RV64-NEXT: slli a0, a1, 31
; RV64-NEXT: srai a0, a0, 31
-; RV64-NEXT: addi a1, zero, 6
-; RV64-NEXT: call __moddi3 at plt
-; RV64-NEXT: mv s3, a0
+; RV64-NEXT: slli a1, a1, 31
+; RV64-NEXT: srai s2, a1, 31
; RV64-NEXT: addi a1, zero, 7
; RV64-NEXT: addi s5, zero, 7
+; RV64-NEXT: call __moddi3 at plt
+; RV64-NEXT: mv s3, a0
+; RV64-NEXT: addi a1, zero, -5
; RV64-NEXT: mv a0, s1
; RV64-NEXT: call __moddi3 at plt
; RV64-NEXT: mv s1, a0
-; RV64-NEXT: addi a1, zero, -5
+; RV64-NEXT: lui a0, 1026731
+; RV64-NEXT: addiw a0, a0, -1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a0, a0, -1365
+; RV64-NEXT: slli a0, a0, 12
+; RV64-NEXT: addi a1, a0, -1365
; RV64-NEXT: mv a0, s2
-; RV64-NEXT: call __moddi3 at plt
-; RV64-NEXT: addi a0, a0, -2
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: addi a1, s1, -1
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: lui a1, 10923
+; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -1365
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -1365
+; RV64-NEXT: slli a1, a1, 12
+; RV64-NEXT: addi a1, a1, -1366
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: slli a2, a0, 63
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: or a0, a0, a2
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: addi a1, s1, -2
; RV64-NEXT: snez a1, a1
-; RV64-NEXT: snez a2, s3
+; RV64-NEXT: addi a2, s3, -1
+; RV64-NEXT: snez a2, a2
+; RV64-NEXT: neg a0, a0
; RV64-NEXT: neg a2, a2
-; RV64-NEXT: neg a1, a1
-; RV64-NEXT: neg a3, a0
+; RV64-NEXT: neg a3, a1
; RV64-NEXT: slli a4, s5, 32
; RV64-NEXT: and a3, a3, a4
; RV64-NEXT: srli a3, a3, 32
; RV64-NEXT: sb a3, 12(s0)
-; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: slli a3, s4, 33
; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a1, a1, a3
-; RV64-NEXT: srli a4, a1, 31
-; RV64-NEXT: sub a0, a4, a0
-; RV64-NEXT: sw a0, 8(s0)
-; RV64-NEXT: and a0, a2, a3
-; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: and a2, a2, a3
+; RV64-NEXT: srli a4, a2, 31
+; RV64-NEXT: sub a1, a4, a1
+; RV64-NEXT: sw a1, 8(s0)
+; RV64-NEXT: and a0, a0, a3
+; RV64-NEXT: slli a1, a2, 33
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: sd a0, 0(s0)
; RV64-NEXT: ld s5, 8(sp) # 8-byte Folded Reload
@@ -609,7 +618,11 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
; RV64M-NEXT: slli a5, a2, 2
; RV64M-NEXT: add a2, a5, a2
; RV64M-NEXT: add a2, a4, a2
-; RV64M-NEXT: lui a4, 10923
+; RV64M-NEXT: addi a2, a2, -2
+; RV64M-NEXT: snez a2, a2
+; RV64M-NEXT: addi a1, a1, -1
+; RV64M-NEXT: snez a1, a1
+; RV64M-NEXT: lui a4, 1026731
; RV64M-NEXT: addiw a4, a4, -1365
; RV64M-NEXT: slli a4, a4, 12
; RV64M-NEXT: addi a4, a4, -1365
@@ -617,17 +630,20 @@ define void @test_srem_vec(<3 x i33>* %X) nounwind {
; RV64M-NEXT: addi a4, a4, -1365
; RV64M-NEXT: slli a4, a4, 12
; RV64M-NEXT: addi a4, a4, -1365
-; RV64M-NEXT: mulh a4, a3, a4
-; RV64M-NEXT: srli a5, a4, 63
-; RV64M-NEXT: add a4, a4, a5
-; RV64M-NEXT: addi a5, zero, 6
-; RV64M-NEXT: mul a4, a4, a5
-; RV64M-NEXT: sub a3, a3, a4
-; RV64M-NEXT: addi a2, a2, -2
-; RV64M-NEXT: snez a2, a2
-; RV64M-NEXT: addi a1, a1, -1
-; RV64M-NEXT: snez a1, a1
-; RV64M-NEXT: snez a3, a3
+; RV64M-NEXT: mul a3, a3, a4
+; RV64M-NEXT: lui a4, 10923
+; RV64M-NEXT: addiw a4, a4, -1365
+; RV64M-NEXT: slli a4, a4, 12
+; RV64M-NEXT: addi a4, a4, -1365
+; RV64M-NEXT: slli a4, a4, 12
+; RV64M-NEXT: addi a4, a4, -1365
+; RV64M-NEXT: slli a4, a4, 12
+; RV64M-NEXT: addi a4, a4, -1366
+; RV64M-NEXT: add a3, a3, a4
+; RV64M-NEXT: slli a5, a3, 63
+; RV64M-NEXT: srli a3, a3, 1
+; RV64M-NEXT: or a3, a3, a5
+; RV64M-NEXT: sltu a3, a4, a3
; RV64M-NEXT: neg a1, a1
; RV64M-NEXT: neg a4, a2
; RV64M-NEXT: neg a3, a3
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index 2f06aa4ea4dd..ed92311f5bd8 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -11,12 +11,13 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: lui a1, 1
+; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: call __mulsi3 at plt
; RV32-NEXT: lui a1, 2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a0, a0, a1
-; RV32-NEXT: addi a1, zero, 5
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: sltiu a0, a0, 1639
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -25,90 +26,59 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: call __muldi3 at plt
; RV64-NEXT: lui a1, 2
; RV64-NEXT: addiw a1, a1, -1
; RV64-NEXT: and a0, a0, a1
-; RV64-NEXT: addi a1, zero, 5
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: sltiu a0, a0, 1639
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; RV32M-LABEL: test_urem_odd:
; RV32M: # %bb.0:
+; RV32M-NEXT: lui a1, 1
+; RV32M-NEXT: addi a1, a1, -819
+; RV32M-NEXT: mul a0, a0, a1
; RV32M-NEXT: lui a1, 2
; RV32M-NEXT: addi a1, a1, -1
; RV32M-NEXT: and a0, a0, a1
-; RV32M-NEXT: lui a1, 838861
-; RV32M-NEXT: addi a1, a1, -819
-; RV32M-NEXT: mul a0, a0, a1
-; RV32M-NEXT: lui a1, 209715
-; RV32M-NEXT: addi a1, a1, 820
-; RV32M-NEXT: sltu a0, a0, a1
+; RV32M-NEXT: sltiu a0, a0, 1639
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_urem_odd:
; RV64M: # %bb.0:
+; RV64M-NEXT: lui a1, 1
+; RV64M-NEXT: addiw a1, a1, -819
+; RV64M-NEXT: mul a0, a0, a1
; RV64M-NEXT: lui a1, 2
; RV64M-NEXT: addiw a1, a1, -1
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: lui a1, 1035469
-; RV64M-NEXT: addiw a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: lui a1, 13107
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 820
-; RV64M-NEXT: sltu a0, a0, a1
+; RV64M-NEXT: sltiu a0, a0, 1639
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_urem_odd:
; RV32MV: # %bb.0:
+; RV32MV-NEXT: lui a1, 1
+; RV32MV-NEXT: addi a1, a1, -819
+; RV32MV-NEXT: mul a0, a0, a1
; RV32MV-NEXT: lui a1, 2
; RV32MV-NEXT: addi a1, a1, -1
; RV32MV-NEXT: and a0, a0, a1
-; RV32MV-NEXT: lui a1, 838861
-; RV32MV-NEXT: addi a1, a1, -819
-; RV32MV-NEXT: mul a0, a0, a1
-; RV32MV-NEXT: lui a1, 209715
-; RV32MV-NEXT: addi a1, a1, 820
-; RV32MV-NEXT: sltu a0, a0, a1
+; RV32MV-NEXT: sltiu a0, a0, 1639
; RV32MV-NEXT: ret
;
; RV64MV-LABEL: test_urem_odd:
; RV64MV: # %bb.0:
+; RV64MV-NEXT: lui a1, 1
+; RV64MV-NEXT: addiw a1, a1, -819
+; RV64MV-NEXT: mul a0, a0, a1
; RV64MV-NEXT: lui a1, 2
; RV64MV-NEXT: addiw a1, a1, -1
; RV64MV-NEXT: and a0, a0, a1
-; RV64MV-NEXT: lui a1, 1035469
-; RV64MV-NEXT: addiw a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: mul a0, a0, a1
-; RV64MV-NEXT: lui a1, 13107
-; RV64MV-NEXT: addiw a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 820
-; RV64MV-NEXT: sltu a0, a0, a1
+; RV64MV-NEXT: sltiu a0, a0, 1639
; RV64MV-NEXT: ret
%urem = urem i13 %X, 5
%cmp = icmp eq i13 %urem, 0
@@ -120,12 +90,20 @@ define i1 @test_urem_even(i27 %X) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: lui a1, 32768
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: lui a1, 28087
+; RV32-NEXT: addi a1, a1, -585
+; RV32-NEXT: call __mulsi3 at plt
+; RV32-NEXT: slli a1, a0, 26
+; RV32-NEXT: lui a2, 32768
+; RV32-NEXT: addi a3, a2, -2
+; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: addi a1, a2, -1
; RV32-NEXT: and a0, a0, a1
-; RV32-NEXT: addi a1, zero, 14
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: lui a1, 2341
+; RV32-NEXT: addi a1, a1, -1755
+; RV32-NEXT: sltu a0, a0, a1
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -134,90 +112,94 @@ define i1 @test_urem_even(i27 %X) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: lui a1, 32768
-; RV64-NEXT: addiw a1, a1, -1
+; RV64-NEXT: lui a1, 28087
+; RV64-NEXT: addiw a1, a1, -585
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: slli a1, a0, 26
+; RV64-NEXT: lui a2, 32768
+; RV64-NEXT: addiw a3, a2, -2
+; RV64-NEXT: and a0, a0, a3
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: addiw a1, a2, -1
; RV64-NEXT: and a0, a0, a1
-; RV64-NEXT: addi a1, zero, 14
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: lui a1, 2341
+; RV64-NEXT: addiw a1, a1, -1755
+; RV64-NEXT: sltu a0, a0, a1
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; RV32M-LABEL: test_urem_even:
; RV32M: # %bb.0:
-; RV32M-NEXT: lui a1, 32768
-; RV32M-NEXT: addi a1, a1, -1
+; RV32M-NEXT: lui a1, 28087
+; RV32M-NEXT: addi a1, a1, -585
+; RV32M-NEXT: mul a0, a0, a1
+; RV32M-NEXT: slli a1, a0, 26
+; RV32M-NEXT: lui a2, 32768
+; RV32M-NEXT: addi a3, a2, -2
+; RV32M-NEXT: and a0, a0, a3
+; RV32M-NEXT: srli a0, a0, 1
+; RV32M-NEXT: or a0, a0, a1
+; RV32M-NEXT: addi a1, a2, -1
; RV32M-NEXT: and a0, a0, a1
-; RV32M-NEXT: srli a1, a0, 1
-; RV32M-NEXT: lui a2, 599186
-; RV32M-NEXT: addi a2, a2, 1171
-; RV32M-NEXT: mulhu a1, a1, a2
-; RV32M-NEXT: srli a1, a1, 2
-; RV32M-NEXT: addi a2, zero, 14
-; RV32M-NEXT: mul a1, a1, a2
-; RV32M-NEXT: sub a0, a0, a1
-; RV32M-NEXT: seqz a0, a0
+; RV32M-NEXT: lui a1, 2341
+; RV32M-NEXT: addi a1, a1, -1755
+; RV32M-NEXT: sltu a0, a0, a1
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_urem_even:
; RV64M: # %bb.0:
-; RV64M-NEXT: lui a1, 32768
-; RV64M-NEXT: addiw a1, a1, -1
+; RV64M-NEXT: lui a1, 28087
+; RV64M-NEXT: addiw a1, a1, -585
+; RV64M-NEXT: mul a0, a0, a1
+; RV64M-NEXT: slli a1, a0, 26
+; RV64M-NEXT: lui a2, 32768
+; RV64M-NEXT: addiw a3, a2, -2
+; RV64M-NEXT: and a0, a0, a3
+; RV64M-NEXT: srli a0, a0, 1
+; RV64M-NEXT: or a0, a0, a1
+; RV64M-NEXT: addiw a1, a2, -1
; RV64M-NEXT: and a0, a0, a1
-; RV64M-NEXT: srli a1, a0, 1
-; RV64M-NEXT: lui a2, 18725
-; RV64M-NEXT: addiw a2, a2, -1755
-; RV64M-NEXT: slli a2, a2, 12
-; RV64M-NEXT: addi a2, a2, -1755
-; RV64M-NEXT: slli a2, a2, 12
-; RV64M-NEXT: addi a2, a2, -1755
-; RV64M-NEXT: slli a2, a2, 12
-; RV64M-NEXT: addi a2, a2, -1755
-; RV64M-NEXT: mulhu a1, a1, a2
-; RV64M-NEXT: srli a1, a1, 1
-; RV64M-NEXT: addi a2, zero, 14
-; RV64M-NEXT: mul a1, a1, a2
-; RV64M-NEXT: sub a0, a0, a1
-; RV64M-NEXT: seqz a0, a0
+; RV64M-NEXT: lui a1, 2341
+; RV64M-NEXT: addiw a1, a1, -1755
+; RV64M-NEXT: sltu a0, a0, a1
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_urem_even:
; RV32MV: # %bb.0:
-; RV32MV-NEXT: lui a1, 32768
-; RV32MV-NEXT: addi a1, a1, -1
+; RV32MV-NEXT: lui a1, 28087
+; RV32MV-NEXT: addi a1, a1, -585
+; RV32MV-NEXT: mul a0, a0, a1
+; RV32MV-NEXT: slli a1, a0, 26
+; RV32MV-NEXT: lui a2, 32768
+; RV32MV-NEXT: addi a3, a2, -2
+; RV32MV-NEXT: and a0, a0, a3
+; RV32MV-NEXT: srli a0, a0, 1
+; RV32MV-NEXT: or a0, a0, a1
+; RV32MV-NEXT: addi a1, a2, -1
; RV32MV-NEXT: and a0, a0, a1
-; RV32MV-NEXT: srli a1, a0, 1
-; RV32MV-NEXT: lui a2, 599186
-; RV32MV-NEXT: addi a2, a2, 1171
-; RV32MV-NEXT: mulhu a1, a1, a2
-; RV32MV-NEXT: srli a1, a1, 2
-; RV32MV-NEXT: addi a2, zero, 14
-; RV32MV-NEXT: mul a1, a1, a2
-; RV32MV-NEXT: sub a0, a0, a1
-; RV32MV-NEXT: seqz a0, a0
+; RV32MV-NEXT: lui a1, 2341
+; RV32MV-NEXT: addi a1, a1, -1755
+; RV32MV-NEXT: sltu a0, a0, a1
; RV32MV-NEXT: ret
;
; RV64MV-LABEL: test_urem_even:
; RV64MV: # %bb.0:
-; RV64MV-NEXT: lui a1, 32768
-; RV64MV-NEXT: addiw a1, a1, -1
+; RV64MV-NEXT: lui a1, 28087
+; RV64MV-NEXT: addiw a1, a1, -585
+; RV64MV-NEXT: mul a0, a0, a1
+; RV64MV-NEXT: slli a1, a0, 26
+; RV64MV-NEXT: lui a2, 32768
+; RV64MV-NEXT: addiw a3, a2, -2
+; RV64MV-NEXT: and a0, a0, a3
+; RV64MV-NEXT: srli a0, a0, 1
+; RV64MV-NEXT: or a0, a0, a1
+; RV64MV-NEXT: addiw a1, a2, -1
; RV64MV-NEXT: and a0, a0, a1
-; RV64MV-NEXT: srli a1, a0, 1
-; RV64MV-NEXT: lui a2, 18725
-; RV64MV-NEXT: addiw a2, a2, -1755
-; RV64MV-NEXT: slli a2, a2, 12
-; RV64MV-NEXT: addi a2, a2, -1755
-; RV64MV-NEXT: slli a2, a2, 12
-; RV64MV-NEXT: addi a2, a2, -1755
-; RV64MV-NEXT: slli a2, a2, 12
-; RV64MV-NEXT: addi a2, a2, -1755
-; RV64MV-NEXT: mulhu a1, a1, a2
-; RV64MV-NEXT: srli a1, a1, 1
-; RV64MV-NEXT: addi a2, zero, 14
-; RV64MV-NEXT: mul a1, a1, a2
-; RV64MV-NEXT: sub a0, a0, a1
-; RV64MV-NEXT: seqz a0, a0
+; RV64MV-NEXT: lui a1, 2341
+; RV64MV-NEXT: addiw a1, a1, -1755
+; RV64MV-NEXT: sltu a0, a0, a1
; RV64MV-NEXT: ret
%urem = urem i27 %X, 14
%cmp = icmp eq i27 %urem, 0
@@ -227,93 +209,61 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; RV32-LABEL: test_urem_odd_setne:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: neg a0, a0
; RV32-NEXT: andi a0, a0, 15
-; RV32-NEXT: addi a1, zero, 5
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: snez a0, a0
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi a1, zero, 3
+; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_odd_setne:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: slli a1, a0, 1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: neg a0, a0
; RV64-NEXT: andi a0, a0, 15
-; RV64-NEXT: addi a1, zero, 5
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: addi a1, zero, 3
+; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: ret
;
; RV32M-LABEL: test_urem_odd_setne:
; RV32M: # %bb.0:
+; RV32M-NEXT: slli a1, a0, 1
+; RV32M-NEXT: add a0, a1, a0
+; RV32M-NEXT: neg a0, a0
; RV32M-NEXT: andi a0, a0, 15
-; RV32M-NEXT: lui a1, 838861
-; RV32M-NEXT: addi a1, a1, -819
-; RV32M-NEXT: mul a0, a0, a1
-; RV32M-NEXT: lui a1, 209715
-; RV32M-NEXT: addi a1, a1, 819
+; RV32M-NEXT: addi a1, zero, 3
; RV32M-NEXT: sltu a0, a1, a0
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_urem_odd_setne:
; RV64M: # %bb.0:
+; RV64M-NEXT: slli a1, a0, 1
+; RV64M-NEXT: add a0, a1, a0
+; RV64M-NEXT: neg a0, a0
; RV64M-NEXT: andi a0, a0, 15
-; RV64M-NEXT: lui a1, 1035469
-; RV64M-NEXT: addiw a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -819
-; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: lui a1, 13107
-; RV64M-NEXT: addiw a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 819
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, 819
+; RV64M-NEXT: addi a1, zero, 3
; RV64M-NEXT: sltu a0, a1, a0
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_urem_odd_setne:
; RV32MV: # %bb.0:
+; RV32MV-NEXT: slli a1, a0, 1
+; RV32MV-NEXT: add a0, a1, a0
+; RV32MV-NEXT: neg a0, a0
; RV32MV-NEXT: andi a0, a0, 15
-; RV32MV-NEXT: lui a1, 838861
-; RV32MV-NEXT: addi a1, a1, -819
-; RV32MV-NEXT: mul a0, a0, a1
-; RV32MV-NEXT: lui a1, 209715
-; RV32MV-NEXT: addi a1, a1, 819
+; RV32MV-NEXT: addi a1, zero, 3
; RV32MV-NEXT: sltu a0, a1, a0
; RV32MV-NEXT: ret
;
; RV64MV-LABEL: test_urem_odd_setne:
; RV64MV: # %bb.0:
+; RV64MV-NEXT: slli a1, a0, 1
+; RV64MV-NEXT: add a0, a1, a0
+; RV64MV-NEXT: neg a0, a0
; RV64MV-NEXT: andi a0, a0, 15
-; RV64MV-NEXT: lui a1, 1035469
-; RV64MV-NEXT: addiw a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -819
-; RV64MV-NEXT: mul a0, a0, a1
-; RV64MV-NEXT: lui a1, 13107
-; RV64MV-NEXT: addiw a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 819
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 819
+; RV64MV-NEXT: addi a1, zero, 3
; RV64MV-NEXT: sltu a0, a1, a0
; RV64MV-NEXT: ret
%urem = urem i4 %X, 5
@@ -326,10 +276,11 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, zero, 307
+; RV32-NEXT: call __mulsi3 at plt
; RV32-NEXT: andi a0, a0, 511
-; RV32-NEXT: addi a1, zero, 507
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: snez a0, a0
+; RV32-NEXT: addi a1, zero, 1
+; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -338,75 +289,48 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi a1, zero, 307
+; RV64-NEXT: call __muldi3 at plt
; RV64-NEXT: andi a0, a0, 511
-; RV64-NEXT: addi a1, zero, 507
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: snez a0, a0
+; RV64-NEXT: addi a1, zero, 1
+; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; RV32M-LABEL: test_urem_negative_odd:
; RV32M: # %bb.0:
-; RV32M-NEXT: andi a0, a0, 511
-; RV32M-NEXT: lui a1, 692846
-; RV32M-NEXT: addi a1, a1, 307
+; RV32M-NEXT: addi a1, zero, 307
; RV32M-NEXT: mul a0, a0, a1
-; RV32M-NEXT: lui a1, 2068
-; RV32M-NEXT: addi a1, a1, 807
+; RV32M-NEXT: andi a0, a0, 511
+; RV32M-NEXT: addi a1, zero, 1
; RV32M-NEXT: sltu a0, a1, a0
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_urem_negative_odd:
; RV64M: # %bb.0:
-; RV64M-NEXT: andi a0, a0, 511
-; RV64M-NEXT: lui a1, 1042824
-; RV64M-NEXT: addiw a1, a1, -711
-; RV64M-NEXT: slli a1, a1, 13
-; RV64M-NEXT: addi a1, a1, 469
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -1737
-; RV64M-NEXT: slli a1, a1, 13
-; RV64M-NEXT: addi a1, a1, 307
+; RV64M-NEXT: addi a1, zero, 307
; RV64M-NEXT: mul a0, a0, a1
-; RV64M-NEXT: lui a1, 132365
-; RV64M-NEXT: addiw a1, a1, -1543
-; RV64M-NEXT: slli a1, a1, 14
-; RV64M-NEXT: addi a1, a1, -1131
-; RV64M-NEXT: slli a1, a1, 12
-; RV64M-NEXT: addi a1, a1, -186
+; RV64M-NEXT: andi a0, a0, 511
+; RV64M-NEXT: addi a1, zero, 1
; RV64M-NEXT: sltu a0, a1, a0
; RV64M-NEXT: ret
;
; RV32MV-LABEL: test_urem_negative_odd:
; RV32MV: # %bb.0:
-; RV32MV-NEXT: andi a0, a0, 511
-; RV32MV-NEXT: lui a1, 692846
-; RV32MV-NEXT: addi a1, a1, 307
+; RV32MV-NEXT: addi a1, zero, 307
; RV32MV-NEXT: mul a0, a0, a1
-; RV32MV-NEXT: lui a1, 2068
-; RV32MV-NEXT: addi a1, a1, 807
+; RV32MV-NEXT: andi a0, a0, 511
+; RV32MV-NEXT: addi a1, zero, 1
; RV32MV-NEXT: sltu a0, a1, a0
; RV32MV-NEXT: ret
;
; RV64MV-LABEL: test_urem_negative_odd:
; RV64MV: # %bb.0:
-; RV64MV-NEXT: andi a0, a0, 511
-; RV64MV-NEXT: lui a1, 1042824
-; RV64MV-NEXT: addiw a1, a1, -711
-; RV64MV-NEXT: slli a1, a1, 13
-; RV64MV-NEXT: addi a1, a1, 469
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1737
-; RV64MV-NEXT: slli a1, a1, 13
-; RV64MV-NEXT: addi a1, a1, 307
+; RV64MV-NEXT: addi a1, zero, 307
; RV64MV-NEXT: mul a0, a0, a1
-; RV64MV-NEXT: lui a1, 132365
-; RV64MV-NEXT: addiw a1, a1, -1543
-; RV64MV-NEXT: slli a1, a1, 14
-; RV64MV-NEXT: addi a1, a1, -1131
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -186
+; RV64MV-NEXT: andi a0, a0, 511
+; RV64MV-NEXT: addi a1, zero, 1
; RV64MV-NEXT: sltu a0, a1, a0
; RV64MV-NEXT: ret
%urem = urem i9 %X, -5
@@ -428,38 +352,44 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV32-NEXT: lw a1, 0(s0)
; RV32-NEXT: slli a0, a0, 10
; RV32-NEXT: srli a2, a1, 22
-; RV32-NEXT: or a0, a2, a0
-; RV32-NEXT: andi s2, a0, 2047
-; RV32-NEXT: andi s1, a1, 2047
-; RV32-NEXT: srli a0, a1, 11
+; RV32-NEXT: or s1, a2, a0
+; RV32-NEXT: srli s2, a1, 11
+; RV32-NEXT: andi a0, a1, 2047
+; RV32-NEXT: addi a1, zero, 683
+; RV32-NEXT: call __mulsi3 at plt
+; RV32-NEXT: slli a1, a0, 10
+; RV32-NEXT: andi a0, a0, 2046
+; RV32-NEXT: srli a0, a0, 1
+; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: andi a0, a0, 2047
-; RV32-NEXT: addi a1, zero, 7
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: mv s3, a0
-; RV32-NEXT: addi a1, zero, 6
+; RV32-NEXT: addi a1, zero, 341
+; RV32-NEXT: sltu s3, a1, a0
+; RV32-NEXT: addi a1, zero, 819
; RV32-NEXT: mv a0, s1
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: mv s1, a0
-; RV32-NEXT: addi a1, zero, 2043
+; RV32-NEXT: call __mulsi3 at plt
+; RV32-NEXT: addi a0, a0, -1638
+; RV32-NEXT: andi a0, a0, 2047
+; RV32-NEXT: addi a1, zero, 1
+; RV32-NEXT: sltu s1, a1, a0
+; RV32-NEXT: addi a1, zero, 1463
; RV32-NEXT: mv a0, s2
-; RV32-NEXT: call __umodsi3 at plt
-; RV32-NEXT: addi a0, a0, -2
-; RV32-NEXT: snez a0, a0
-; RV32-NEXT: snez a1, s1
-; RV32-NEXT: addi a2, s3, -1
-; RV32-NEXT: snez a2, a2
-; RV32-NEXT: neg a2, a2
-; RV32-NEXT: neg a1, a1
-; RV32-NEXT: neg a3, a0
-; RV32-NEXT: srli a3, a3, 10
-; RV32-NEXT: andi a3, a3, 1
-; RV32-NEXT: sb a3, 4(s0)
+; RV32-NEXT: call __mulsi3 at plt
+; RV32-NEXT: addi a0, a0, -1463
+; RV32-NEXT: andi a0, a0, 2047
+; RV32-NEXT: addi a1, zero, 292
+; RV32-NEXT: sltu a0, a1, a0
+; RV32-NEXT: neg a1, s3
+; RV32-NEXT: neg a0, a0
+; RV32-NEXT: neg a2, s1
+; RV32-NEXT: srli a2, a2, 10
+; RV32-NEXT: andi a2, a2, 1
+; RV32-NEXT: sb a2, 4(s0)
; RV32-NEXT: andi a1, a1, 2047
-; RV32-NEXT: andi a2, a2, 2047
-; RV32-NEXT: slli a2, a2, 11
-; RV32-NEXT: or a1, a1, a2
-; RV32-NEXT: slli a0, a0, 22
-; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: andi a0, a0, 2047
+; RV32-NEXT: slli a0, a0, 11
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: slli a1, s1, 22
+; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: sw a0, 0(s0)
; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
@@ -477,45 +407,53 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: mv s0, a0
; RV64-NEXT: lbu a0, 4(a0)
; RV64-NEXT: lwu a1, 0(s0)
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: or a0, a1, a0
-; RV64-NEXT: srli s2, a0, 22
-; RV64-NEXT: andi s1, a0, 2047
-; RV64-NEXT: srli a0, a0, 11
+; RV64-NEXT: srli s2, a0, 11
+; RV64-NEXT: srli s1, a0, 22
; RV64-NEXT: andi a0, a0, 2047
-; RV64-NEXT: addi a1, zero, 7
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: mv s3, a0
-; RV64-NEXT: addi a1, zero, 6
+; RV64-NEXT: addi a1, zero, 683
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: slli a1, a0, 10
+; RV64-NEXT: andi a0, a0, 2046
+; RV64-NEXT: srli a0, a0, 1
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: andi a0, a0, 2047
+; RV64-NEXT: addi a1, zero, 341
+; RV64-NEXT: sltu s3, a1, a0
+; RV64-NEXT: addi a1, zero, 819
; RV64-NEXT: mv a0, s1
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: mv s1, a0
-; RV64-NEXT: addi a1, zero, 2043
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: addi a0, a0, -1638
+; RV64-NEXT: andi a0, a0, 2047
+; RV64-NEXT: addi s4, zero, 1
+; RV64-NEXT: sltu s1, s4, a0
+; RV64-NEXT: addi a1, zero, 1463
; RV64-NEXT: mv a0, s2
-; RV64-NEXT: call __umoddi3 at plt
-; RV64-NEXT: addi a0, a0, -2
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: snez a1, s1
-; RV64-NEXT: addi a2, s3, -1
-; RV64-NEXT: snez a2, a2
-; RV64-NEXT: neg a2, a2
-; RV64-NEXT: neg a1, a1
+; RV64-NEXT: call __muldi3 at plt
+; RV64-NEXT: addi a0, a0, -1463
+; RV64-NEXT: andi a0, a0, 2047
+; RV64-NEXT: addi a1, zero, 292
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: neg a1, s3
+; RV64-NEXT: neg a0, a0
; RV64-NEXT: andi a1, a1, 2047
-; RV64-NEXT: andi a2, a2, 2047
-; RV64-NEXT: slli a2, a2, 11
-; RV64-NEXT: or a1, a1, a2
-; RV64-NEXT: slli a0, a0, 22
-; RV64-NEXT: sub a0, a1, a0
+; RV64-NEXT: andi a0, a0, 2047
+; RV64-NEXT: slli a0, a0, 11
+; RV64-NEXT: or a0, a1, a0
+; RV64-NEXT: slli a1, s1, 22
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: sw a0, 0(s0)
-; RV64-NEXT: addi a1, zero, 1
-; RV64-NEXT: slli a1, a1, 33
+; RV64-NEXT: slli a1, s4, 33
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: sb a0, 4(s0)
+; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -531,45 +469,38 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV32M-NEXT: slli a1, a1, 10
; RV32M-NEXT: srli a3, a2, 22
; RV32M-NEXT: or a1, a3, a1
-; RV32M-NEXT: andi a1, a1, 2047
; RV32M-NEXT: srli a3, a2, 11
-; RV32M-NEXT: andi a3, a3, 2047
; RV32M-NEXT: andi a2, a2, 2047
-; RV32M-NEXT: lui a4, 699051
-; RV32M-NEXT: addi a4, a4, -1365
-; RV32M-NEXT: mulhu a4, a2, a4
-; RV32M-NEXT: srli a4, a4, 2
-; RV32M-NEXT: addi a5, zero, 6
-; RV32M-NEXT: mul a4, a4, a5
-; RV32M-NEXT: sub a2, a2, a4
-; RV32M-NEXT: lui a4, 536863
-; RV32M-NEXT: addi a4, a4, -1229
+; RV32M-NEXT: addi a4, zero, 683
+; RV32M-NEXT: mul a2, a2, a4
+; RV32M-NEXT: slli a4, a2, 10
+; RV32M-NEXT: andi a2, a2, 2046
+; RV32M-NEXT: srli a2, a2, 1
+; RV32M-NEXT: or a2, a2, a4
+; RV32M-NEXT: andi a2, a2, 2047
+; RV32M-NEXT: addi a4, zero, 341
+; RV32M-NEXT: sltu a2, a4, a2
+; RV32M-NEXT: addi a4, zero, 819
; RV32M-NEXT: mul a1, a1, a4
-; RV32M-NEXT: lui a4, 1023427
-; RV32M-NEXT: addi a4, a4, -1638
-; RV32M-NEXT: add a1, a1, a4
-; RV32M-NEXT: lui a4, 513
-; RV32M-NEXT: addi a4, a4, 1036
+; RV32M-NEXT: addi a1, a1, -1638
+; RV32M-NEXT: andi a1, a1, 2047
+; RV32M-NEXT: addi a4, zero, 1
; RV32M-NEXT: sltu a1, a4, a1
-; RV32M-NEXT: lui a4, 748983
-; RV32M-NEXT: addi a4, a4, -585
+; RV32M-NEXT: addi a4, zero, 1463
; RV32M-NEXT: mul a3, a3, a4
-; RV32M-NEXT: lui a4, 299593
-; RV32M-NEXT: addi a4, a4, 585
-; RV32M-NEXT: add a3, a3, a4
-; RV32M-NEXT: lui a4, 149797
-; RV32M-NEXT: addi a4, a4, -1756
+; RV32M-NEXT: addi a3, a3, -1463
+; RV32M-NEXT: andi a3, a3, 2047
+; RV32M-NEXT: addi a4, zero, 292
; RV32M-NEXT: sltu a3, a4, a3
-; RV32M-NEXT: snez a2, a2
; RV32M-NEXT: neg a2, a2
; RV32M-NEXT: neg a3, a3
; RV32M-NEXT: neg a4, a1
; RV32M-NEXT: srli a4, a4, 10
; RV32M-NEXT: andi a4, a4, 1
; RV32M-NEXT: sb a4, 4(a0)
+; RV32M-NEXT: andi a2, a2, 2047
; RV32M-NEXT: andi a3, a3, 2047
; RV32M-NEXT: slli a3, a3, 11
-; RV32M-NEXT: andi a2, a2, 2047
; RV32M-NEXT: or a2, a2, a3
; RV32M-NEXT: slli a1, a1, 22
; RV32M-NEXT: sub a1, a2, a1
@@ -583,75 +514,29 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV64M-NEXT: slli a1, a1, 32
; RV64M-NEXT: or a1, a2, a1
; RV64M-NEXT: srli a2, a1, 11
-; RV64M-NEXT: andi a2, a2, 2047
; RV64M-NEXT: srli a3, a1, 22
; RV64M-NEXT: andi a1, a1, 2047
-; RV64M-NEXT: lui a4, 1026731
-; RV64M-NEXT: addiw a4, a4, -1365
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -1365
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -1365
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -1365
-; RV64M-NEXT: mulhu a4, a1, a4
-; RV64M-NEXT: srli a4, a4, 2
-; RV64M-NEXT: addi a5, zero, 6
-; RV64M-NEXT: mul a4, a4, a5
-; RV64M-NEXT: sub a1, a1, a4
-; RV64M-NEXT: snez a1, a1
-; RV64M-NEXT: lui a4, 14948
-; RV64M-NEXT: addiw a4, a4, 2029
-; RV64M-NEXT: slli a4, a4, 13
-; RV64M-NEXT: addi a4, a4, -381
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 287
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -1229
+; RV64M-NEXT: addi a4, zero, 683
+; RV64M-NEXT: mul a1, a1, a4
+; RV64M-NEXT: slli a4, a1, 10
+; RV64M-NEXT: andi a1, a1, 2046
+; RV64M-NEXT: srli a1, a1, 1
+; RV64M-NEXT: or a1, a1, a4
+; RV64M-NEXT: andi a1, a1, 2047
+; RV64M-NEXT: addi a4, zero, 341
+; RV64M-NEXT: sltu a1, a4, a1
+; RV64M-NEXT: addi a4, zero, 819
; RV64M-NEXT: mul a3, a3, a4
-; RV64M-NEXT: lui a4, 1436
-; RV64M-NEXT: addiw a4, a4, -2029
-; RV64M-NEXT: slli a4, a4, 13
-; RV64M-NEXT: addi a4, a4, 381
-; RV64M-NEXT: slli a4, a4, 13
-; RV64M-NEXT: addi a4, a4, -573
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -1638
-; RV64M-NEXT: add a3, a3, a4
-; RV64M-NEXT: lui a4, 16424
-; RV64M-NEXT: addiw a4, a4, 401
-; RV64M-NEXT: slli a4, a4, 14
-; RV64M-NEXT: addi a4, a4, -345
-; RV64M-NEXT: slli a4, a4, 13
-; RV64M-NEXT: addi a4, a4, 1295
+; RV64M-NEXT: addi a3, a3, -1638
+; RV64M-NEXT: andi a3, a3, 2047
+; RV64M-NEXT: addi a4, zero, 1
; RV64M-NEXT: sltu a3, a4, a3
-; RV64M-NEXT: lui a4, 28087
-; RV64M-NEXT: addiw a4, a4, -585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, -585
-; RV64M-NEXT: mul a2, a2, a4
-; RV64M-NEXT: lui a4, 1020489
-; RV64M-NEXT: addiw a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 585
-; RV64M-NEXT: add a2, a2, a4
-; RV64M-NEXT: lui a4, 4681
-; RV64M-NEXT: addiw a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 12
-; RV64M-NEXT: addi a4, a4, 585
-; RV64M-NEXT: slli a4, a4, 13
-; RV64M-NEXT: addi a4, a4, 1170
-; RV64M-NEXT: sltu a2, a4, a2
+; RV64M-NEXT: addi a5, zero, 1463
+; RV64M-NEXT: mul a2, a2, a5
+; RV64M-NEXT: addi a2, a2, -1463
+; RV64M-NEXT: andi a2, a2, 2047
+; RV64M-NEXT: addi a5, zero, 292
+; RV64M-NEXT: sltu a2, a5, a2
; RV64M-NEXT: neg a1, a1
; RV64M-NEXT: neg a2, a2
; RV64M-NEXT: andi a1, a1, 2047
@@ -661,8 +546,7 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV64M-NEXT: slli a2, a3, 22
; RV64M-NEXT: sub a1, a1, a2
; RV64M-NEXT: sw a1, 0(a0)
-; RV64M-NEXT: addi a2, zero, 1
-; RV64M-NEXT: slli a2, a2, 33
+; RV64M-NEXT: slli a2, a4, 33
; RV64M-NEXT: addi a2, a2, -1
; RV64M-NEXT: and a1, a1, a2
; RV64M-NEXT: srli a1, a1, 32
@@ -672,56 +556,46 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV32MV-LABEL: test_urem_vec:
; RV32MV: # %bb.0:
; RV32MV-NEXT: addi sp, sp, -16
-; RV32MV-NEXT: lb a1, 4(a0)
-; RV32MV-NEXT: lw a2, 0(a0)
-; RV32MV-NEXT: slli a1, a1, 10
-; RV32MV-NEXT: srli a3, a2, 22
-; RV32MV-NEXT: or a1, a3, a1
-; RV32MV-NEXT: andi a1, a1, 2047
-; RV32MV-NEXT: srli a3, a2, 11
-; RV32MV-NEXT: andi a3, a3, 2047
-; RV32MV-NEXT: andi a2, a2, 2047
-; RV32MV-NEXT: lui a4, 699051
-; RV32MV-NEXT: addi a4, a4, -1365
-; RV32MV-NEXT: mulhu a4, a2, a4
-; RV32MV-NEXT: srli a4, a4, 2
-; RV32MV-NEXT: addi a5, zero, 6
-; RV32MV-NEXT: mul a4, a4, a5
-; RV32MV-NEXT: sub a2, a2, a4
+; RV32MV-NEXT: lw a1, 0(a0)
+; RV32MV-NEXT: andi a2, a1, 2047
; RV32MV-NEXT: sh a2, 8(sp)
-; RV32MV-NEXT: lui a2, 2566
-; RV32MV-NEXT: addi a2, a2, 1087
-; RV32MV-NEXT: mulhu a2, a1, a2
-; RV32MV-NEXT: sub a4, a1, a2
-; RV32MV-NEXT: srli a4, a4, 1
-; RV32MV-NEXT: add a2, a4, a2
-; RV32MV-NEXT: srli a2, a2, 10
-; RV32MV-NEXT: addi a4, zero, 2043
-; RV32MV-NEXT: mul a2, a2, a4
-; RV32MV-NEXT: sub a1, a1, a2
+; RV32MV-NEXT: srli a2, a1, 11
+; RV32MV-NEXT: andi a2, a2, 2047
+; RV32MV-NEXT: sh a2, 10(sp)
+; RV32MV-NEXT: lb a2, 4(a0)
+; RV32MV-NEXT: slli a2, a2, 10
+; RV32MV-NEXT: srli a1, a1, 22
+; RV32MV-NEXT: or a1, a1, a2
+; RV32MV-NEXT: andi a1, a1, 2047
; RV32MV-NEXT: sh a1, 12(sp)
-; RV32MV-NEXT: lui a1, 149797
-; RV32MV-NEXT: addi a1, a1, -1755
-; RV32MV-NEXT: mulhu a1, a3, a1
-; RV32MV-NEXT: sub a2, a3, a1
-; RV32MV-NEXT: srli a2, a2, 1
-; RV32MV-NEXT: add a1, a2, a1
-; RV32MV-NEXT: srli a1, a1, 2
-; RV32MV-NEXT: slli a2, a1, 3
-; RV32MV-NEXT: sub a1, a1, a2
-; RV32MV-NEXT: add a1, a3, a1
-; RV32MV-NEXT: sh a1, 10(sp)
; RV32MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV32MV-NEXT: addi a1, sp, 8
; RV32MV-NEXT: vle16.v v25, (a1)
; RV32MV-NEXT: lui a1, %hi(.LCPI4_0)
; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0)
; RV32MV-NEXT: vle16.v v26, (a1)
+; RV32MV-NEXT: vid.v v27
+; RV32MV-NEXT: vsub.vv v25, v25, v27
+; RV32MV-NEXT: vmul.vv v25, v25, v26
+; RV32MV-NEXT: vsll.vi v26, v25, 1
+; RV32MV-NEXT: vmv.v.i v27, 10
+; RV32MV-NEXT: addi a1, zero, 9
+; RV32MV-NEXT: vmv.s.x v27, a1
+; RV32MV-NEXT: vsll.vv v26, v26, v27
; RV32MV-NEXT: addi a1, zero, 2047
; RV32MV-NEXT: vand.vx v25, v25, a1
-; RV32MV-NEXT: vmsne.vv v0, v25, v26
-; RV32MV-NEXT: vmv.v.i v25, 0
-; RV32MV-NEXT: vmerge.vim v25, v25, -1, v0
+; RV32MV-NEXT: vmv.v.i v27, 0
+; RV32MV-NEXT: addi a2, zero, 1
+; RV32MV-NEXT: vmv1r.v v28, v27
+; RV32MV-NEXT: vmv.s.x v28, a2
+; RV32MV-NEXT: lui a2, %hi(.LCPI4_1)
+; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1)
+; RV32MV-NEXT: vle16.v v29, (a2)
+; RV32MV-NEXT: vsrl.vv v25, v25, v28
+; RV32MV-NEXT: vor.vv v25, v25, v26
+; RV32MV-NEXT: vand.vx v25, v25, a1
+; RV32MV-NEXT: vmsltu.vv v0, v29, v25
+; RV32MV-NEXT: vmerge.vim v25, v27, -1, v0
; RV32MV-NEXT: vsetivli a1, 1, e16,m1,ta,mu
; RV32MV-NEXT: vslidedown.vi v26, v25, 2
; RV32MV-NEXT: vmv.x.s a1, v26
@@ -748,54 +622,12 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV64MV-NEXT: lwu a2, 0(a0)
; RV64MV-NEXT: slli a1, a1, 32
; RV64MV-NEXT: or a1, a2, a1
-; RV64MV-NEXT: srli a2, a1, 11
-; RV64MV-NEXT: andi a2, a2, 2047
-; RV64MV-NEXT: andi a3, a1, 2047
-; RV64MV-NEXT: srli a1, a1, 22
-; RV64MV-NEXT: lui a4, 1027
-; RV64MV-NEXT: addiw a4, a4, -2023
-; RV64MV-NEXT: slli a4, a4, 15
-; RV64MV-NEXT: addi a4, a4, 2005
-; RV64MV-NEXT: slli a4, a4, 12
-; RV64MV-NEXT: addi a4, a4, -431
-; RV64MV-NEXT: slli a4, a4, 13
-; RV64MV-NEXT: addi a4, a4, -429
-; RV64MV-NEXT: mulhu a4, a1, a4
-; RV64MV-NEXT: srli a4, a4, 9
-; RV64MV-NEXT: addi a5, zero, 2043
-; RV64MV-NEXT: mul a4, a4, a5
-; RV64MV-NEXT: sub a1, a1, a4
-; RV64MV-NEXT: sh a1, 12(sp)
-; RV64MV-NEXT: lui a1, 1026731
-; RV64MV-NEXT: addiw a1, a1, -1365
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1365
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1365
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, -1365
-; RV64MV-NEXT: mulhu a1, a3, a1
-; RV64MV-NEXT: srli a1, a1, 2
-; RV64MV-NEXT: addi a4, zero, 6
-; RV64MV-NEXT: mul a1, a1, a4
-; RV64MV-NEXT: sub a1, a3, a1
-; RV64MV-NEXT: sh a1, 8(sp)
-; RV64MV-NEXT: lui a1, 4681
-; RV64MV-NEXT: addiw a1, a1, 585
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 585
-; RV64MV-NEXT: slli a1, a1, 12
-; RV64MV-NEXT: addi a1, a1, 585
-; RV64MV-NEXT: slli a1, a1, 13
-; RV64MV-NEXT: addi a1, a1, 1171
-; RV64MV-NEXT: mulhu a1, a2, a1
-; RV64MV-NEXT: sub a3, a2, a1
-; RV64MV-NEXT: srli a3, a3, 1
-; RV64MV-NEXT: add a1, a3, a1
-; RV64MV-NEXT: srli a1, a1, 2
-; RV64MV-NEXT: slli a3, a1, 3
-; RV64MV-NEXT: sub a1, a1, a3
-; RV64MV-NEXT: add a1, a2, a1
+; RV64MV-NEXT: srli a2, a1, 22
+; RV64MV-NEXT: sh a2, 12(sp)
+; RV64MV-NEXT: andi a2, a1, 2047
+; RV64MV-NEXT: sh a2, 8(sp)
+; RV64MV-NEXT: srli a1, a1, 11
+; RV64MV-NEXT: andi a1, a1, 2047
; RV64MV-NEXT: sh a1, 10(sp)
; RV64MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV64MV-NEXT: addi a1, sp, 8
@@ -803,14 +635,30 @@ define void @test_urem_vec(<3 x i11>* %X) nounwind {
; RV64MV-NEXT: lui a1, %hi(.LCPI4_0)
; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0)
; RV64MV-NEXT: vle16.v v26, (a1)
+; RV64MV-NEXT: vid.v v27
+; RV64MV-NEXT: vsub.vv v25, v25, v27
+; RV64MV-NEXT: vmul.vv v25, v25, v26
+; RV64MV-NEXT: vsll.vi v26, v25, 1
+; RV64MV-NEXT: vmv.v.i v27, 10
+; RV64MV-NEXT: addi a1, zero, 9
+; RV64MV-NEXT: vmv.s.x v27, a1
+; RV64MV-NEXT: vsll.vv v26, v26, v27
; RV64MV-NEXT: addi a1, zero, 2047
; RV64MV-NEXT: vand.vx v25, v25, a1
-; RV64MV-NEXT: vmsne.vv v0, v25, v26
-; RV64MV-NEXT: vmv.v.i v25, 0
-; RV64MV-NEXT: vmerge.vim v25, v25, -1, v0
+; RV64MV-NEXT: vmv.v.i v27, 0
+; RV64MV-NEXT: addi a2, zero, 1
+; RV64MV-NEXT: vmv1r.v v28, v27
+; RV64MV-NEXT: vmv.s.x v28, a2
+; RV64MV-NEXT: lui a3, %hi(.LCPI4_1)
+; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_1)
+; RV64MV-NEXT: vle16.v v29, (a3)
+; RV64MV-NEXT: vsrl.vv v25, v25, v28
+; RV64MV-NEXT: vor.vv v25, v25, v26
+; RV64MV-NEXT: vand.vx v25, v25, a1
+; RV64MV-NEXT: vmsltu.vv v0, v29, v25
+; RV64MV-NEXT: vmerge.vim v25, v27, -1, v0
; RV64MV-NEXT: vmv.x.s a1, v25
; RV64MV-NEXT: andi a1, a1, 2047
-; RV64MV-NEXT: addi a2, zero, 1
; RV64MV-NEXT: vsetivli a3, 1, e16,m1,ta,mu
; RV64MV-NEXT: vslidedown.vi v26, v25, 1
; RV64MV-NEXT: vmv.x.s a3, v26
diff --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
index 2ec65eb9d771..2eac1f450dcc 100644
--- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -4,8 +4,6 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-LABEL: test_srem_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: lsls r0, r0, #3
-; CHECK-NEXT: asrs r0, r0, #3
; CHECK-NEXT: ldr r1, .LCPI0_0
; CHECK-NEXT: muls r1, r0, r1
; CHECK-NEXT: ldr r0, .LCPI0_1
@@ -22,11 +20,11 @@ define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 3210379595 @ 0xbf5a814b
+; CHECK-NEXT: .long 4208200280 @ 0xfad40a58
; CHECK-NEXT: .LCPI0_1:
-; CHECK-NEXT: .long 21691754 @ 0x14afd6a
+; CHECK-NEXT: .long 21691752 @ 0x14afd68
; CHECK-NEXT: .LCPI0_2:
-; CHECK-NEXT: .long 43383509 @ 0x295fad5
+; CHECK-NEXT: .long 43383512 @ 0x295fad8
%srem = srem i29 %X, 99
%cmp = icmp eq i29 %srem, 0
ret i1 %cmp
diff --git a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
index f1c05f759e3f..571f8e5026e5 100644
--- a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll
@@ -5,11 +5,9 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-LABEL: test_urem_odd:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r1, .LCPI0_0
-; CHECK-NEXT: ands r1, r0
+; CHECK-NEXT: muls r1, r0, r1
; CHECK-NEXT: ldr r0, .LCPI0_1
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: ldr r1, .LCPI0_2
-; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: cmp r1, r0
; CHECK-NEXT: blo .LBB0_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r0, #0
@@ -20,11 +18,9 @@ define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 8191 @ 0x1fff
+; CHECK-NEXT: .long 1718091776 @ 0x66680000
; CHECK-NEXT: .LCPI0_1:
-; CHECK-NEXT: .long 3435973837 @ 0xcccccccd
-; CHECK-NEXT: .LCPI0_2:
-; CHECK-NEXT: .long 858993460 @ 0x33333334
+; CHECK-NEXT: .long 859308032 @ 0x33380000
%urem = urem i13 %X, 5
%cmp = icmp eq i13 %urem, 0
ret i1 %cmp
@@ -33,26 +29,31 @@ define i1 @test_urem_odd(i13 %X) nounwind {
define i1 @test_urem_even(i27 %X) nounwind {
; CHECK-LABEL: test_urem_even:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movs r1, #31
-; CHECK-NEXT: lsls r1, r1, #27
-; CHECK-NEXT: bics r0, r1
; CHECK-NEXT: ldr r1, .LCPI1_0
; CHECK-NEXT: muls r1, r0, r1
-; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: rors r1, r0
+; CHECK-NEXT: lsls r0, r1, #26
; CHECK-NEXT: ldr r2, .LCPI1_1
-; CHECK-NEXT: cmp r1, r2
+; CHECK-NEXT: ands r2, r1
+; CHECK-NEXT: lsrs r1, r2, #1
+; CHECK-NEXT: adds r0, r1, r0
+; CHECK-NEXT: lsls r0, r0, #5
+; CHECK-NEXT: ldr r1, .LCPI1_2
+; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: bx lr
; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
-; CHECK-NEXT: .long 3067833783 @ 0xb6db6db7
+; CHECK-NEXT: .long 115043767 @ 0x6db6db7
; CHECK-NEXT: .LCPI1_1:
-; CHECK-NEXT: .long 306783379 @ 0x12492493
+; CHECK-NEXT: .long 134217726 @ 0x7fffffe
+; CHECK-NEXT: .LCPI1_2:
+; CHECK-NEXT: .long 306783392 @ 0x124924a0
%urem = urem i27 %X, 14
%cmp = icmp eq i27 %urem, 0
ret i1 %cmp
@@ -61,12 +62,11 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-LABEL: test_urem_odd_setne:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movs r1, #15
-; CHECK-NEXT: ands r1, r0
-; CHECK-NEXT: ldr r0, .LCPI2_0
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: ldr r1, .LCPI2_1
-; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movs r1, #13
+; CHECK-NEXT: muls r1, r0, r1
+; CHECK-NEXT: movs r0, #15
+; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: cmp r0, #3
; CHECK-NEXT: bhi .LBB2_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r0, #0
@@ -74,12 +74,6 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: bx lr
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI2_0:
-; CHECK-NEXT: .long 3435973837 @ 0xcccccccd
-; CHECK-NEXT: .LCPI2_1:
-; CHECK-NEXT: .long 858993459 @ 0x33333333
%urem = urem i4 %X, 5
%cmp = icmp ne i4 %urem, 0
ret i1 %cmp
@@ -88,12 +82,12 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-LABEL: test_urem_negative_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: ldr r1, .LCPI3_0
-; CHECK-NEXT: ands r1, r0
-; CHECK-NEXT: ldr r0, .LCPI3_1
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: ldr r1, .LCPI3_2
-; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movs r1, #255
+; CHECK-NEXT: adds r1, #52
+; CHECK-NEXT: muls r1, r0, r1
+; CHECK-NEXT: ldr r0, .LCPI3_0
+; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: bhi .LBB3_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r0, #0
@@ -105,10 +99,6 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .long 511 @ 0x1ff
-; CHECK-NEXT: .LCPI3_1:
-; CHECK-NEXT: .long 2837897523 @ 0xa926e133
-; CHECK-NEXT: .LCPI3_2:
-; CHECK-NEXT: .long 8471335 @ 0x814327
%urem = urem i9 %X, -5
%cmp = icmp ne i9 %urem, 0
ret i1 %cmp
@@ -117,71 +107,73 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; CHECK-LABEL: test_urem_vec:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: movs r3, r2
-; CHECK-NEXT: ldr r5, .LCPI4_0
-; CHECK-NEXT: ands r0, r5
-; CHECK-NEXT: ldr r6, .LCPI4_1
-; CHECK-NEXT: muls r6, r0, r6
-; CHECK-NEXT: movs r2, #1
-; CHECK-NEXT: rors r6, r2
-; CHECK-NEXT: ldr r0, .LCPI4_2
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: ldr r3, .LCPI4_0
+; CHECK-NEXT: muls r3, r0, r3
+; CHECK-NEXT: lsls r0, r3, #10
+; CHECK-NEXT: ldr r4, .LCPI4_1
+; CHECK-NEXT: ands r4, r3
+; CHECK-NEXT: lsrs r3, r4, #1
+; CHECK-NEXT: adds r0, r3, r0
+; CHECK-NEXT: ldr r3, .LCPI4_2
+; CHECK-NEXT: ands r3, r0
+; CHECK-NEXT: lsrs r0, r3, #1
+; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: cmp r6, r0
-; CHECK-NEXT: push {r2}
+; CHECK-NEXT: cmp r0, #170
+; CHECK-NEXT: push {r3}
; CHECK-NEXT: pop {r0}
; CHECK-NEXT: bhi .LBB4_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r0, r4
; CHECK-NEXT: .LBB4_2:
-; CHECK-NEXT: ands r1, r5
-; CHECK-NEXT: ldr r6, .LCPI4_3
-; CHECK-NEXT: muls r6, r1, r6
+; CHECK-NEXT: ldr r5, .LCPI4_3
+; CHECK-NEXT: muls r5, r1, r5
; CHECK-NEXT: ldr r1, .LCPI4_4
-; CHECK-NEXT: adds r1, r6, r1
-; CHECK-NEXT: ldr r6, .LCPI4_5
-; CHECK-NEXT: cmp r1, r6
-; CHECK-NEXT: push {r2}
+; CHECK-NEXT: adds r1, r5, r1
+; CHECK-NEXT: movs r5, #73
+; CHECK-NEXT: lsls r5, r5, #23
+; CHECK-NEXT: cmp r1, r5
+; CHECK-NEXT: push {r3}
; CHECK-NEXT: pop {r1}
; CHECK-NEXT: bhi .LBB4_4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: movs r1, r4
; CHECK-NEXT: .LBB4_4:
-; CHECK-NEXT: ands r3, r5
-; CHECK-NEXT: ldr r5, .LCPI4_6
-; CHECK-NEXT: muls r5, r3, r5
-; CHECK-NEXT: ldr r3, .LCPI4_7
-; CHECK-NEXT: adds r3, r5, r3
-; CHECK-NEXT: ldr r5, .LCPI4_8
-; CHECK-NEXT: cmp r3, r5
+; CHECK-NEXT: ldr r5, .LCPI4_5
+; CHECK-NEXT: muls r5, r2, r5
+; CHECK-NEXT: ldr r2, .LCPI4_6
+; CHECK-NEXT: adds r2, r5, r2
+; CHECK-NEXT: ldr r5, .LCPI4_7
+; CHECK-NEXT: ands r5, r2
+; CHECK-NEXT: cmp r5, #1
; CHECK-NEXT: bhi .LBB4_6
; CHECK-NEXT: @ %bb.5:
-; CHECK-NEXT: movs r2, r4
+; CHECK-NEXT: movs r3, r4
; CHECK-NEXT: .LBB4_6:
-; CHECK-NEXT: pop {r4, r5, r6}
+; CHECK-NEXT: movs r2, r3
+; CHECK-NEXT: pop {r4, r5, r7}
; CHECK-NEXT: pop {r3}
; CHECK-NEXT: bx r3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 2047 @ 0x7ff
+; CHECK-NEXT: .long 683 @ 0x2ab
; CHECK-NEXT: .LCPI4_1:
-; CHECK-NEXT: .long 2863311531 @ 0xaaaaaaab
+; CHECK-NEXT: .long 2044 @ 0x7fc
; CHECK-NEXT: .LCPI4_2:
-; CHECK-NEXT: .long 715827882 @ 0x2aaaaaaa
+; CHECK-NEXT: .long 2046 @ 0x7fe
; CHECK-NEXT: .LCPI4_3:
-; CHECK-NEXT: .long 3067833783 @ 0xb6db6db7
+; CHECK-NEXT: .long 3068133376 @ 0xb6e00000
; CHECK-NEXT: .LCPI4_4:
-; CHECK-NEXT: .long 1227133513 @ 0x49249249
+; CHECK-NEXT: .long 1226833920 @ 0x49200000
; CHECK-NEXT: .LCPI4_5:
-; CHECK-NEXT: .long 613566756 @ 0x24924924
+; CHECK-NEXT: .long 819 @ 0x333
; CHECK-NEXT: .LCPI4_6:
-; CHECK-NEXT: .long 2198989619 @ 0x8311eb33
+; CHECK-NEXT: .long 4294965658 @ 0xfffff99a
; CHECK-NEXT: .LCPI4_7:
-; CHECK-NEXT: .long 4191955354 @ 0xf9dc299a
-; CHECK-NEXT: .LCPI4_8:
-; CHECK-NEXT: .long 2102284 @ 0x20140c
+; CHECK-NEXT: .long 2047 @ 0x7ff
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
ret <3 x i1> %cmp
diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
index 61734ae53c23..a0c3125cd7f8 100644
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -4,15 +4,15 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-LABEL: test_srem_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #64874
+; CHECK-NEXT: movw r1, #24493
; CHECK-NEXT: movw r2, #33099
-; CHECK-NEXT: sbfx r0, r0, #0, #29
-; CHECK-NEXT: movt r1, #330
-; CHECK-NEXT: movt r2, #48986
-; CHECK-NEXT: mla r1, r0, r2, r1
-; CHECK-NEXT: movw r2, #64213
+; CHECK-NEXT: movt r1, #41
+; CHECK-NEXT: movt r2, #8026
+; CHECK-NEXT: mla r0, r0, r2, r1
+; CHECK-NEXT: movw r2, #48987
+; CHECK-NEXT: movt r2, #82
+; CHECK-NEXT: bic r1, r0, #-536870912
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: movt r2, #661
; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r0, #1
diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
index feb88de65054..a0247c29f257 100644
--- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
@@ -4,12 +4,12 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-LABEL: test_urem_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #52429
-; CHECK-NEXT: bfc r0, #13, #19
-; CHECK-NEXT: movt r1, #52428
+; CHECK-NEXT: movw r1, #3277
+; CHECK-NEXT: movw r2, #1639
; CHECK-NEXT: muls r1, r0, r1
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: cmn.w r1, #-858993460
+; CHECK-NEXT: bfc r1, #13, #19
+; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r0, #1
; CHECK-NEXT: bx lr
@@ -22,12 +22,13 @@ define i1 @test_urem_even(i27 %X) nounwind {
; CHECK-LABEL: test_urem_even:
; CHECK: @ %bb.0:
; CHECK-NEXT: movw r1, #28087
-; CHECK-NEXT: bic r0, r0, #-134217728
-; CHECK-NEXT: movt r1, #46811
-; CHECK-NEXT: movw r2, #9363
+; CHECK-NEXT: movw r2, #18725
+; CHECK-NEXT: movt r1, #1755
+; CHECK-NEXT: movt r2, #146
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: movt r2, #4681
-; CHECK-NEXT: ror.w r1, r0, #1
+; CHECK-NEXT: ubfx r1, r0, #1, #26
+; CHECK-NEXT: orr.w r0, r1, r0, lsl #26
+; CHECK-NEXT: bic r1, r0, #-134217728
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
@@ -41,12 +42,11 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-LABEL: test_urem_odd_setne:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #52429
-; CHECK-NEXT: and r0, r0, #15
-; CHECK-NEXT: movt r1, #52428
-; CHECK-NEXT: muls r1, r0, r1
+; CHECK-NEXT: movs r1, #13
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: and r1, r0, #15
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: cmp.w r1, #858993459
+; CHECK-NEXT: cmp r1, #3
; CHECK-NEXT: it hi
; CHECK-NEXT: movhi r0, #1
; CHECK-NEXT: bx lr
@@ -58,14 +58,11 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; CHECK-LABEL: test_urem_negative_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #57651
-; CHECK-NEXT: bfc r0, #9, #23
-; CHECK-NEXT: movt r1, #43302
-; CHECK-NEXT: movw r2, #17191
+; CHECK-NEXT: movw r1, #307
; CHECK-NEXT: muls r1, r0, r1
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: movt r2, #129
-; CHECK-NEXT: cmp r1, r2
+; CHECK-NEXT: bfc r1, #9, #23
+; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: it hi
; CHECK-NEXT: movhi r0, #1
; CHECK-NEXT: bx lr
@@ -77,50 +74,54 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; CHECK-LABEL: test_urem_vec:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: movw r3, #18725
-; CHECK-NEXT: bfc r1, #11, #21
-; CHECK-NEXT: movt r3, #9362
-; CHECK-NEXT: bfc r2, #11, #21
-; CHECK-NEXT: umull r3, r12, r1, r3
-; CHECK-NEXT: bfc r0, #11, #21
-; CHECK-NEXT: movw r3, #25663
-; CHECK-NEXT: movt r3, #160
-; CHECK-NEXT: umull r3, lr, r2, r3
-; CHECK-NEXT: vldr d17, .LCPI4_0
-; CHECK-NEXT: movw r3, #43691
-; CHECK-NEXT: movt r3, #43690
-; CHECK-NEXT: umull r3, r4, r0, r3
-; CHECK-NEXT: sub.w r3, r1, r12
-; CHECK-NEXT: add.w r3, r12, r3, lsr #1
-; CHECK-NEXT: lsr.w r12, r3, #2
-; CHECK-NEXT: sub.w r3, r2, lr
-; CHECK-NEXT: lsrs r4, r4, #2
-; CHECK-NEXT: add.w r4, r4, r4, lsl #1
-; CHECK-NEXT: add.w r3, lr, r3, lsr #1
-; CHECK-NEXT: sub.w r0, r0, r4, lsl #1
-; CHECK-NEXT: lsr.w lr, r3, #10
-; CHECK-NEXT: movw r3, #2043
; CHECK-NEXT: vmov.16 d16[0], r0
-; CHECK-NEXT: sub.w r0, r12, r12, lsl #3
-; CHECK-NEXT: mls r2, lr, r3, r2
-; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov.16 d16[1], r0
+; CHECK-NEXT: vldr d17, .LCPI4_0
+; CHECK-NEXT: vmov.16 d16[1], r1
+; CHECK-NEXT: vldr d19, .LCPI4_3
; CHECK-NEXT: vmov.16 d16[2], r2
+; CHECK-NEXT: vsub.i16 d16, d16, d17
+; CHECK-NEXT: vldr d17, .LCPI4_1
+; CHECK-NEXT: vmul.i16 d16, d16, d17
+; CHECK-NEXT: vldr d17, .LCPI4_2
+; CHECK-NEXT: vneg.s16 d17, d17
+; CHECK-NEXT: vshl.i16 d18, d16, #1
+; CHECK-NEXT: vbic.i16 d16, #0xf800
+; CHECK-NEXT: vshl.u16 d16, d16, d17
+; CHECK-NEXT: vshl.u16 d17, d18, d19
+; CHECK-NEXT: vorr d16, d16, d17
+; CHECK-NEXT: vldr d17, .LCPI4_4
; CHECK-NEXT: vbic.i16 d16, #0xf800
-; CHECK-NEXT: vceq.i16 d16, d16, d17
-; CHECK-NEXT: vmvn d16, d16
+; CHECK-NEXT: vcgt.u16 d16, d16, d17
; CHECK-NEXT: vmov.u16 r0, d16[0]
; CHECK-NEXT: vmov.u16 r1, d16[1]
; CHECK-NEXT: vmov.u16 r2, d16[2]
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 3
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .short 0 @ 0x0
; CHECK-NEXT: .short 1 @ 0x1
; CHECK-NEXT: .short 2 @ 0x2
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .LCPI4_1:
+; CHECK-NEXT: .short 683 @ 0x2ab
+; CHECK-NEXT: .short 1463 @ 0x5b7
+; CHECK-NEXT: .short 819 @ 0x333
+; CHECK-NEXT: .zero 2
+; CHECK-NEXT: .LCPI4_2:
+; CHECK-NEXT: .short 1 @ 0x1
+; CHECK-NEXT: .short 0 @ 0x0
+; CHECK-NEXT: .short 0 @ 0x0
+; CHECK-NEXT: .short 0 @ 0x0
+; CHECK-NEXT: .LCPI4_3:
+; CHECK-NEXT: .short 9 @ 0x9
+; CHECK-NEXT: .short 10 @ 0xa
+; CHECK-NEXT: .short 10 @ 0xa
+; CHECK-NEXT: .short 10 @ 0xa
+; CHECK-NEXT: .LCPI4_4:
+; CHECK-NEXT: .short 341 @ 0x155
+; CHECK-NEXT: .short 292 @ 0x124
+; CHECK-NEXT: .short 1 @ 0x1
; CHECK-NEXT: .short 0 @ 0x0
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 534c7121ffcb..637814b756c8 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -83,33 +83,29 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: p4_vector_urem_by_const__splat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; SSE4-NEXT: pmuludq %xmm2, %xmm1
-; SSE4-NEXT: pmuludq %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2
-; SSE4-NEXT: psubd %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT: psrld $1, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: retq
;
@@ -117,16 +113,11 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%t0 = and <4 x i32> %x, <i32 128, i32 128, i32 128, i32 128> ; clearly a power-of-two or zero
@@ -140,59 +131,50 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrld $2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
-; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: p5_vector_urem_by_const__nonsplat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE4-NEXT: pmuludq %xmm2, %xmm3
-; SSE4-NEXT: pmuludq %xmm0, %xmm1
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE4-NEXT: movdqa %xmm1, %xmm2
-; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: psrld $1, %xmm1
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
-; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm1
-; SSE4-NEXT: psubd %xmm1, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE4-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE4-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT: por %xmm2, %xmm1
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588]
+; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX2-LABEL: p5_vector_urem_by_const__nonsplat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%t0 = and <4 x i32> %x, <i32 128, i32 2, i32 4, i32 8>
@@ -206,39 +188,32 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; SSE4-NEXT: pmuludq %xmm2, %xmm1
-; SSE4-NEXT: pmuludq %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2
-; SSE4-NEXT: psubd %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm1
+; SSE4-NEXT: psrld $1, %xmm1
+; SSE4-NEXT: pslld $31, %xmm0
+; SSE4-NEXT: por %xmm1, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: retq
;
@@ -246,17 +221,13 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%t0 = and <4 x i32> %x, <i32 128, i32 128, i32 undef, i32 128>
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index abb310d3a518..166bf345d89b 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -10,22 +10,19 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; X86-LABEL: test_srem_odd:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll $3, %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: imull $-1084587701, %eax, %eax # imm = 0xBF5A814B
-; X86-NEXT: addl $21691754, %eax # imm = 0x14AFD6A
-; X86-NEXT: cmpl $43383509, %eax # imm = 0x295FAD5
+; X86-NEXT: imull $526025035, {{[0-9]+}}(%esp), %eax # imm = 0x1F5A814B
+; X86-NEXT: addl $2711469, %eax # imm = 0x295FAD
+; X86-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF
+; X86-NEXT: cmpl $5422939, %eax # imm = 0x52BF5B
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_srem_odd:
; X64: # %bb.0:
-; X64-NEXT: shll $3, %edi
-; X64-NEXT: sarl $3, %edi
-; X64-NEXT: imull $-1084587701, %edi, %eax # imm = 0xBF5A814B
-; X64-NEXT: addl $21691754, %eax # imm = 0x14AFD6A
-; X64-NEXT: cmpl $43383509, %eax # imm = 0x295FAD5
+; X64-NEXT: imull $526025035, %edi, %eax # imm = 0x1F5A814B
+; X64-NEXT: addl $2711469, %eax # imm = 0x295FAD
+; X64-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF
+; X64-NEXT: cmpl $5422939, %eax # imm = 0x52BF5B
; X64-NEXT: setb %al
; X64-NEXT: retq
%srem = srem i29 %X, 99
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index d19bd1a52288..1a15175fa7a6 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -9,122 +9,80 @@
define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,25,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5
-; CHECK-SSE41-NEXT: psrad $1, %xmm5
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2,3]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -270,112 +228,60 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: pslld $31, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrad $3, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -396,113 +302,60 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6
-; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm6
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm6
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm3, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: pslld $31, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrad $3, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -526,135 +379,80 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -675,135 +473,80 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -829,103 +572,73 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919]
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT: psrad $1, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4
-; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2
-; CHECK-SSE41-NEXT: psrad $3, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3
-; CHECK-SSE41-NEXT: psrad $1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm4
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4
-; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919]
-; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -949,88 +662,73 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1054,122 +752,80 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5
-; CHECK-SSE41-NEXT: psrad $1, %xmm5
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -1255,101 +911,60 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: pslld $31, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -1372,129 +987,80 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -1603,111 +1169,99 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2
-; CHECK-SSE2-NEXT: psrad $3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3
-; CHECK-SSE2-NEXT: psrad $30, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrad $30, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT: psrad $3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: por %xmm5, %xmm4
+; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3
+; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
+; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm4
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -1735,135 +1289,103 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $30, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,2147483648,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $30, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT: psrad $1, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm4
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm4
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm3
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm4
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -1892,130 +1414,76 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3435973837]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $1, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,5]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2038,121 +1506,80 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm5
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,0,1,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2175,134 +1602,80 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $1, %xmm3
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $3, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE41-NEXT: psrad $1, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2388,120 +1761,60 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
-; CHECK-SSE2-NEXT: psrld $31, %xmm5
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: pslld $31, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2524,127 +1837,80 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $1, %xmm3
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2669,124 +1935,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $1, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $3, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2809,109 +2031,80 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm0, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrad $3, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -2934,129 +2127,80 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
-; CHECK-SSE2-NEXT: psrad $1, %xmm5
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE41-NEXT: psrad $3, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1
-; CHECK-SSE41-NEXT: psrad $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -3080,108 +2224,60 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0]
-; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE2-NEXT: psrlq $32, %xmm2
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrad $3, %xmm3
-; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrad $1, %xmm4
-; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: psrlq $32, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrad $1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrlq $32, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
@@ -3203,102 +2299,60 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm4
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0]
-; CHECK-SSE2-NEXT: pand %xmm6, %xmm5
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6
-; CHECK-SSE2-NEXT: psrlq $32, %xmm6
-; CHECK-SSE2-NEXT: psubd %xmm5, %xmm6
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4
-; CHECK-SSE2-NEXT: psrad $3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $31, %xmm6
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm6
-; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,1]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm6
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
-; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: psrlq $32, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrad $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm2
-; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrlq $32, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
-; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index e1e841d921c2..e8c0b947baa1 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -71,87 +71,60 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_100:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
-; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; CHECK-SSE2-NEXT: pand %xmm1, %xmm4
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $31, %xmm1
-; CHECK-SSE2-NEXT: psrad $5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -239,87 +212,60 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_neg100:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535]
-; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0]
-; CHECK-SSE2-NEXT: pand %xmm0, %xmm4
-; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: psrad $5, %xmm2
-; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_srem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u>
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: psrad $5, %xmm2
-; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_srem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_srem_even_neg100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761]
-; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
-; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344]
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index aad3157b9ebd..073161189f97 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -10,20 +10,17 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; X86-LABEL: test_urem_odd:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $3277, {{[0-9]+}}(%esp), %eax # imm = 0xCCD
; X86-NEXT: andl $8191, %eax # imm = 0x1FFF
-; X86-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: cmpl $13108, %eax # imm = 0x3334
+; X86-NEXT: cmpl $1639, %eax # imm = 0x667
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_urem_odd:
; X64: # %bb.0:
-; X64-NEXT: andl $8191, %edi # imm = 0x1FFF
-; X64-NEXT: imull $-13107, %edi, %eax # imm = 0xCCCD
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: cmpl $13108, %eax # imm = 0x3334
+; X64-NEXT: imull $3277, %edi, %eax # imm = 0xCCD
+; X64-NEXT: andl $8191, %eax # imm = 0x1FFF
+; X64-NEXT: cmpl $1639, %eax # imm = 0x667
; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i13 %X, 5
@@ -34,20 +31,27 @@ define i1 @test_urem_odd(i13 %X) nounwind {
define i1 @test_urem_even(i27 %X) nounwind {
; X86-LABEL: test_urem_even:
; X86: # %bb.0:
-; X86-NEXT: movl $134217727, %eax # imm = 0x7FFFFFF
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $-1227133513, %eax, %eax # imm = 0xB6DB6DB7
-; X86-NEXT: rorl %eax
-; X86-NEXT: cmpl $306783379, %eax # imm = 0x12492493
+; X86-NEXT: imull $115043767, {{[0-9]+}}(%esp), %eax # imm = 0x6DB6DB7
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $26, %ecx
+; X86-NEXT: andl $134217726, %eax # imm = 0x7FFFFFE
+; X86-NEXT: shrl %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
+; X86-NEXT: cmpl $9586981, %eax # imm = 0x924925
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_urem_even:
; X64: # %bb.0:
-; X64-NEXT: andl $134217727, %edi # imm = 0x7FFFFFF
-; X64-NEXT: imull $-1227133513, %edi, %eax # imm = 0xB6DB6DB7
-; X64-NEXT: rorl %eax
-; X64-NEXT: cmpl $306783379, %eax # imm = 0x12492493
+; X64-NEXT: imull $115043767, %edi, %eax # imm = 0x6DB6DB7
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shll $26, %ecx
+; X64-NEXT: andl $134217726, %eax # imm = 0x7FFFFFE
+; X64-NEXT: shrl %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
+; X64-NEXT: cmpl $9586981, %eax # imm = 0x924925
; X64-NEXT: setb %al
; X64-NEXT: retq
%urem = urem i27 %X, 14
@@ -58,20 +62,21 @@ define i1 @test_urem_even(i27 %X) nounwind {
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; X86-LABEL: test_urem_odd_setne:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,2), %ecx
+; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $-51, %eax, %eax
-; X86-NEXT: cmpb $51, %al
+; X86-NEXT: cmpb $3, %al
; X86-NEXT: seta %al
; X86-NEXT: retl
;
; X64-LABEL: test_urem_odd_setne:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $-51, %eax, %eax
-; X64-NEXT: cmpb $51, %al
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (%rdi,%rdi,2), %eax
+; X64-NEXT: leal (%rdi,%rax,4), %eax
+; X64-NEXT: andb $15, %al
+; X64-NEXT: cmpb $3, %al
; X64-NEXT: seta %al
; X64-NEXT: retq
%urem = urem i4 %X, 5
@@ -82,20 +87,17 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
define i1 @test_urem_negative_odd(i9 %X) nounwind {
; X86-LABEL: test_urem_negative_odd:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $307, {{[0-9]+}}(%esp), %eax # imm = 0x133
; X86-NEXT: andl $511, %eax # imm = 0x1FF
-; X86-NEXT: imull $-7885, %eax, %eax # imm = 0xE133
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: cmpl $129, %eax
+; X86-NEXT: cmpw $1, %ax
; X86-NEXT: seta %al
; X86-NEXT: retl
;
; X64-LABEL: test_urem_negative_odd:
; X64: # %bb.0:
-; X64-NEXT: andl $511, %edi # imm = 0x1FF
-; X64-NEXT: imull $-7885, %edi, %eax # imm = 0xE133
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: cmpl $129, %eax
+; X64-NEXT: imull $307, %edi, %eax # imm = 0x133
+; X64-NEXT: andl $511, %eax # imm = 0x1FF
+; X64-NEXT: cmpw $1, %ax
; X64-NEXT: seta %al
; X64-NEXT: retq
%urem = urem i9 %X, -5
@@ -106,67 +108,55 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; X86-LABEL: test_urem_vec:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl $2047, %edx # imm = 0x7FF
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $683, {{[0-9]+}}(%esp), %eax # imm = 0x2AB
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $10, %ecx
+; X86-NEXT: andl $2046, %eax # imm = 0x7FE
+; X86-NEXT: shrl %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: andl $2047, %eax # imm = 0x7FF
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $2047, %ecx # imm = 0x7FF
-; X86-NEXT: imull $-5325, %ecx, %ecx # imm = 0xEB33
-; X86-NEXT: addl $10650, %ecx # imm = 0x299A
-; X86-NEXT: cmpw $32, %cx
-; X86-NEXT: seta %cl
-; X86-NEXT: imull $-21845, %eax, %eax # imm = 0xAAAB
-; X86-NEXT: rorw %ax
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: cmpl $10922, %eax # imm = 0x2AAA
+; X86-NEXT: cmpl $341, %eax # imm = 0x155
; X86-NEXT: seta %al
-; X86-NEXT: imull $28087, %edx, %edx # imm = 0x6DB7
-; X86-NEXT: addl $-28087, %edx # imm = 0x9249
-; X86-NEXT: movzwl %dx, %edx
-; X86-NEXT: cmpl $9362, %edx # imm = 0x2492
+; X86-NEXT: imull $1463, {{[0-9]+}}(%esp), %ecx # imm = 0x5B7
+; X86-NEXT: addl $-1463, %ecx # imm = 0xFA49
+; X86-NEXT: andl $2047, %ecx # imm = 0x7FF
+; X86-NEXT: cmpl $292, %ecx # imm = 0x124
; X86-NEXT: seta %dl
+; X86-NEXT: imull $819, {{[0-9]+}}(%esp), %ecx # imm = 0x333
+; X86-NEXT: addl $-1638, %ecx # imm = 0xF99A
+; X86-NEXT: andl $2047, %ecx # imm = 0x7FF
+; X86-NEXT: cmpw $1, %cx
+; X86-NEXT: seta %cl
; X86-NEXT: retl
;
; SSE2-LABEL: test_urem_vec:
; SSE2: # %bb.0:
-; SSE2-NEXT: movl %esi, %eax
-; SSE2-NEXT: andl $2047, %eax # imm = 0x7FF
-; SSE2-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: subl %ecx, %eax
-; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: addl %ecx, %eax
-; SSE2-NEXT: shrl $2, %eax
-; SSE2-NEXT: leal (,%rax,8), %ecx
-; SSE2-NEXT: subl %ecx, %eax
-; SSE2-NEXT: addl %esi, %eax
-; SSE2-NEXT: andl $2047, %edi # imm = 0x7FF
-; SSE2-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB
-; SSE2-NEXT: shrl $17, %ecx
-; SSE2-NEXT: andl $-2, %ecx
-; SSE2-NEXT: leal (%rcx,%rcx,2), %ecx
-; SSE2-NEXT: subl %ecx, %edi
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pinsrw $2, %eax, %xmm0
-; SSE2-NEXT: movl %edx, %eax
-; SSE2-NEXT: andl $2047, %eax # imm = 0x7FF
-; SSE2-NEXT: imull $161, %eax, %ecx
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: subl %ecx, %eax
-; SSE2-NEXT: movzwl %ax, %eax
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: addl %ecx, %eax
-; SSE2-NEXT: shrl $10, %eax
-; SSE2-NEXT: imull $2043, %eax, %eax # imm = 0x7FB
-; SSE2-NEXT: subl %eax, %edx
-; SSE2-NEXT: pinsrw $4, %edx, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: psubd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <683,1463,819,u>
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2047,2047,2047,2047]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3]
+; SSE2-NEXT: pslld $10, %xmm1
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
@@ -174,45 +164,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
;
; SSE41-LABEL: test_urem_vec:
; SSE41: # %bb.0:
-; SSE41-NEXT: movl %esi, %eax
-; SSE41-NEXT: andl $2047, %eax # imm = 0x7FF
-; SSE41-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
-; SSE41-NEXT: shrl $16, %ecx
-; SSE41-NEXT: subl %ecx, %eax
-; SSE41-NEXT: movzwl %ax, %eax
-; SSE41-NEXT: shrl %eax
-; SSE41-NEXT: addl %ecx, %eax
-; SSE41-NEXT: shrl $2, %eax
-; SSE41-NEXT: leal (,%rax,8), %ecx
-; SSE41-NEXT: subl %ecx, %eax
-; SSE41-NEXT: addl %esi, %eax
-; SSE41-NEXT: andl $2047, %edi # imm = 0x7FF
-; SSE41-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB
-; SSE41-NEXT: shrl $17, %ecx
-; SSE41-NEXT: andl $-2, %ecx
-; SSE41-NEXT: leal (%rcx,%rcx,2), %ecx
-; SSE41-NEXT: subl %ecx, %edi
; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
-; SSE41-NEXT: movl %edx, %eax
-; SSE41-NEXT: andl $2047, %eax # imm = 0x7FF
-; SSE41-NEXT: imull $161, %eax, %ecx
-; SSE41-NEXT: shrl $16, %ecx
-; SSE41-NEXT: subl %ecx, %eax
-; SSE41-NEXT: movzwl %ax, %eax
-; SSE41-NEXT: shrl %eax
-; SSE41-NEXT: addl %ecx, %eax
-; SSE41-NEXT: shrl $10, %eax
-; SSE41-NEXT: imull $2043, %eax, %eax # imm = 0x7FB
-; SSE41-NEXT: subl %eax, %edx
-; SSE41-NEXT: pinsrw $4, %edx, %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: pextrb $4, %xmm1, %edx
-; SSE41-NEXT: pextrb $8, %xmm1, %ecx
+; SSE41-NEXT: pinsrd $1, %esi, %xmm0
+; SSE41-NEXT: pinsrd $2, %edx, %xmm0
+; SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: psrld $1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: pslld $10, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm1, %xmm3
+; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: pextrb $4, %xmm3, %edx
+; SSE41-NEXT: pextrb $8, %xmm3, %ecx
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: # kill: def $dl killed $dl killed $edx
; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -220,42 +190,21 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
;
; AVX1-LABEL: test_urem_vec:
; AVX1: # %bb.0:
-; AVX1-NEXT: movl %esi, %eax
-; AVX1-NEXT: andl $2047, %eax # imm = 0x7FF
-; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: subl %ecx, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: addl %ecx, %eax
-; AVX1-NEXT: shrl $2, %eax
-; AVX1-NEXT: leal (,%rax,8), %ecx
-; AVX1-NEXT: subl %ecx, %eax
-; AVX1-NEXT: addl %esi, %eax
-; AVX1-NEXT: andl $2047, %edi # imm = 0x7FF
-; AVX1-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB
-; AVX1-NEXT: shrl $17, %ecx
-; AVX1-NEXT: andl $-2, %ecx
-; AVX1-NEXT: leal (%rcx,%rcx,2), %ecx
-; AVX1-NEXT: subl %ecx, %edi
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edx, %eax
-; AVX1-NEXT: andl $2047, %eax # imm = 0x7FF
-; AVX1-NEXT: imull $161, %eax, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: subl %ecx, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: addl %ecx, %eax
-; AVX1-NEXT: shrl $10, %eax
-; AVX1-NEXT: imull $2043, %eax, %eax # imm = 0x7FB
-; AVX1-NEXT: subl %eax, %edx
-; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT: vpslld $10, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrb $4, %xmm0, %edx
; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
@@ -266,43 +215,18 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
;
; AVX2-LABEL: test_urem_vec:
; AVX2: # %bb.0:
-; AVX2-NEXT: andl $2047, %esi # imm = 0x7FF
-; AVX2-NEXT: imull $9363, %esi, %eax # imm = 0x2493
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: subl %eax, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: leal (,%rcx,8), %eax
-; AVX2-NEXT: subl %eax, %ecx
-; AVX2-NEXT: addl %esi, %ecx
-; AVX2-NEXT: andl $2047, %edi # imm = 0x7FF
-; AVX2-NEXT: imull $43691, %edi, %eax # imm = 0xAAAB
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $-2, %eax
-; AVX2-NEXT: leal (%rax,%rax,2), %eax
-; AVX2-NEXT: subl %eax, %edi
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: andl $2047, %edx # imm = 0x7FF
-; AVX2-NEXT: imull $161, %edx, %eax
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: subl %eax, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: addl %eax, %ecx
-; AVX2-NEXT: shrl $10, %ecx
-; AVX2-NEXT: imull $2043, %ecx, %eax # imm = 0x7FB
-; AVX2-NEXT: subl %eax, %edx
-; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vpextrb $4, %xmm0, %edx
; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
@@ -313,40 +237,17 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
;
; AVX512VL-LABEL: test_urem_vec:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: andl $2047, %esi # imm = 0x7FF
-; AVX512VL-NEXT: imull $9363, %esi, %eax # imm = 0x2493
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: subl %eax, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: shrl %ecx
-; AVX512VL-NEXT: addl %eax, %ecx
-; AVX512VL-NEXT: shrl $2, %ecx
-; AVX512VL-NEXT: leal (,%rcx,8), %eax
-; AVX512VL-NEXT: subl %eax, %ecx
-; AVX512VL-NEXT: addl %esi, %ecx
-; AVX512VL-NEXT: andl $2047, %edi # imm = 0x7FF
-; AVX512VL-NEXT: imull $43691, %edi, %eax # imm = 0xAAAB
-; AVX512VL-NEXT: shrl $17, %eax
-; AVX512VL-NEXT: andl $-2, %eax
-; AVX512VL-NEXT: leal (%rax,%rax,2), %eax
-; AVX512VL-NEXT: subl %eax, %edi
; AVX512VL-NEXT: vmovd %edi, %xmm0
-; AVX512VL-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT: andl $2047, %edx # imm = 0x7FF
-; AVX512VL-NEXT: imull $161, %edx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: subl %eax, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: shrl %ecx
-; AVX512VL-NEXT: addl %eax, %ecx
-; AVX512VL-NEXT: shrl $10, %ecx
-; AVX512VL-NEXT: imull $2043, %ecx, %eax # imm = 0x7FB
-; AVX512VL-NEXT: subl %eax, %edx
-; AVX512VL-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpneqd {{.*}}(%rip), %xmm0, %k0
+; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogd $200, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: vpcmpnleud {{.*}}(%rip), %xmm0, %k0
; AVX512VL-NEXT: kshiftrw $1, %k0, %k1
; AVX512VL-NEXT: kmovw %k1, %edx
; AVX512VL-NEXT: kshiftrw $2, %k0, %k1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
index 5099b5605d70..671d7c21013d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -295,17 +295,19 @@ define i1 @t8_3_2(i8 %X) nounwind {
define i1 @t64_3_2(i64 %X) nounwind {
; X86-LABEL: t64_3_2:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $3
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll __umoddi3
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: xorl $2, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT: adcl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT: sbbl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: t64_3_2:
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 091a351239d8..8f9c45ae3b27 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -9,98 +9,71 @@
define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $2, %xmm3
-; CHECK-SSE2-NEXT: psrld $3, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,25,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $3, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -197,84 +170,64 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_allones_eq:
; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -295,87 +248,66 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379]
+; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_allones_ne:
; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne:
@@ -396,98 +328,71 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $2, %xmm3
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -508,101 +413,73 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $2, %xmm3
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673]
+; CHECK-SSE41-NEXT: pmaxud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne:
@@ -625,73 +502,64 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u>
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -714,81 +582,64 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -811,95 +662,71 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $2, %xmm3
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,16,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -979,81 +806,54 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_one:
; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pslld $31, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_one:
; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_one:
; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1076,93 +876,71 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1187,73 +965,64 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u>
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1276,81 +1045,64 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1373,95 +1125,71 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: psrld $2, %xmm3
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,2147483648,100]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1486,89 +1214,66 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3435973837]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1591,98 +1296,71 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,14]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,268435456,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1705,93 +1383,71 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,268435456,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
-; CHECK-SSE2-NEXT: psrld $5, %xmm4
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1855,95 +1511,71 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,0,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,1,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: psrld $31, %xmm4
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4
-; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -1966,88 +1598,71 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -2072,83 +1687,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -2171,92 +1774,71 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: psrld $1, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,268435456,1,2147483648]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -2279,85 +1861,71 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,16,1,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1073741824]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: por %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -2381,90 +1949,66 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,1]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -2486,97 +2030,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT: psrld $1, %xmm4
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: por %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index 0d2c8062aa8b..f98f7428164f 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -119,67 +119,59 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t32_6_part0:
; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $1, %xmm0
+; CHECK-SSE2-NEXT: pslld $31, %xmm3
+; CHECK-SSE2-NEXT: por %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_6_part0:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_6_part0:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_6_part0:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_6_part0:
@@ -198,67 +190,58 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t32_6_part1:
; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $1, %xmm0
+; CHECK-SSE2-NEXT: pslld $31, %xmm3
+; CHECK-SSE2-NEXT: por %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_6_part1:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pslld $31, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_6_part1:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_6_part1:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_6_part1:
@@ -277,71 +260,49 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t32_tautological:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,3]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_tautological:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: psrld $1, %xmm3
-; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_tautological:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_tautological:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_tautological:
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 1eea18758907..ef7a07092b77 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -65,73 +65,55 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_100:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -196,74 +178,51 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_neg100:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: psrld $27, %xmm2
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psrld $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $27, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $27, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_neg100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
More information about the llvm-commits
mailing list