[llvm] r366640 - [NFC][Codegen][X86][AArch64] Add "(x s% C) == 0" tests
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 20 12:25:45 PDT 2019
Author: lebedevri
Date: Sat Jul 20 12:25:44 2019
New Revision: 366640
URL: http://llvm.org/viewvc/llvm-project?rev=366640&view=rev
Log:
[NFC][Codegen][X86][AArch64] Add "(x s% C) == 0" tests
Much like with `urem`, the same optimization (albeit with slightly
different algorithm) applies for the signed case, too.
I'm simply copying the test coverage from the `urem` case for now;
I believe it should be (close to?) sufficient.
Added:
llvm/trunk/test/CodeGen/AArch64/srem-seteq-optsize.ll
llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
llvm/trunk/test/CodeGen/AArch64/srem-seteq.ll
llvm/trunk/test/CodeGen/X86/srem-seteq-optsize.ll
llvm/trunk/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/trunk/test/CodeGen/X86/srem-seteq-vec-splat.ll
llvm/trunk/test/CodeGen/X86/srem-seteq.ll
Added: llvm/trunk/test/CodeGen/AArch64/srem-seteq-optsize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/srem-seteq-optsize.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/srem-seteq-optsize.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/srem-seteq-optsize.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK-LABEL: test_minsize:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: sdiv w8, w0, w8
+; CHECK-NEXT: add w8, w8, w8, lsl #2
+; CHECK-NEXT: mov w9, #-10
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: csel w0, w8, w9, eq
+; CHECK-NEXT: ret
+ %rem = srem i32 %X, 5
+ %cmp = icmp eq i32 %rem, 0
+ %ret = select i1 %cmp, i32 42, i32 -10
+ ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK-LABEL: test_optsize:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #26215
+; CHECK-NEXT: movk w8, #26214, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x10, x8, #63
+; CHECK-NEXT: asr x8, x8, #33
+; CHECK-NEXT: add w8, w8, w10
+; CHECK-NEXT: add w8, w8, w8, lsl #2
+; CHECK-NEXT: mov w9, #-10
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: csel w0, w8, w9, eq
+; CHECK-NEXT: ret
+ %rem = srem i32 %X, 5
+ %cmp = icmp eq i32 %rem, 0
+ %ret = select i1 %cmp, i32 42, i32 -10
+ ret i32 %ret
+}
Added: llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,802 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; At the moment, BuildSREMEqFold does not handle nonsplat vectors.
+
+; Odd+Even divisors
+define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: adrp x8, .LCPI0_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: adrp x8, .LCPI0_2
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: adrp x8, .LCPI0_3
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: usra v3.4s, v1.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;==============================================================================;
+
+; One all-ones divisor in odd divisor
+define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_allones_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: adrp x8, .LCPI1_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
+; CHECK-NEXT: adrp x8, .LCPI1_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
+; CHECK-NEXT: adrp x8, .LCPI1_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_3]
+; CHECK-NEXT: adrp x8, .LCPI1_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_allones_ne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: adrp x8, .LCPI2_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; CHECK-NEXT: adrp x8, .LCPI2_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
+; CHECK-NEXT: adrp x8, .LCPI2_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_3]
+; CHECK-NEXT: adrp x8, .LCPI2_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor in even divisor
+define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_allones_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: adrp x8, .LCPI3_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: adrp x8, .LCPI3_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
+; CHECK-NEXT: adrp x8, .LCPI3_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3]
+; CHECK-NEXT: adrp x8, .LCPI3_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_allones_ne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: adrp x8, .LCPI4_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT: adrp x8, .LCPI4_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT: adrp x8, .LCPI4_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT: adrp x8, .LCPI4_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_allones_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: adrp x8, .LCPI5_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; CHECK-NEXT: adrp x8, .LCPI5_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; CHECK-NEXT: adrp x8, .LCPI5_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT: adrp x8, .LCPI5_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_allones_ne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT: adrp x8, .LCPI6_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1]
+; CHECK-NEXT: adrp x8, .LCPI6_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2]
+; CHECK-NEXT: adrp x8, .LCPI6_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT: adrp x8, .LCPI6_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor in odd divisor
+define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI7_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT: adrp x8, .LCPI7_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1]
+; CHECK-NEXT: adrp x8, .LCPI7_2
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2]
+; CHECK-NEXT: adrp x8, .LCPI7_3
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: usra v3.4s, v1.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor in even divisor
+define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT: adrp x8, .LCPI8_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1]
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: sshr v3.4s, v1.4s, #3
+; CHECK-NEXT: usra v3.4s, v1.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI9_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT: adrp x8, .LCPI9_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
+; CHECK-NEXT: adrp x8, .LCPI9_2
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
+; CHECK-NEXT: adrp x8, .LCPI9_3
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: usra v3.4s, v1.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One one divisor in odd divisor
+define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: adrp x8, .LCPI10_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_1]
+; CHECK-NEXT: adrp x8, .LCPI10_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2]
+; CHECK-NEXT: adrp x8, .LCPI10_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_3]
+; CHECK-NEXT: adrp x8, .LCPI10_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One one divisor in even divisor
+define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: adrp x8, .LCPI11_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1]
+; CHECK-NEXT: adrp x8, .LCPI11_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2]
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: adrp x8, .LCPI11_3
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One one divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI12_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT: adrp x8, .LCPI12_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
+; CHECK-NEXT: adrp x8, .LCPI12_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2]
+; CHECK-NEXT: adrp x8, .LCPI12_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: adrp x8, .LCPI12_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;==============================================================================;
+
+; One all-ones divisor and one power-of-two divisor in odd divisor
+define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI13_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: adrp x8, .LCPI13_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
+; CHECK-NEXT: adrp x8, .LCPI13_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
+; CHECK-NEXT: adrp x8, .LCPI13_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT: adrp x8, .LCPI13_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one power-of-two divisor in even divisor
+define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: adrp x8, .LCPI14_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
+; CHECK-NEXT: adrp x8, .LCPI14_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
+; CHECK-NEXT: adrp x8, .LCPI14_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT: adrp x8, .LCPI14_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one power-of-two divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI15_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: adrp x8, .LCPI15_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
+; CHECK-NEXT: adrp x8, .LCPI15_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
+; CHECK-NEXT: adrp x8, .LCPI15_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_3]
+; CHECK-NEXT: adrp x8, .LCPI15_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One all-ones divisor and one one divisor in odd divisor
+define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: adrp x8, .LCPI16_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
+; CHECK-NEXT: adrp x8, .LCPI16_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2]
+; CHECK-NEXT: adrp x8, .LCPI16_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: adrp x8, .LCPI16_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in even divisor
+define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI17_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT: adrp x8, .LCPI17_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1]
+; CHECK-NEXT: adrp x8, .LCPI17_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2]
+; CHECK-NEXT: adrp x8, .LCPI17_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT: adrp x8, .LCPI17_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI18_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT: adrp x8, .LCPI18_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1]
+; CHECK-NEXT: adrp x8, .LCPI18_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2]
+; CHECK-NEXT: adrp x8, .LCPI18_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: adrp x8, .LCPI18_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor and one one divisor in odd divisor
+define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: adrp x8, .LCPI19_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1]
+; CHECK-NEXT: adrp x8, .LCPI19_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_2]
+; CHECK-NEXT: adrp x8, .LCPI19_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3]
+; CHECK-NEXT: adrp x8, .LCPI19_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor and one one divisor in even divisor
+define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI20_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
+; CHECK-NEXT: adrp x8, .LCPI20_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1]
+; CHECK-NEXT: adrp x8, .LCPI20_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2]
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: adrp x8, .LCPI20_3
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor and one one divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI21_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: adrp x8, .LCPI21_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1]
+; CHECK-NEXT: adrp x8, .LCPI21_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2]
+; CHECK-NEXT: adrp x8, .LCPI21_3
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: adrp x8, .LCPI21_4
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI22_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-NEXT: adrp x8, .LCPI22_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1]
+; CHECK-NEXT: adrp x8, .LCPI22_2
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_2]
+; CHECK-NEXT: adrp x8, .LCPI22_3
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: neg v4.4s, v4.4s
+; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI23_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT: adrp x8, .LCPI23_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
+; CHECK-NEXT: adrp x8, .LCPI23_2
+; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT: adrp x8, .LCPI23_3
+; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT: neg v4.4s, v4.4s
+; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: ushr v1.4s, v1.4s, #31
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
Added: llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-splat.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-splat.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/srem-seteq-vec-splat.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; Odd divisor
+define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_25:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
+; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: sshr v3.4s, v2.4s, #3
+; CHECK-NEXT: movi v1.4s, #25
+; CHECK-NEXT: usra v3.4s, v2.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; Even divisors
+define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_100:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
+; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: sshr v3.4s, v2.4s, #5
+; CHECK-NEXT: movi v1.4s, #100
+; CHECK-NEXT: usra v3.4s, v2.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+; Comparison constant has undef elements.
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_odd_undef1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
+; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: sshr v3.4s, v2.4s, #3
+; CHECK-NEXT: movi v1.4s, #25
+; CHECK-NEXT: usra v3.4s, v2.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 undef, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_even_undef1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
+; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: sshr v3.4s, v2.4s, #5
+; CHECK-NEXT: movi v1.4s, #100
+; CHECK-NEXT: usra v3.4s, v2.4s, #31
+; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 undef, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+; Negative tests
+;------------------------------------------------------------------------------;
+
+; We can lower remainder of division by powers of two much better elsewhere.
+define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_pow2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v1.4s, v0.4s, #31
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: usra v2.4s, v1.4s, #28
+; CHECK-NEXT: bic v2.4s, #15
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; We could lower remainder of division by all-ones much better elsewhere.
+define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_allones:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; If all divisors are ones, this is constant-folded.
+define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_one_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_srem_one_ne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ret
+ %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
Added: llvm/trunk/test/CodeGen/AArch64/srem-seteq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/srem-seteq.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/srem-seteq.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/srem-seteq.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+;------------------------------------------------------------------------------;
+; Odd divisors
+;------------------------------------------------------------------------------;
+
+define i32 @test_srem_odd(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_odd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #26215
+; CHECK-NEXT: movk w8, #26214, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #33
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w8, w8, w8, lsl #2
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 5
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @test_srem_odd_25(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_odd_25:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #35
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #25
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 25
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_odd, except the divisor has bit 30 set.
+define i32 @test_srem_odd_bit30(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_odd_bit30:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: sbfiz x9, x0, #29, #32
+; CHECK-NEXT: sub x8, x9, x8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #59
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #3
+; CHECK-NEXT: movk w9, #16384, lsl #16
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 1073741827
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_odd, except the divisor has bit 31 set.
+define i32 @test_srem_odd_bit31(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_odd_bit31:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: add x8, x8, x8, lsl #29
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #60
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #-2147483645
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 2147483651
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Even divisors
+;------------------------------------------------------------------------------;
+
+define i16 @test_srem_even(i16 %X) nounwind {
+; CHECK-LABEL: test_srem_even:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w9, #9363
+; CHECK-NEXT: sxth w8, w0
+; CHECK-NEXT: movk w9, #37449, lsl #16
+; CHECK-NEXT: smull x9, w8, w9
+; CHECK-NEXT: lsr x9, x9, #32
+; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: asr w10, w9, #3
+; CHECK-NEXT: add w9, w10, w9, lsr #31
+; CHECK-NEXT: mov w10, #14
+; CHECK-NEXT: msub w8, w9, w10, w8
+; CHECK-NEXT: tst w8, #0xffff
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %srem = srem i16 %X, 14
+ %cmp = icmp ne i16 %srem, 0
+ %ret = zext i1 %cmp to i16
+ ret i16 %ret
+}
+
+define i32 @test_srem_even_100(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_even_100:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #34079
+; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #37
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #100
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 100
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_even, except the divisor has bit 30 set.
+define i32 @test_srem_even_bit30(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_even_bit30:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #65433
+; CHECK-NEXT: movk w8, #16383, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #60
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: mov w9, #104
+; CHECK-NEXT: movk w9, #16384, lsl #16
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 1073741928
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_even, except the divisor has bit 31 set.
+define i32 @test_srem_even_bit31(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_even_bit31:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #65433
+; CHECK-NEXT: movk w8, #32767, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: sub w8, w8, w0
+; CHECK-NEXT: asr w9, w8, #30
+; CHECK-NEXT: add w8, w9, w8, lsr #31
+; CHECK-NEXT: mov w9, #102
+; CHECK-NEXT: movk w9, #32768, lsl #16
+; CHECK-NEXT: msub w8, w8, w9, w0
+; CHECK-NEXT: cmp w8, #0 // =0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 2147483750
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Special case
+;------------------------------------------------------------------------------;
+
+; 'NE' predicate is fine too.
+define i32 @test_srem_odd_setne(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_odd_setne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #26215
+; CHECK-NEXT: movk w8, #26214, lsl #16
+; CHECK-NEXT: smull x8, w0, w8
+; CHECK-NEXT: lsr x9, x8, #63
+; CHECK-NEXT: asr x8, x8, #33
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: add w8, w8, w8, lsl #2
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 5
+ %cmp = icmp ne i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Negative tests
+;------------------------------------------------------------------------------;
+
+; The fold is invalid if divisor is 1.
+define i32 @test_srem_one(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 1
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; We can lower remainder of division by all-ones much better elsewhere.
+define i32 @test_srem_allones(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_allones:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #0 // =0
+; CHECK-NEXT: csel w8, w0, w0, lt
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 4294967295
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere.
+define i32 @test_srem_pow2(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_pow2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add w8, w0, #15 // =15
+; CHECK-NEXT: cmp w0, #0 // =0
+; CHECK-NEXT: csel w8, w8, w0, lt
+; CHECK-NEXT: and w8, w8, #0xfffffff0
+; CHECK-NEXT: cmp w0, w8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %srem = srem i32 %X, 16
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
Added: llvm/trunk/test/CodeGen/X86/srem-seteq-optsize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/srem-seteq-optsize.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/srem-seteq-optsize.ll (added)
+++ llvm/trunk/test/CodeGen/X86/srem-seteq-optsize.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64
+
+; On X86, division is expensive. BuildRemEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; X86-LABEL: test_minsize:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl $5
+; X86-NEXT: popl %ecx
+; X86-NEXT: cltd
+; X86-NEXT: idivl %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB0_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: pushl $-10
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: pushl $42
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test_minsize:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: pushq $5
+; X64-NEXT: popq %rcx
+; X64-NEXT: cltd
+; X64-NEXT: idivl %ecx
+; X64-NEXT: testl %edx, %edx
+; X64-NEXT: pushq $42
+; X64-NEXT: popq %rcx
+; X64-NEXT: pushq $-10
+; X64-NEXT: popq %rax
+; X64-NEXT: cmovel %ecx, %eax
+; X64-NEXT: retq
+ %rem = srem i32 %X, 5
+ %cmp = icmp eq i32 %rem, 0
+ %ret = select i1 %cmp, i32 42, i32 -10
+ ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; X86-LABEL: test_optsize:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: leal (%edx,%edx,4), %eax
+; X86-NEXT: cmpl %eax, %ecx
+; X86-NEXT: movl $42, %eax
+; X86-NEXT: je .LBB1_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl $-10, %eax
+; X86-NEXT: .LBB1_2:
+; X86-NEXT: retl
+;
+; X64-LABEL: test_optsize:
+; X64: # %bb.0:
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: imulq $1717986919, %rax, %rcx # imm = 0x66666667
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: sarq $33, %rcx
+; X64-NEXT: addl %edx, %ecx
+; X64-NEXT: leal (%rcx,%rcx,4), %ecx
+; X64-NEXT: cmpl %ecx, %eax
+; X64-NEXT: movl $42, %ecx
+; X64-NEXT: movl $-10, %eax
+; X64-NEXT: cmovel %ecx, %eax
+; X64-NEXT: retq
+ %rem = srem i32 %X, 5
+ %cmp = icmp eq i32 %rem, 0
+ %ret = select i1 %cmp, i32 42, i32 -10
+ ret i32 %ret
+}
Added: llvm/trunk/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll (added)
+++ llvm/trunk/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,3535 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE41
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL
+
+; Odd+Even divisors
+define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,0,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,25,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,0,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE41-NEXT: psrad $1, %xmm4
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;==============================================================================;
+
+; One all-ones divisor among odd divisors
+define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_allones_eq:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919]
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_allones_eq:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_allones_eq:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_allones_eq:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_allones_eq:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+; Same divisors as the _eq test above (<5, 5, -1, 5>; lane 2 is all-ones),
+; but with an 'icmp ne' predicate, zero-extended to <4 x i32>.
+define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_allones_ne:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919]
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_allones_ne:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor lane in an otherwise even-divisor vector
+; (X srem <14, 14, -1, 14>) == 0, zero-extended to <4 x i32>.
+; Lane 2's divisor is all-ones (i32 4294967295 == -1).
+define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_allones_eq:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_allones_eq:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_allones_eq:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_allones_eq:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_allones_eq:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+; Same divisors as the _eq test above (<14, 14, -1, 14>; lane 2 is all-ones),
+; but with an 'icmp ne' predicate, zero-extended to <4 x i32>.
+define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_allones_ne:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = <1,u,4294967295,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm6
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm6
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm3, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_allones_ne:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor lane in a vector mixing odd and even divisors
+; (X srem <5, 14, -1, 100>) == 0, zero-extended to <4 x i32>.
+; Mixed odd/even divisors, with an all-ones (-1) divisor in lane 2.
+define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor in odd divisor
+; Checks codegen of (X srem <5, 5, 16, 5>) == 0 -- odd divisors with one
+; power-of-two lane (16). The srem is expanded without a division
+; instruction: magic-constant multiply (pmuludq/pmuldq by 1717986919 and
+; 2147483649), per-lane arithmetic shifts, multiply-back and psubd, then
+; pcmpeqd against zero with psrld $31 materializing the zext'd i1 result.
+define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919]
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <0,u,1,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
+; CHECK-SSE2-NEXT: psrad $1, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u>
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrad $1, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; NOTE(review): the CHECK lines above are autogenerated by
+; utils/update_llc_test_checks.py -- regenerate rather than hand-editing.
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor in even divisor
+; Checks codegen of (X srem <14, 14, 16, 14>) == 0 -- even divisors with one
+; power-of-two lane (16). Note the extra add of the dividend (paddd %xmm0)
+; before the shift step, which the even-divisor magic (2454267027) requires;
+; the result is multiplied back, subtracted, and compared against zero with
+; pcmpeqd + psrld $31 producing the zext'd i1 result.
+define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; NOTE(review): the CHECK lines above are autogenerated by
+; utils/update_llc_test_checks.py -- regenerate rather than hand-editing.
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor in odd+even divisor
+; Checks codegen of (X srem <5, 14, 16, 100>) == 0 -- a mix of odd, even,
+; power-of-two, and composite divisors. Each lane needs a different
+; post-multiply shift (psrad $1/$3/$5, blended per lane) before the
+; multiply-back by [5,14,16,100], psubd, and pcmpeqd/psrld $31 finish.
+define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE41-NEXT: psrad $1, %xmm4
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; NOTE(review): the CHECK lines above are autogenerated by
+; utils/update_llc_test_checks.py -- regenerate rather than hand-editing.
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One divisor equal to one (i32 1) in an otherwise odd-divisor vector
+; Checks codegen of (X srem <5, 5, 1, 5>) == 0 -- odd divisors with one
+; divisor-of-one lane. The lane dividing by 1 gets a zero magic multiplier
+; (the 0 in [1717986919,1717986919,0,1717986919] / the blend against a
+; zeroed register), so its remainder is forced to 0; the other lanes use
+; the usual multiply-shift expansion with pcmpeqd + psrld $31 at the end.
+define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919]
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,1,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $1717986919, %eax # imm = 0x66666667
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; NOTE(review): the CHECK lines above are autogenerated by
+; utils/update_llc_test_checks.py -- regenerate rather than hand-editing.
+ %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One divisor equal to one (i32 1) in an otherwise even-divisor vector
+define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
+; Purpose: pin the codegen for (X srem <14, 14, 1, 14>) == 0, zero-extended to <4 x i32>,
+; across SSE2/SSE41/AVX1/AVX2/AVX512VL. The CHECK lines below are autogenerated by
+; utils/update_llc_test_checks.py (see file header) — regenerate rather than hand-edit.
+; CHECK-SSE2-LABEL: test_srem_even_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; IR under test: one lane divides by 1 (always remainder 0), the other lanes by 14.
+ %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One divisor of one among otherwise mixed odd+even divisors
+define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
+; Purpose: pin the codegen for (X srem <5, 14, 1, 100>) == 0, zero-extended to <4 x i32>,
+; across SSE2/SSE41/AVX1/AVX2/AVX512VL. The CHECK lines below are autogenerated by
+; utils/update_llc_test_checks.py (see file header) — regenerate rather than hand-edit.
+; CHECK-SSE2-LABEL: test_srem_odd_even_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; IR under test: mixed odd (5), even (14, 100) and one (1) divisor lanes.
+ %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;==============================================================================;
+
+; One all-ones divisor and one power-of-two divisor among otherwise-odd divisors
+define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; Purpose: pin the codegen for (X srem <5, -1, 16, 5>) == 0, zero-extended to <4 x i32>,
+; across SSE2/SSE41/AVX1/AVX2/AVX512VL. The CHECK lines below are autogenerated by
+; utils/update_llc_test_checks.py (see file header) — regenerate rather than hand-edit.
+; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE41-NEXT: psrad $3, %xmm4
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; IR under test: odd divisor (5) lanes, one all-ones (-1) lane, one power-of-two (16) lane.
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one power-of-two divisor among otherwise-even divisors
+define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; Purpose: pin the codegen for (X srem <14, -1, 16, 14>) == 0, zero-extended to <4 x i32>,
+; across SSE2/SSE41/AVX1/AVX2/AVX512VL. The CHECK lines below are autogenerated by
+; utils/update_llc_test_checks.py (see file header) — regenerate rather than hand-edit.
+; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
+; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; IR under test: even divisor (14) lanes, one all-ones (-1) lane, one power-of-two (16) lane.
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one power-of-two divisor among otherwise mixed odd+even divisors
+define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE41-NEXT: psrad $1, %xmm4
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One all-ones divisor and one one divisor in odd divisor
+define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
+; Tests codegen for (X srem <5, -1, 1, 5>) == 0, with the <4 x i1> compare
+; result zero-extended to <4 x i32>. The CHECK lines below were autogenerated
+; by utils/update_llc_test_checks.py and must textually match llc output.
+; CHECK-SSE2-LABEL: test_srem_odd_allones_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1717986919,0,0,1717986919]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm4, %xmm3
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm2
+; CHECK-SSE2-NEXT: psrad $1, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
+; CHECK-SSE2-NEXT: psrld $31, %xmm5
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,1,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; srem by -1 (4294967295) or by 1 leaves remainder 0 for every input (barring
+; the INT_MIN srem -1 overflow case), so lanes 1 and 2 always compare equal.
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in even divisor
+define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
+; Tests codegen for (X srem <14, -1, 1, 14>) == 0, with the <4 x i1> compare
+; result zero-extended to <4 x i32>. The CHECK lines below were autogenerated
+; by utils/update_llc_test_checks.py and must textually match llc output.
+; CHECK-SSE2-LABEL: test_srem_even_allones_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
+; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
+; CHECK-SSE2-NEXT: psrld $31, %xmm5
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_allones_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_allones_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_allones_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; srem by -1 (4294967295) or by 1 leaves remainder 0 for every input (barring
+; the INT_MIN srem -1 overflow case), so lanes 1 and 2 always compare equal.
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in odd+even divisor
+define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
+; Tests codegen for (X srem <5, -1, 1, 100>) == 0, with the <4 x i1> compare
+; result zero-extended to <4 x i32>. The CHECK lines below were autogenerated
+; by utils/update_llc_test_checks.py and must textually match llc output.
+; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $1, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; srem by -1 (4294967295) or by 1 leaves remainder 0 for every input (barring
+; the INT_MIN srem -1 overflow case), so lanes 1 and 2 always compare equal.
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor and one one divisor in odd divisor
+define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1717986919]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $3, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor and one divisor of one, among even divisors
+define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm3
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm0, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; Lane divisors: 14 (even), 16 (power of two), 1, 14 (even); result is
+; the per-lane (X s% C) == 0 comparison zero-extended to <4 x i32>.
+ %srem = srem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor and one divisor of one, among odd and even divisors
+define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $5, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5
+; CHECK-SSE2-NEXT: psrad $1, %xmm5
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,1,0]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; Lane divisors: 5 (odd), 16 (power of two), 1, 100 (even); result is
+; the per-lane (X s% C) == 0 comparison zero-extended to <4 x i32>.
+ %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0]
+; CHECK-SSE2-NEXT: pand %xmm2, %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE2-NEXT: psrlq $32, %xmm2
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrad $3, %xmm3
+; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
+; CHECK-SSE2-NEXT: psrad $1, %xmm4
+; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3]
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u>
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: psrlq $32, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE41-NEXT: psrad $1, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; Lane divisors: 5 (odd), 4294967295 (all-ones, i.e. -1), 16 (power of two), 1;
+; result is the per-lane (X s% C) == 0 comparison zero-extended to <4 x i32>.
+ %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0]
+; CHECK-SSE2-NEXT: pand %xmm6, %xmm5
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6
+; CHECK-SSE2-NEXT: psrlq $32, %xmm6
+; CHECK-SSE2-NEXT: psubd %xmm5, %xmm6
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4
+; CHECK-SSE2-NEXT: psrad $3, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psrld $31, %xmm6
+; CHECK-SSE2-NEXT: pand %xmm3, %xmm6
+; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm6
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1]
+; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u>
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: psrlq $32, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
+; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
+; CHECK-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+; Lane divisors: 14 (even), 4294967295 (all-ones, i.e. -1), 16 (power of two), 1;
+; result is the per-lane (X s% C) == 0 comparison zero-extended to <4 x i32>.
+ %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
Added: llvm/trunk/test/CodeGen/X86/srem-seteq-vec-splat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/srem-seteq-vec-splat.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/srem-seteq-vec-splat.ll (added)
+++ llvm/trunk/test/CodeGen/X86/srem-seteq-vec-splat.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,586 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE41
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL
+
+; Odd divisor
+define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_25:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm1, %xmm4
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: psrad $3, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_25:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_25:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_25:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25]
+; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_25:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; Even divisors
+define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_100:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm1, %xmm4
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: psrad $5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_100:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_100:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_100:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
+; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_100:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+; Comparison constant has undef elements.
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_odd_undef1:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm1, %xmm4
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: psrad $3, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_odd_undef1:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrad $3, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_odd_undef1:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_odd_undef1:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25]
+; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_odd_undef1:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 undef, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_srem_even_undef1:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; CHECK-SSE2-NEXT: pand %xmm1, %xmm4
+; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: psrad $5, %xmm2
+; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_srem_even_undef1:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1
+; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrad $5, %xmm2
+; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_even_undef1:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_even_undef1:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
+; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_even_undef1:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 undef, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+; Negative tests
+;------------------------------------------------------------------------------;
+
+; We can lower remainder of division by powers of two much better elsewhere.
+define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_srem_pow2:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE-NEXT: psrad $31, %xmm1
+; CHECK-SSE-NEXT: psrld $28, %xmm1
+; CHECK-SSE-NEXT: paddd %xmm0, %xmm1
+; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-SSE-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE-NEXT: psrld $31, %xmm0
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_pow2:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_pow2:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $28, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967280,4294967280,4294967280,4294967280]
+; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_pow2:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpsrad $31, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $28, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; We could lower remainder of division by all-ones much better elsewhere.
+define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_srem_allones:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_allones:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_allones:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_allones:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; If all divisors are ones, this is constant-folded.
+define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_srem_one_eq:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_srem_one_eq:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_srem_one_eq:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_srem_one_eq:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX512VL-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_srem_one_ne:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX-LABEL: test_srem_one_ne:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+ %srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
Added: llvm/trunk/test/CodeGen/X86/srem-seteq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/srem-seteq.ll?rev=366640&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/srem-seteq.ll (added)
+++ llvm/trunk/test/CodeGen/X86/srem-seteq.ll Sat Jul 20 12:25:44 2019
@@ -0,0 +1,420 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64
+
+;------------------------------------------------------------------------------;
+; Odd divisors
+;------------------------------------------------------------------------------;
+
+define i32 @test_srem_odd(i32 %X) nounwind {
+; X86-LABEL: test_srem_odd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: leal (%edx,%edx,4), %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_srem_odd:
+; X64: # %bb.0:
+; X64-NEXT: movslq %edi, %rcx
+; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: sarq $33, %rax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: leal (%rax,%rax,4), %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %srem = srem i32 %X, 5
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+define i32 @test_srem_odd_25(i32 %X) nounwind {
+; X86-LABEL: test_srem_odd_25:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $3, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: leal (%edx,%edx,4), %eax
+; X86-NEXT: leal (%eax,%eax,4), %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_srem_odd_25:
+; X64: # %bb.0:
+; X64-NEXT: movslq %edi, %rcx
+; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: sarq $35, %rax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: leal (%rax,%rax,4), %eax
+; X64-NEXT: leal (%rax,%rax,4), %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %srem = srem i32 %X, 25
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_odd, except the divisor has bit 30 set.
+define i32 @test_srem_odd_bit30(i32 %X) nounwind {
+; X86-LABEL: test_srem_odd_bit30:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $536870911, %edx # imm = 0x1FFFFFFF
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $27, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_srem_odd_bit30:
+; X64: # %bb.0:
+; X64-NEXT: movslq %edi, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: shlq $29, %rax
+; X64-NEXT: subq %rcx, %rax
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: sarq $59, %rax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: imull $1073741827, %eax, %edx # imm = 0x40000003
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %srem = srem i32 %X, 1073741827
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+; This is like test_srem_odd, except the divisor has bit 31 set.
+define i32 @test_srem_odd_bit31(i32 %X) nounwind {
+; X86-LABEL: test_srem_odd_bit31:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-536870913, %edx # imm = 0xDFFFFFFF
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: imull %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: sarl $28, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: sete %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_srem_odd_bit31:
+; X64: # %bb.0:
+; X64-NEXT: movslq %edi, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: shlq $29, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: negq %rax
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: sarq $60, %rax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: imull $-2147483645, %eax, %edx # imm = 0x80000003
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %srem = srem i32 %X, 2147483651
+ %cmp = icmp eq i32 %srem, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Even divisors
+;------------------------------------------------------------------------------;
+
+; IR contract: returns zext((X s% 14) != 0) as i16.  Narrow (i16) even
+; divisor; the CHECK lines expect a widening multiply-by-18725 plus shift
+; expansion and a 16-bit compare (cmpw), with no idiv instruction.
+define i16 @test_srem_even(i16 %X) nounwind {
+; X86-LABEL: test_srem_even:
+; X86:       # %bb.0:
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull $18725, %ecx, %eax # imm = 0x4925
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    sarl $18, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpw %dx, %cx
+; X86-NEXT:    setne %al
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_even:
+; X64:       # %bb.0:
+; X64-NEXT:    movswl %di, %ecx
+; X64-NEXT:    imull $18725, %ecx, %eax # imm = 0x4925
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shrl $31, %edx
+; X64-NEXT:    sarl $18, %eax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shll $4, %edx
+; X64-NEXT:    subl %eax, %edx
+; X64-NEXT:    subl %eax, %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpw %dx, %cx
+; X64-NEXT:    setne %al
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %srem = srem i16 %X, 14
+  %cmp = icmp ne i16 %srem, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; IR contract: returns zext((X s% 100) == 0) as i32.  The CHECK lines
+; expect the magic-constant expansion (multiply by 0x51EB851F, shifts,
+; then imull $100 and compare) rather than an idiv.
+define i32 @test_srem_even_100(i32 %X) nounwind {
+; X86-LABEL: test_srem_even_100:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $1374389535, %edx # imm = 0x51EB851F
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $5, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    imull $100, %edx, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_even_100:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq $1374389535, %rcx, %rax # imm = 0x51EB851F
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $63, %rdx
+; X64-NEXT:    sarq $37, %rax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    imull $100, %eax, %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %srem = srem i32 %X, 100
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_srem_even, except the divisor has bit 30 set.
+; IR contract: returns zext((X s% 1073741928) == 0) as i32; divisor is
+; even with bit 30 set (0x40000068).  CHECK lines expect the
+; multiply/shift expansion, no idiv.
+define i32 @test_srem_even_bit30(i32 %X) nounwind {
+; X86-LABEL: test_srem_even_bit30:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $1073741721, %edx # imm = 0x3FFFFF99
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $28, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    imull $1073741928, %edx, %edx # imm = 0x40000068
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_even_bit30:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq $1073741721, %rcx, %rax # imm = 0x3FFFFF99
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $63, %rdx
+; X64-NEXT:    sarq $60, %rax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    imull $1073741928, %eax, %edx # imm = 0x40000068
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %srem = srem i32 %X, 1073741928
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_srem_even, except the divisor has bit 31 set.
+; IR contract: returns zext((X s% 2147483750) == 0) as i32; divisor is
+; even with bit 31 set (0x80000066).  CHECK lines expect the
+; multiply/shift expansion with a subtract-correction step, no idiv.
+define i32 @test_srem_even_bit31(i32 %X) nounwind {
+; X86-LABEL: test_srem_even_bit31:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $2147483545, %edx # imm = 0x7FFFFF99
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %edx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl $30, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    imull $-2147483546, %edx, %edx # imm = 0x80000066
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_even_bit31:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq $2147483545, %rcx, %rax # imm = 0x7FFFFF99
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    shrl $31, %edx
+; X64-NEXT:    sarl $30, %eax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    imull $-2147483546, %eax, %edx # imm = 0x80000066
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %srem = srem i32 %X, 2147483750
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Special case
+;------------------------------------------------------------------------------;
+
+; 'NE' predicate is fine too.
+; IR contract: returns zext((X s% 5) != 0) as i32 — same as the odd-divisor
+; case but with icmp ne, so the CHECK lines end in setne instead of sete.
+define i32 @test_srem_odd_setne(i32 %X) nounwind {
+; X86-LABEL: test_srem_odd_setne:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $1717986919, %edx # imm = 0x66666667
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    sarl %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    leal (%edx,%edx,4), %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_odd_setne:
+; X64:       # %bb.0:
+; X64-NEXT:    movslq %edi, %rcx
+; X64-NEXT:    imulq $1717986919, %rcx, %rax # imm = 0x66666667
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    shrq $63, %rdx
+; X64-NEXT:    sarq $33, %rax
+; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    leal (%rax,%rax,4), %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %srem = srem i32 %X, 5
+  %cmp = icmp ne i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+;------------------------------------------------------------------------------;
+; Negative tests
+;------------------------------------------------------------------------------;
+
+; The fold is invalid if divisor is 1.
+; IR contract: X s% 1 is always 0, so the compare folds to true and the
+; CHECK lines expect the whole function to become `movl $1, %eax; ret`.
+define i32 @test_srem_one(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
+  %srem = srem i32 %X, 1
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by all-ones much better elsewhere.
+; IR contract: divisor 4294967295 is -1 as i32; X s% -1 is always 0, so
+; the CHECK lines expect the function to fold to `movl $1, %eax; ret`.
+define i32 @test_srem_allones(i32 %X) nounwind {
+; CHECK-LABEL: test_srem_allones:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
+  %srem = srem i32 %X, 4294967295
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere.
+; IR contract: returns zext((X s% 16) == 0) as i32.  CHECK lines expect
+; the power-of-two lowering (sign-bit bias via sar/shr, then andl $-16)
+; instead of the multiply-based expansion.
+define i32 @test_srem_pow2(i32 %X) nounwind {
+; X86-LABEL: test_srem_pow2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    shrl $28, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    andl $-16, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_srem_pow2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    sarl $31, %ecx
+; X64-NEXT:    shrl $28, %ecx
+; X64-NEXT:    addl %edi, %ecx
+; X64-NEXT:    andl $-16, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %ecx, %edi
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %srem = srem i32 %X, 16
+  %cmp = icmp eq i32 %srem, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
More information about the llvm-commits
mailing list