[llvm] r364661 - [NFC][Codegen] Revisit test coverage for X % C == 0 fold once more (add tests with '1' divisor)
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 10:26:28 PDT 2019
Author: lebedevri
Date: Fri Jun 28 10:26:28 2019
New Revision: 364661
URL: http://llvm.org/viewvc/llvm-project?rev=364661&view=rev
Log:
[NFC][Codegen] Revisit test coverage for X % C == 0 fold once more (add tests with '1' divisor)
Modified:
llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
llvm/trunk/test/CodeGen/X86/urem-seteq-vec-splat.ll
Modified: llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll?rev=364661&r1=364660&r2=364661&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll Fri Jun 28 10:26:28 2019
@@ -33,9 +33,11 @@ define <4 x i32> @test_urem_odd_even(<4
ret <4 x i32> %ret
}
+;==============================================================================;
+
; One all-ones divisor in odd divisor
-define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_allones:
+define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_allones_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
@@ -58,10 +60,8 @@ define <4 x i32> @test_urem_odd_allones(
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-
-; One all-ones divisor in even divisor
-define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_even_allones:
+define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_allones_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
@@ -69,29 +69,26 @@ define <4 x i32> @test_urem_even_allones
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
; CHECK-NEXT: adrp x8, .LCPI2_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: adrp x8, .LCPI2_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_3]
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
- %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor in odd+even divisor
-define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_even_allones:
+; One all-ones divisor in even divisor
+define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_allones_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
@@ -113,15 +110,13 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-
-; One power-of-two divisor in odd divisor
-define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_poweroftwo:
+define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_allones_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
@@ -129,25 +124,30 @@ define <4 x i32> @test_urem_odd_poweroft
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI4_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3]
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
- %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One power-of-two divisor in even divisor
-define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_even_poweroftwo:
+; One all-ones divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_allones_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
@@ -169,15 +169,13 @@ define <4 x i32> @test_urem_even_powerof
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-
-; One power-of-two divisor in odd+even divisor
-define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_even_poweroftwo:
+define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_allones_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI6_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
@@ -196,18 +194,21 @@ define <4 x i32> @test_urem_odd_even_pow
; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
- %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in odd divisor
-define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo:
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor in odd divisor
+define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI7_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
@@ -225,15 +226,15 @@ define <4 x i32> @test_urem_odd_allones_
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 4294967295, i32 5>
+ %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in even divisor
-define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_even_allones_and_poweroftwo:
+; One power-of-two divisor in even divisor
+define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI8_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
@@ -255,15 +256,15 @@ define <4 x i32> @test_urem_even_allones
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 4294967295, i32 14>
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in odd+even divisor
-define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; One power-of-two divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI9_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
@@ -271,27 +272,29 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
; CHECK-NEXT: adrp x8, .LCPI9_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI9_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 4294967295, i32 100>
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
;------------------------------------------------------------------------------;
-; Negative tests - the fold is invalid if any divisor is 1.
-;------------------------------------------------------------------------------;
-; One divisor in odd divisor
+; One one divisor in odd divisor
define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_one:
; CHECK: // %bb.0:
@@ -314,13 +317,13 @@ define <4 x i32> @test_urem_odd_one(<4 x
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 25, i32 25, i32 1, i32 25>
+ %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One divisor in even divisors
+; One one divisor in even divisor
define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_one:
; CHECK: // %bb.0:
@@ -330,26 +333,30 @@ define <4 x i32> @test_urem_even_one(<4
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1]
; CHECK-NEXT: adrp x8, .LCPI11_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI11_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_3]
+; CHECK-NEXT: adrp x8, .LCPI11_4
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
-; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 100, i32 100, i32 1, i32 100>
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One divisor in odd-even divisors
+; One one divisor in odd+even divisor
define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_even_one:
; CHECK: // %bb.0:
@@ -359,11 +366,285 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
; CHECK-NEXT: adrp x8, .LCPI12_2
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: adrp x8, .LCPI12_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: adrp x8, .LCPI12_4
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;==============================================================================;
+
+; One all-ones divisor and power-of-two divisor divisor in odd divisor
+define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI13_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: adrp x8, .LCPI13_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
+; CHECK-NEXT: adrp x8, .LCPI13_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and power-of-two divisor divisor in even divisor
+define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: adrp x8, .LCPI14_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
+; CHECK-NEXT: adrp x8, .LCPI14_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI14_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and power-of-two divisor divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI15_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: adrp x8, .LCPI15_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
+; CHECK-NEXT: adrp x8, .LCPI15_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One all-ones divisor and one one divisor in odd divisor
+define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: adrp x8, .LCPI16_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
+; CHECK-NEXT: adrp x8, .LCPI16_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2]
+; CHECK-NEXT: adrp x8, .LCPI16_3
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in even divisor
+define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI17_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT: adrp x8, .LCPI17_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1]
+; CHECK-NEXT: adrp x8, .LCPI17_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI17_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT: adrp x8, .LCPI17_4
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI18_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT: adrp x8, .LCPI18_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1]
+; CHECK-NEXT: adrp x8, .LCPI18_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2]
+; CHECK-NEXT: adrp x8, .LCPI18_3
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor divisor and one divisor in odd divisor
+define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: adrp x8, .LCPI19_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1]
+; CHECK-NEXT: adrp x8, .LCPI19_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_2]
+; CHECK-NEXT: adrp x8, .LCPI19_3
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor divisor and one divisor in even divisor
+define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI20_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
+; CHECK-NEXT: adrp x8, .LCPI20_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1]
+; CHECK-NEXT: adrp x8, .LCPI20_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI20_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT: adrp x8, .LCPI20_4
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor divisor and one divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI21_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: adrp x8, .LCPI21_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1]
+; CHECK-NEXT: adrp x8, .LCPI21_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2]
+; CHECK-NEXT: adrp x8, .LCPI21_3
; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3]
; CHECK-NEXT: neg v2.4s, v2.4s
; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
@@ -372,7 +653,69 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 25, i32 100, i32 1, i32 25>
+ %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI22_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-NEXT: adrp x8, .LCPI22_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1]
+; CHECK-NEXT: adrp x8, .LCPI22_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2]
+; CHECK-NEXT: adrp x8, .LCPI22_3
+; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI23_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT: adrp x8, .LCPI23_1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
+; CHECK-NEXT: adrp x8, .LCPI23_2
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: adrp x8, .LCPI23_3
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT: adrp x8, .LCPI23_4
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
Modified: llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-splat.ll?rev=364661&r1=364660&r2=364661&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-splat.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/urem-seteq-vec-splat.ll Fri Jun 28 10:26:28 2019
@@ -97,19 +97,23 @@ define <4 x i32> @test_urem_even_undef1(
; Negative tests
;------------------------------------------------------------------------------;
-; The fold is invalid if divisor is 1.
-define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_one:
+; We can lower remainder of division by powers of two much better elsewhere.
+define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_pow2:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: movi v1.4s, #15
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; We can lower remainder of division by all-ones much better elsewhere.
+; We could lower remainder of division by all-ones much better elsewhere.
define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_allones:
; CHECK: // %bb.0:
@@ -124,18 +128,24 @@ define <4 x i32> @test_urem_allones(<4 x
ret <4 x i32> %ret
}
-; We can lower remainder of division by powers of two much better elsewhere.
-define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
-; CHECK-LABEL: test_urem_pow2:
+; If all divisors are ones, this is constant-folded.
+define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_one_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #15
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ret
- %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
+ %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
+define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind {
+; CHECK-LABEL: test_urem_one_ne:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ret
+ %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
Modified: llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll?rev=364661&r1=364660&r2=364661&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll Fri Jun 28 10:26:28 2019
@@ -136,9 +136,11 @@ define <4 x i32> @test_urem_odd_even(<4
ret <4 x i32> %ret
}
+;==============================================================================;
+
; One all-ones divisor in odd divisor
-define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_allones:
+define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u>
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
@@ -164,7 +166,7 @@ define <4 x i32> @test_urem_odd_allones(
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_allones:
+; CHECK-SSE41-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
@@ -183,7 +185,7 @@ define <4 x i32> @test_urem_odd_allones(
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_allones:
+; CHECK-AVX1-LABEL: test_urem_odd_allones_eq:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
@@ -200,7 +202,7 @@ define <4 x i32> @test_urem_odd_allones(
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_allones:
+; CHECK-AVX2-LABEL: test_urem_odd_allones_eq:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
@@ -216,7 +218,7 @@ define <4 x i32> @test_urem_odd_allones(
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_allones:
+; CHECK-AVX512VL-LABEL: test_urem_odd_allones_eq:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
@@ -236,10 +238,110 @@ define <4 x i32> @test_urem_odd_allones(
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
+define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_allones_ne:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_allones_ne:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u>
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
; One all-ones divisor in even divisor
-define <4 x i32> @test_urem_even_allones(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_even_allones:
+define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
@@ -269,7 +371,7 @@ define <4 x i32> @test_urem_even_allones
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_even_allones:
+; CHECK-SSE41-LABEL: test_urem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -290,7 +392,7 @@ define <4 x i32> @test_urem_even_allones
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_even_allones:
+; CHECK-AVX1-LABEL: test_urem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
@@ -309,7 +411,7 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_even_allones:
+; CHECK-AVX2-LABEL: test_urem_even_allones_eq:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -326,7 +428,7 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_even_allones:
+; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -347,10 +449,120 @@ define <4 x i32> @test_urem_even_allones
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
+define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_even_allones_ne:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_even_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_even_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
; One all-ones divisor in odd+even divisor
-define <4 x i32> @test_urem_odd_even_allones(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_even_allones:
+define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
@@ -385,7 +597,7 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_even_allones:
+; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -412,7 +624,7 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_even_allones:
+; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
@@ -436,7 +648,7 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_even_allones:
+; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -454,7 +666,7 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones:
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -476,49 +688,179 @@ define <4 x i32> @test_urem_odd_even_all
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-
-; One power-of-two divisor in odd divisor
-define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
+define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u>
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psrld $5, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: psrld $2, %xmm3
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,4294967295,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
+; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u>
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $5, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrld $2, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor in odd divisor
+define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u>
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u>
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
@@ -807,75 +1149,65 @@ define <4 x i32> @test_urem_odd_even_pow
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in odd divisor
-define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
+;------------------------------------------------------------------------------;
+
+; One one divisor in odd divisor
+define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-SSE2-NEXT: movd %eax, %xmm1
+; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psrld $2, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,4294967295,5]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
+; CHECK-SSE41-LABEL: test_urem_odd_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-SSE41-NEXT: movd %eax, %xmm1
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
+; CHECK-AVX1-LABEL: test_urem_odd_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -883,16 +1215,18 @@ define <4 x i32> @test_urem_odd_allones_
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
+; CHECK-AVX2-LABEL: test_urem_odd_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -900,83 +1234,204 @@ define <4 x i32> @test_urem_odd_allones_
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo:
+; CHECK-AVX512VL-LABEL: test_urem_odd_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 4294967295, i32 5>
+ %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in even divisor
-define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
+; One one divisor in even divisor
+define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-SSE2-NEXT: movd %eax, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,4294967295,14]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: psrld $31, %xmm1
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
+; CHECK-SSE41-LABEL: test_urem_even_one:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
-; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-SSE41-NEXT: movd %eax, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_even_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX1-NEXT: vmovd %eax, %xmm3
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_even_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm3
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_even_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm3
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One one divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $5, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_even_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: psrld $31, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $5, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
@@ -984,22 +1439,21 @@ define <4 x i32> @test_urem_even_allones
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
+; CHECK-AVX1-LABEL: test_urem_odd_even_one:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,2147483649,2454267027]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3435973837,2454267027,0,1374389535]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1007,9 +1461,9 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
+; CHECK-AVX2-LABEL: test_urem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
@@ -1018,6 +1472,7 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1025,9 +1480,9 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo:
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
@@ -1036,23 +1491,26 @@ define <4 x i32> @test_urem_even_allones
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 4294967295, i32 14>
+ %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One all-ones divisor and one power-of-two divisor in odd+even divisor
-define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+;==============================================================================;
+
+; One all-ones divisor and power-of-two divisor divisor in odd divisor
+define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -1062,19 +1520,17 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,4294967295,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
-; CHECK-SSE2-NEXT: psrld $31, %xmm4
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -1082,9 +1538,9 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
@@ -1092,35 +1548,33 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
; CHECK-SSE41-NEXT: psrld $31, %xmm3
-; CHECK-SSE41-NEXT: psrld $2, %xmm1
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1128,9 +1582,9 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
@@ -1145,9 +1599,9 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo:
+; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
@@ -1161,55 +1615,67 @@ define <4 x i32> @test_urem_odd_even_all
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 4294967295, i32 100>
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-;------------------------------------------------------------------------------;
-; Negative tests - the fold is invalid if any divisor is 1.
-;------------------------------------------------------------------------------;
-
-; One divisor in odd divisor
-define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_one:
+; One all-ones divisor and power-of-two divisor divisor in even divisor
+define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE2-NEXT: movd %eax, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $3, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,4294967295,16,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_one:
+; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE41-NEXT: movd %eax, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $3, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrld $31, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
@@ -1217,17 +1683,22 @@ define <4 x i32> @test_urem_odd_one(<4 x
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_one:
+; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1235,18 +1706,17 @@ define <4 x i32> @test_urem_odd_one(<4 x
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_one:
+; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1254,87 +1724,101 @@ define <4 x i32> @test_urem_odd_one(<4 x
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_one:
+; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 25, i32 25, i32 1, i32 25>
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One divisor in even divisors
-define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_even_one:
+; One all-ones divisor and power-of-two divisor divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE2-NEXT: movd %eax, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrld $2, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3]
+; CHECK-SSE2-NEXT: psrld $5, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_even_one:
+; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrld $2, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $5, %xmm1
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3
+; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_even_one:
+; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1342,18 +1826,16 @@ define <4 x i32> @test_urem_even_one(<4
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_even_one:
+; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1361,57 +1843,58 @@ define <4 x i32> @test_urem_even_one(<4
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_even_one:
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 100, i32 100, i32 1, i32 100>
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; One divisor in odd-even divisors
-define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
-; CHECK-SSE2-LABEL: test_urem_odd_even_one:
+;------------------------------------------------------------------------------;
+
+; One all-ones divisor and one one divisor in odd divisor
+define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE2-NEXT: movd %eax, %xmm1
-; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
-; CHECK-SSE2-NEXT: psrld $3, %xmm2
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [25,100,1,25]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,5]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
-; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -1419,38 +1902,38 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-SSE2-NEXT: psrld $31, %xmm0
; CHECK-SSE2-NEXT: retq
;
-; CHECK-SSE41-LABEL: test_urem_odd_even_one:
+; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-SSE41-NEXT: movd %eax, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT: psrld $5, %xmm1
-; CHECK-SSE41-NEXT: psrld $3, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_odd_even_one:
+; CHECK-AVX1-LABEL: test_urem_odd_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX1-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -1460,16 +1943,15 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_odd_even_one:
+; CHECK-AVX2-LABEL: test_urem_odd_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -1479,16 +1961,15 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_odd_even_one:
+; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: movl $1374389535, %eax # imm = 0x51EB851F
-; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,3435973837]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -1497,7 +1978,862 @@ define <4 x i32> @test_urem_odd_even_one
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 25, i32 100, i32 1, i32 25>
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in even divisor
+define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_allones_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,4294967295,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_even_allones_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $31, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_even_allones_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_even_allones_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One all-ones divisor and one one divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $5, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: psrld $31, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $5, %xmm2
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE41-NEXT: psrld $31, %xmm3
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+; One power-of-two divisor divisor and one divisor in odd divisor
+define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,5]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor divisor and one divisor in even divisor
+define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,1,14]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4
+; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+; One power-of-two divisor divisor and one divisor in odd+even divisor
+define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $5, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
+; CHECK-SSE2-NEXT: psrld $2, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $5, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+;------------------------------------------------------------------------------;
+
+define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm1
+; CHECK-SSE2-NEXT: psrld $31, %xmm1
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: psrld $2, %xmm3
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm2[3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
+ %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
+; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
+; CHECK-SSE2-NEXT: psrld $1, %xmm3
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE2-NEXT: psrld $31, %xmm2
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[3,3]
+; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[2,0]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK-SSE41: # %bb.0:
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE41-NEXT: psrld $1, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2
+; CHECK-SSE41-NEXT: psrld $2, %xmm2
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-SSE41-NEXT: psrld $31, %xmm1
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE41-NEXT: psrld $31, %xmm0
+; CHECK-SSE41-NEXT: retq
+;
+; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX1: # %bb.0:
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm3
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: movl $1, %eax
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,0]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: movl $1, %eax
+; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,0]
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3]
+; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
Modified: llvm/trunk/test/CodeGen/X86/urem-seteq-vec-splat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/urem-seteq-vec-splat.ll?rev=364661&r1=364660&r2=364661&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/urem-seteq-vec-splat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/urem-seteq-vec-splat.ll Fri Jun 28 10:26:28 2019
@@ -348,34 +348,47 @@ define <4 x i32> @test_urem_even_undef1(
; Negative tests
;------------------------------------------------------------------------------;
-; The fold is invalid if divisor is 1.
-define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind {
-; CHECK-SSE-LABEL: test_urem_one:
+; We can lower remainder of division by powers of two much better elsewhere.
+define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_urem_pow2:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-SSE-NEXT: psrld $31, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_one:
+; CHECK-AVX1-LABEL: test_urem_pow2:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_one:
+; CHECK-AVX2-LABEL: test_urem_pow2:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_one:
+; CHECK-AVX512VL-LABEL: test_urem_pow2:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
-; We can lower remainder of division by all-ones much better elsewhere.
+; We could lower remainder of division by all-ones much better elsewhere.
define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_allones:
; CHECK-SSE2: # %bb.0:
@@ -430,42 +443,44 @@ define <4 x i32> @test_urem_allones(<4 x
ret <4 x i32> %ret
}
-; We can lower remainder of division by powers of two much better elsewhere.
-define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
-; CHECK-SSE-LABEL: test_urem_pow2:
+; If all divisors are ones, this is constant-folded.
+define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_urem_one_eq:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE-NEXT: psrld $31, %xmm0
+; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_pow2:
+; CHECK-AVX1-LABEL: test_urem_one_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-AVX1-NEXT: retq
;
-; CHECK-AVX2-LABEL: test_urem_pow2:
+; CHECK-AVX2-LABEL: test_urem_one_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
-; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-AVX2-NEXT: retq
;
-; CHECK-AVX512VL-LABEL: test_urem_pow2:
+; CHECK-AVX512VL-LABEL: test_urem_one_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-AVX512VL-NEXT: retq
- %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
+ %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
+define <4 x i32> @test_urem_one_ne(<4 x i32> %X) nounwind {
+; CHECK-SSE-LABEL: test_urem_one_ne:
+; CHECK-SSE: # %bb.0:
+; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
+; CHECK-AVX-LABEL: test_urem_one_ne:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+ %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
+ %ret = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %ret
+}
More information about the llvm-commits
mailing list