[llvm] 0ebef5e - [DAGCombine] Enable div by constant optimization for odd sized vectors before type legalization. (#188313)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 26 09:16:55 PDT 2026
Author: Craig Topper
Date: 2026-03-26T09:16:46-07:00
New Revision: 0ebef5e5e2f7b65dba673eb6e6dafea1d10bcd86
URL: https://github.com/llvm/llvm-project/commit/0ebef5e5e2f7b65dba673eb6e6dafea1d10bcd86
DIFF: https://github.com/llvm/llvm-project/commit/0ebef5e5e2f7b65dba673eb6e6dafea1d10bcd86.diff
LOG: [DAGCombine] Enable div by constant optimization for odd sized vectors before type legalization. (#188313)
If we are going to legalize to a vector with the same element type, and
mulh or mul_lohi is supported for that type, allow the optimization
before type legalization.
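
As a concrete example (a minimal IR sketch; the function name is
illustrative), <3 x i32> on AArch64 is widened to <4 x i32>, which keeps
the i32 element type and has multiply-high available (smull/smull2 in
the updated checks), so BuildSDIV can now fire before type
legalization -- see the sv3i32_7 test update below:

define <3 x i32> @sdiv_v3i32_7(<3 x i32> %x) {
  %d = sdiv <3 x i32> %x, <i32 7, i32 7, i32 7>
  ret <3 x i32> %d
}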
RISC-V will widen vectors using vp.udiv/vp.sdiv, which doesn't support
the division-by-constant optimization. In addition, type legalization
will create a build_vector with undef elements, making the pattern hard
to match after type legalization.
Other targets may need to widen using a combination of vector and
scalar divisions to avoid traps if we widen a vector with garbage
elements.
I had to enable the MULHU->SRL DAG combine before type legalization to
prevent regressions. (The fold relies on mulhu(x, 1 << c) being the
high half of x * 2^c, which is x >> (bitwidth - c).) After type
legalization, the multiply-constant build_vector will have undef
elements and the combine won't trigger.
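
For RISC-V, this means an odd-sized case like the one below (a minimal
sketch in the spirit of the mulhu_v6i16 test; the divisor constants are
illustrative) is expanded to multiply-high before <6 x i16> is widened
to <8 x i16>, so no vdivu is emitted:

define <6 x i16> @udiv_v6i16(<6 x i16> %x) {
  %d = udiv <6 x i16> %x, <i16 7, i16 9, i16 10, i16 12, i16 25, i16 100>
  ret <6 x i16> %d
}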
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AArch64/rem-by-const.ll
llvm/test/CodeGen/LoongArch/lasx/issue170976.ll
llvm/test/CodeGen/LoongArch/lsx/issue170976.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/srem-vector-lkk.ll
llvm/test/CodeGen/X86/urem-vector-lkk.ll
llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f6e70e52b2ca9..c646d5220749d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5636,7 +5636,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
// fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
if (isConstantOrConstantVector(N1, /*NoOpaques=*/true,
/*AllowTruncation=*/true) &&
- hasOperation(ISD::SRL, VT)) {
+ (!LegalOperations || hasOperation(ISD::SRL, VT))) {
if (SDValue LogBase2 = BuildLogBase2(N1, DL)) {
unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue SRLAmt = DAG.getNode(
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9efbc62caa1ed..2e55c859f8f03 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6612,9 +6612,18 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
// Check to see if we can do this.
// FIXME: We should be more aggressive here.
- if (!isTypeLegal(VT)) {
+ EVT QueryVT = VT;
+ if (VT.isVector()) {
+ // If the vector type will be legalized to a vector type with the same
+ // element type, allow the transform before type legalization if MULHS or
+ // SMUL_LOHI are supported.
+ QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT);
+ if (!QueryVT.isVector() ||
+ QueryVT.getVectorElementType() != VT.getVectorElementType())
+ return SDValue();
+ } else if (!isTypeLegal(VT)) {
// Limit this to simple scalars for now.
- if (VT.isVector() || !VT.isSimple())
+ if (!VT.isSimple())
return SDValue();
// If this type will be promoted to a large enough type with a legal
@@ -6628,11 +6637,12 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
- bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization);
+ bool HasMULHS =
+ isOperationLegalOrCustom(ISD::MULHS, QueryVT, IsAfterLegalization);
bool HasSMUL_LOHI =
- isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization);
+ isOperationLegalOrCustom(ISD::SMUL_LOHI, QueryVT, IsAfterLegalization);
- if (!HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) {
+ if (isTypeLegal(VT) && !HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) {
// If type twice as wide legal, widen and use a mul plus a shift.
EVT WideVT = VT.widenIntegerElementType(*DAG.getContext());
// Some targets like AMDGPU try to go from SDIV to SDIVREM which is then
@@ -6791,9 +6801,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
// Check to see if we can do this.
// FIXME: We should be more aggressive here.
- if (!isTypeLegal(VT)) {
+ EVT QueryVT = VT;
+ if (VT.isVector()) {
+ // If the vector type will be legalized to a vector type with the same
+ // element type, allow the transform before type legalization if MULHU or
+ // UMUL_LOHI are supported.
+ QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT);
+ if (!QueryVT.isVector() ||
+ QueryVT.getVectorElementType() != VT.getVectorElementType())
+ return SDValue();
+ } else if (!isTypeLegal(VT)) {
// Limit this to simple scalars for now.
- if (VT.isVector() || !VT.isSimple())
+ if (!VT.isSimple())
return SDValue();
// If this type will be promoted to a large enough type with a legal
@@ -6807,14 +6826,15 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
- bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, VT, IsAfterLegalization);
+ bool HasMULHU =
+ isOperationLegalOrCustom(ISD::MULHU, QueryVT, IsAfterLegalization);
bool HasUMUL_LOHI =
- isOperationLegalOrCustom(ISD::UMUL_LOHI, VT, IsAfterLegalization);
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, QueryVT, IsAfterLegalization);
- if (!HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) {
+ if (isTypeLegal(VT) && !HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) {
// If type twice as wide legal, widen and use a mul plus a shift.
EVT WideVT = VT.widenIntegerElementType(*DAG.getContext());
- // Some targets like AMDGPU try to go from SDIV to SDIVREM which is then
+ // Some targets like AMDGPU try to go from UDIV to UDIVREM which is then
// custom lowered. This is very expensive so avoid it at all costs for
// constant divisors.
if ((!IsAfterLegalTypes && isOperationExpand(ISD::UDIV, VT) &&
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index b3667c6e17e6c..1c6b241cb8f12 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -2334,23 +2334,16 @@ define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: sv3i32_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #9363 // =0x2493
-; CHECK-SD-NEXT: mov w9, v0.s[2]
-; CHECK-SD-NEXT: movi v3.2s, #7
+; CHECK-SD-NEXT: movi v3.4s, #7
; CHECK-SD-NEXT: movk w8, #37449, lsl #16
-; CHECK-SD-NEXT: dup v1.2s, w8
-; CHECK-SD-NEXT: smull x8, w9, w8
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: add x8, x9, x8, lsr #32
-; CHECK-SD-NEXT: asr w10, w8, #2
-; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-SD-NEXT: add w8, w10, w8, lsr #31
-; CHECK-SD-NEXT: add v1.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
-; CHECK-SD-NEXT: add w8, w9, w8
-; CHECK-SD-NEXT: sshr v2.2s, v1.2s, #2
-; CHECK-SD-NEXT: usra v2.2s, v1.2s, #31
-; CHECK-SD-NEXT: mls v0.2s, v2.2s, v3.2s
-; CHECK-SD-NEXT: mov v0.s[2], w8
+; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #2
+; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sv3i32_7:
@@ -2386,21 +2379,15 @@ define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: sv3i32_100:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, v0.s[2]
-; CHECK-SD-NEXT: movi v2.2s, #100
+; CHECK-SD-NEXT: movi v3.4s, #100
; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: dup v1.2s, w8
-; CHECK-SD-NEXT: smull x8, w9, w8
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: asr x10, x8, #37
-; CHECK-SD-NEXT: add x8, x10, x8, lsr #63
-; CHECK-SD-NEXT: mov w10, #100 // =0x64
-; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37
-; CHECK-SD-NEXT: msub w8, w8, w10, w9
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
-; CHECK-SD-NEXT: mov v0.s[2], w8
+; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #5
+; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31
+; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sv3i32_100:
@@ -2560,26 +2547,16 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: uv3i32_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT: mov x9, #2684354560 // =0xa0000000
; CHECK-SD-NEXT: movk w8, #9362, lsl #16
-; CHECK-SD-NEXT: movk x9, #18724, lsl #32
-; CHECK-SD-NEXT: dup v1.2s, w8
-; CHECK-SD-NEXT: mov w8, v0.s[2]
-; CHECK-SD-NEXT: movk x9, #9362, lsl #48
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: umulh x9, x8, x9
-; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3
-; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0
-; CHECK-SD-NEXT: add w8, w8, w9
-; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1
-; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s
-; CHECK-SD-NEXT: movi v2.2s, #7
-; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #2
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
-; CHECK-SD-NEXT: mov v0.s[2], w8
+; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: sub v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: usra v1.4s, v2.4s, #1
+; CHECK-SD-NEXT: movi v2.4s, #7
+; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #2
+; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv3i32_7:
@@ -2630,19 +2607,14 @@ define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: uv3i32_100:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
-; CHECK-SD-NEXT: mov w9, v0.s[2]
-; CHECK-SD-NEXT: movi v2.2s, #100
; CHECK-SD-NEXT: movk w8, #20971, lsl #16
-; CHECK-SD-NEXT: mov w10, #100 // =0x64
-; CHECK-SD-NEXT: dup v1.2s, w8
-; CHECK-SD-NEXT: umull x8, w9, w8
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: lsr x8, x8, #37
-; CHECK-SD-NEXT: msub w8, w8, w10, w9
-; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #37
-; CHECK-SD-NEXT: xtn v1.2s, v1.2d
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
-; CHECK-SD-NEXT: mov v0.s[2], w8
+; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: movi v2.4s, #100
+; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #5
+; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv3i32_100:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll b/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll
index a5aa5c8d6d0b4..6cdc2794605b9 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll
@@ -51,9 +51,12 @@ entry:
define <8 x i64> @test_i64(<8 x i64> %shuffle) {
; LA32-LABEL: test_i64:
; LA32: # %bb.0: # %entry
-; LA32-NEXT: xvrepli.d $xr2, 3
-; LA32-NEXT: xvdiv.du $xr0, $xr0, $xr2
-; LA32-NEXT: xvdiv.du $xr1, $xr1, $xr2
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; LA32-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0)
+; LA32-NEXT: xvmuh.du $xr0, $xr0, $xr2
+; LA32-NEXT: xvsrli.d $xr0, $xr0, 1
+; LA32-NEXT: xvmuh.du $xr1, $xr1, $xr2
+; LA32-NEXT: xvsrli.d $xr1, $xr1, 1
; LA32-NEXT: ret
;
; LA64-LABEL: test_i64:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll b/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll
index df4da0178f389..a8e4c2a19b58b 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll
@@ -51,9 +51,12 @@ entry:
define <4 x i64> @test_i64(<4 x i64> %shuffle) {
; LA32-LABEL: test_i64:
; LA32: # %bb.0: # %entry
-; LA32-NEXT: vrepli.d $vr2, 3
-; LA32-NEXT: vdiv.du $vr0, $vr0, $vr2
-; LA32-NEXT: vdiv.du $vr1, $vr1, $vr2
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; LA32-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI3_0)
+; LA32-NEXT: vmuh.du $vr0, $vr0, $vr2
+; LA32-NEXT: vsrli.d $vr0, $vr0, 1
+; LA32-NEXT: vmuh.du $vr1, $vr1, $vr2
+; LA32-NEXT: vsrli.d $vr1, $vr1, 1
; LA32-NEXT: ret
;
; LA64-LABEL: test_i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 7cb00d40e60c0..9e2daa4a9a066 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1141,10 +1141,25 @@ define void @mulhu_v6i16(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: lui a1, %hi(.LCPI67_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0)
-; CHECK-NEXT: vle16.v v9, (a1)
-; CHECK-NEXT: vdivu.vv v8, v8, v9
+; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
+; CHECK-NEXT: vle16.v v10, (a1)
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NEXT: vmv.s.x v9, a1
+; CHECK-NEXT: li a1, 33
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vv v10, v8, v10
+; CHECK-NEXT: vsub.vv v8, v8, v10
+; CHECK-NEXT: vmulhu.vv v8, v8, v9
+; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
+; CHECK-NEXT: vsrl.vv v8, v8, v9
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
@@ -1287,9 +1302,16 @@ define void @mulhs_v6i16(ptr %x) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: li a1, 22
; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v9, -7
-; CHECK-NEXT: vmerge.vim v9, v9, 7, v0
-; CHECK-NEXT: vdiv.vv v8, v8, v9
+; CHECK-NEXT: lui a1, 1048571
+; CHECK-NEXT: addi a1, a1, 1755
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: lui a1, 5
+; CHECK-NEXT: addi a1, a1, -1755
+; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0
+; CHECK-NEXT: vmulh.vv v8, v8, v9
+; CHECK-NEXT: vsra.vi v8, v8, 1
+; CHECK-NEXT: vsrl.vi v9, v8, 15
+; CHECK-NEXT: vadd.vv v8, v8, v9
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 6f1f1fcb647f1..1931a45155cfb 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2213,121 +2213,186 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-LABEL: pr51133:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movq %rdi, %rax
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,u,0,u,41,u,183,u,1,u,1,u,161,u,221,u]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm5
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,u,1,u,0,u,1,u,1,u,1,u,0,u,1,u]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm7
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,u,0,u,1,u,0,u,0,u,255,u,0,u,1,u]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
+; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm7
+; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
+; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm8
+; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm6
+; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm6
+; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; CHECK-SSE2-NEXT: psraw $8, %xmm7
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm7
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm8
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; CHECK-SSE2-NEXT: psraw $8, %xmm8
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm8
+; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm8
+; CHECK-SSE2-NEXT: psrlw $7, %xmm6
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; CHECK-SSE2-NEXT: paddb %xmm8, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [114,u,1,u,50,u,7,u,2,u,8,u,97,u,117,u]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,u,103,u,183,u,171,u,61,u,1,u,127,u,183,u]
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm6
-; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1]
-; CHECK-SSE2-NEXT: psrlw $8, %xmm5
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [3,u,87,u,7,u,6,u,84,u,128,u,127,u,56,u]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm6
+; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
+; CHECK-SSE2-NEXT: psubb %xmm6, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm6
+; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm7
+; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm7
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255]
+; CHECK-SSE2-NEXT: pand %xmm0, %xmm6
+; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; CHECK-SSE2-NEXT: psraw $8, %xmm7
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [8,8,128,64,8,256,256,8]
+; CHECK-SSE2-NEXT: psrlw $8, %xmm7
+; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8
+; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm8
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
+; CHECK-SSE2-NEXT: psraw $8, %xmm6
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16]
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
-; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
-; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
-; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5
-; CHECK-SSE2-NEXT: por %xmm7, %xmm5
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,u,223,u,205,u,183,u,161,u,1,u,171,u,239,u]
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,u,205,u,27,u,241,u,1,u,1,u,1,u,163,u]
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm0
-; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64]
-; CHECK-SSE2-NEXT: psrlw $8, %xmm1
-; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32]
-; CHECK-SSE2-NEXT: psrlw $8, %xmm0
-; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
-; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3
-; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2
-; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx
-; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx
-; CHECK-SSE2-NEXT: shll $16, %edx
+; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
+; CHECK-SSE2-NEXT: psubb %xmm8, %xmm6
+; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [62,u,62,u,5,u,7,u,97,u,2,u,3,u,60,u]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [13,u,5,u,19,u,34,u,2,u,8,u,2,u,88,u]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm6
+; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
+; CHECK-SSE2-NEXT: psubb %xmm6, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm2
+; CHECK-SSE2-NEXT: por %xmm0, %xmm2
+; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm3
+; CHECK-SSE2-NEXT: por %xmm1, %xmm3
+; CHECK-SSE2-NEXT: pmovmskb %xmm3, %ecx
+; CHECK-SSE2-NEXT: notl %ecx
+; CHECK-SSE2-NEXT: shll $16, %ecx
+; CHECK-SSE2-NEXT: pmovmskb %xmm2, %edx
+; CHECK-SSE2-NEXT: xorl $65535, %edx # imm = 0xFFFF
; CHECK-SSE2-NEXT: orl %ecx, %edx
; CHECK-SSE2-NEXT: movl %edx, (%rdi)
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: pr51133:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE41-NEXT: movq %rdi, %rax
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
-; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
-; CHECK-SSE41-NEXT: psllw $8, %xmm6
-; CHECK-SSE41-NEXT: por %xmm0, %xmm6
-; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0
-; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,1,128,1,128,32,1,1]
-; CHECK-SSE41-NEXT: psrlw $8, %xmm0
+; CHECK-SSE41-NEXT: pxor %xmm4, %xmm4
+; CHECK-SSE41-NEXT: pxor %xmm5, %xmm5
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm5
+; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
+; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm7
+; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm7
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255]
+; CHECK-SSE41-NEXT: pand %xmm0, %xmm6
+; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6
+; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm5
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; CHECK-SSE41-NEXT: psraw $8, %xmm5
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [8,8,128,64,8,256,256,8]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm5
+; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
+; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm7
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
+; CHECK-SSE41-NEXT: psraw $8, %xmm6
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16]
; CHECK-SSE41-NEXT: psrlw $8, %xmm6
-; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7
-; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7
-; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm6
+; CHECK-SSE41-NEXT: psubb %xmm7, %xmm6
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60]
+; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7
+; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE41-NEXT: pand %xmm5, %xmm7
+; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60]
+; CHECK-SSE41-NEXT: psllw $8, %xmm6
+; CHECK-SSE41-NEXT: por %xmm7, %xmm6
+; CHECK-SSE41-NEXT: psubb %xmm6, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6
-; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1
-; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239]
-; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0
-; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239]
-; CHECK-SSE41-NEXT: psllw $8, %xmm4
-; CHECK-SSE41-NEXT: por %xmm0, %xmm4
-; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0
-; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,128,1,1,1,128,1,64]
-; CHECK-SSE41-NEXT: psrlw $8, %xmm0
-; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,128,32,128,32]
-; CHECK-SSE41-NEXT: psrlw $8, %xmm4
-; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm4
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
-; CHECK-SSE41-NEXT: pmaxub %xmm4, %xmm0
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
+; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm6
+; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
+; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm7
+; CHECK-SSE41-NEXT: packuswb %xmm6, %xmm7
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm8 = [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
+; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm8
+; CHECK-SSE41-NEXT: pand %xmm5, %xmm8
+; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
+; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
+; CHECK-SSE41-NEXT: psllw $8, %xmm6
+; CHECK-SSE41-NEXT: por %xmm8, %xmm6
+; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6
+; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm7
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; CHECK-SSE41-NEXT: psraw $8, %xmm7
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm7
+; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm8
+; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; CHECK-SSE41-NEXT: psraw $8, %xmm8
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8]
+; CHECK-SSE41-NEXT: psrlw $8, %xmm8
+; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8
+; CHECK-SSE41-NEXT: psrlw $7, %xmm6
+; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; CHECK-SSE41-NEXT: paddb %xmm8, %xmm6
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
+; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7
+; CHECK-SSE41-NEXT: pand %xmm5, %xmm7
+; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
+; CHECK-SSE41-NEXT: psllw $8, %xmm6
+; CHECK-SSE41-NEXT: por %xmm7, %xmm6
+; CHECK-SSE41-NEXT: psubb %xmm6, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm1
; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3
-; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2
-; CHECK-SSE41-NEXT: pandn %xmm0, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm3
+; CHECK-SSE41-NEXT: por %xmm1, %xmm3
+; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm2
+; CHECK-SSE41-NEXT: por %xmm0, %xmm2
; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx
+; CHECK-SSE41-NEXT: xorl $65535, %ecx # imm = 0xFFFF
; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx
+; CHECK-SSE41-NEXT: notl %edx
; CHECK-SSE41-NEXT: shll $16, %edx
; CHECK-SSE41-NEXT: orl %ecx, %edx
; CHECK-SSE41-NEXT: movl %edx, (%rdi)
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
index 0fb6eb3c58893..f9de4e18857c9 100644
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -6,155 +6,78 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
-; SSE-LABEL: fold_srem_vec_1:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %eax, %ecx
-; SSE-NEXT: movzwl %cx, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: shrl $15, %ecx
-; SSE-NEXT: sarl $9, %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $16, %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: movswl %dx, %esi
-; SSE-NEXT: shrl $15, %edx
-; SSE-NEXT: sarl $6, %esi
-; SSE-NEXT: addl %edx, %esi
-; SSE-NEXT: imull $95, %esi, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pextrw $1, %xmm0, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
-; SSE-NEXT: movl %edx, %esi
-; SSE-NEXT: shrl $31, %esi
-; SSE-NEXT: sarl $21, %edx
-; SSE-NEXT: addl %esi, %edx
-; SSE-NEXT: imull $-124, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73
-; SSE-NEXT: movl %edx, %esi
-; SSE-NEXT: shrl $31, %esi
-; SSE-NEXT: sarl $18, %edx
-; SSE-NEXT: addl %esi, %edx
-; SSE-NEXT: imull $98, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: fold_srem_vec_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,0,0,65535,0,0,0,0]
+; SSE2-NEXT: pmullw %xmm0, %xmm2
+; SSE2-NEXT: movq {{.*#+}} xmm1 = [44151,48623,2675,32081,0,0,0,0]
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psraw $4, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: psraw $1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,0,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: psrlw $15, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,65412,98,64533,u,u,u,u]
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: fold_srem_vec_1:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,0,0,65535,0,0,0,0]
+; SSE4-NEXT: pmullw %xmm0, %xmm1
+; SSE4-NEXT: movq {{.*#+}} xmm2 = [44151,48623,2675,32081,0,0,0,0]
+; SSE4-NEXT: pmulhw %xmm0, %xmm2
+; SSE4-NEXT: paddw %xmm1, %xmm2
+; SSE4-NEXT: movdqa %xmm2, %xmm1
+; SSE4-NEXT: psrlw $15, %xmm1
+; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,2048,16384,128,u,u,u,u]
+; SSE4-NEXT: paddw %xmm1, %xmm2
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,65412,98,64533,u,u,u,u]
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1OR2-LABEL: fold_srem_vec_1:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1OR2-NEXT: movswl %ax, %ecx
-; AVX1OR2-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
-; AVX1OR2-NEXT: shrl $16, %ecx
-; AVX1OR2-NEXT: subl %eax, %ecx
-; AVX1OR2-NEXT: movzwl %cx, %ecx
-; AVX1OR2-NEXT: movswl %cx, %edx
-; AVX1OR2-NEXT: shrl $15, %ecx
-; AVX1OR2-NEXT: sarl $9, %edx
-; AVX1OR2-NEXT: addl %ecx, %edx
-; AVX1OR2-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
-; AVX1OR2-NEXT: subl %ecx, %eax
-; AVX1OR2-NEXT: vmovd %xmm0, %ecx
-; AVX1OR2-NEXT: movswl %cx, %edx
-; AVX1OR2-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
-; AVX1OR2-NEXT: shrl $16, %edx
-; AVX1OR2-NEXT: addl %ecx, %edx
-; AVX1OR2-NEXT: movzwl %dx, %edx
-; AVX1OR2-NEXT: movswl %dx, %esi
-; AVX1OR2-NEXT: shrl $15, %edx
-; AVX1OR2-NEXT: sarl $6, %esi
-; AVX1OR2-NEXT: addl %edx, %esi
-; AVX1OR2-NEXT: imull $95, %esi, %edx
-; AVX1OR2-NEXT: subl %edx, %ecx
-; AVX1OR2-NEXT: vmovd %ecx, %xmm1
-; AVX1OR2-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX1OR2-NEXT: movswl %cx, %edx
-; AVX1OR2-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
-; AVX1OR2-NEXT: movl %edx, %esi
-; AVX1OR2-NEXT: shrl $31, %esi
-; AVX1OR2-NEXT: sarl $21, %edx
-; AVX1OR2-NEXT: addl %esi, %edx
-; AVX1OR2-NEXT: imull $-124, %edx, %edx
-; AVX1OR2-NEXT: subl %edx, %ecx
-; AVX1OR2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX1OR2-NEXT: movswl %cx, %edx
-; AVX1OR2-NEXT: imull $2675, %edx, %edx # imm = 0xA73
-; AVX1OR2-NEXT: movl %edx, %esi
-; AVX1OR2-NEXT: shrl $31, %esi
-; AVX1OR2-NEXT: sarl $18, %edx
-; AVX1OR2-NEXT: addl %esi, %edx
-; AVX1OR2-NEXT: imull $98, %edx, %edx
-; AVX1OR2-NEXT: subl %edx, %ecx
-; AVX1OR2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
-; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u]
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,2048,16384,128,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: fold_srem_vec_1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: movswl %ax, %ecx
-; AVX512-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
-; AVX512-NEXT: shrl $16, %ecx
-; AVX512-NEXT: subl %eax, %ecx
-; AVX512-NEXT: movzwl %cx, %edx
-; AVX512-NEXT: movswl %dx, %ecx
-; AVX512-NEXT: shrl $15, %edx
-; AVX512-NEXT: sarl $9, %ecx
-; AVX512-NEXT: addl %edx, %ecx
-; AVX512-NEXT: vmovd %xmm0, %edx
-; AVX512-NEXT: movswl %dx, %esi
-; AVX512-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77
-; AVX512-NEXT: shrl $16, %esi
-; AVX512-NEXT: addl %edx, %esi
-; AVX512-NEXT: movzwl %si, %esi
-; AVX512-NEXT: movswl %si, %edi
-; AVX512-NEXT: shrl $15, %esi
-; AVX512-NEXT: sarl $6, %edi
-; AVX512-NEXT: addl %esi, %edi
-; AVX512-NEXT: imull $95, %edi, %esi
-; AVX512-NEXT: subl %esi, %edx
-; AVX512-NEXT: vmovd %edx, %xmm1
-; AVX512-NEXT: vpextrw $1, %xmm0, %edx
-; AVX512-NEXT: movswl %dx, %esi
-; AVX512-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF
-; AVX512-NEXT: movl %esi, %edi
-; AVX512-NEXT: shrl $31, %edi
-; AVX512-NEXT: sarl $21, %esi
-; AVX512-NEXT: addl %edi, %esi
-; AVX512-NEXT: imull $-1003, %ecx, %ecx # imm = 0xFC15
-; AVX512-NEXT: imull $-124, %esi, %esi
-; AVX512-NEXT: subl %esi, %edx
-; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
-; AVX512-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512-NEXT: subl %ecx, %eax
-; AVX512-NEXT: movswl %dx, %ecx
-; AVX512-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX512-NEXT: movl %ecx, %esi
-; AVX512-NEXT: shrl $31, %esi
-; AVX512-NEXT: sarl $18, %ecx
-; AVX512-NEXT: addl %esi, %ecx
-; AVX512-NEXT: imull $98, %ecx, %ecx
-; AVX512-NEXT: subl %ecx, %edx
-; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u]
+; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
ret <4 x i16> %1
@@ -163,25 +86,25 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_2:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $6, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u]
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_srem_vec_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -193,14 +116,14 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; SSE2-LABEL: combine_srem_sdiv:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE2-NEXT: pmulhw %xmm0, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $15, %xmm2
; SSE2-NEXT: psraw $6, %xmm1
; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u]
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -208,7 +131,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
;
; SSE4-LABEL: combine_srem_sdiv:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE4-NEXT: pmulhw %xmm0, %xmm1
; SSE4-NEXT: paddw %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm1, %xmm2
@@ -223,12 +146,12 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
;
; AVX-LABEL: combine_srem_sdiv:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u]
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -240,248 +163,237 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; Don't fold for divisors that are a power of two.
define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
-; SSE-LABEL: dont_fold_srem_power_of_two:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: leal 31(%rax), %ecx
-; SSE-NEXT: testw %ax, %ax
-; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-32, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: leal 63(%rcx), %edx
-; SSE-NEXT: testw %cx, %cx
-; SSE-NEXT: cmovnsl %ecx, %edx
-; SSE-NEXT: andl $-64, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm0
-; SSE-NEXT: pinsrw $1, %eax, %xmm0
-; SSE-NEXT: pextrw $2, %xmm1, %eax
-; SSE-NEXT: leal 7(%rax), %ecx
-; SSE-NEXT: testw %ax, %ax
-; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-8, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm0
-; SSE-NEXT: pextrw $3, %xmm1, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: addl %eax, %ecx
-; SSE-NEXT: movzwl %cx, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: shrl $15, %ecx
-; SSE-NEXT: sarl $6, %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: imull $95, %edx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_srem_power_of_two:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0]
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psraw $4, %xmm4
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psraw $2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psraw $5, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: psrlw $15, %xmm1
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u]
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: dont_fold_srem_power_of_two:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: leal 31(%rax), %ecx
-; AVX-NEXT: testw %ax, %ax
-; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-32, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: leal 63(%rcx), %edx
-; AVX-NEXT: testw %cx, %cx
-; AVX-NEXT: cmovnsl %ecx, %edx
-; AVX-NEXT: andl $-64, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: leal 7(%rax), %ecx
-; AVX-NEXT: testw %ax, %ax
-; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-8, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: shrl $15, %ecx
-; AVX-NEXT: sarl $6, %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: imull $95, %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSE4-LABEL: dont_fold_srem_power_of_two:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0]
+; SSE4-NEXT: pmulhw %xmm0, %xmm1
+; SSE4-NEXT: paddw %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: psrlw $15, %xmm2
+; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2048,4096,16384,1024,u,u,u,u]
+; SSE4-NEXT: paddw %xmm2, %xmm1
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u]
+; SSE4-NEXT: psubw %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1OR2-LABEL: dont_fold_srem_power_of_two:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm0, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2048,4096,16384,1024,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_srem_power_of_two:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
ret <4 x i16> %1
}
; Don't fold if the divisor is one.
define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
-; SSE-LABEL: dont_fold_srem_one:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %ecx
-; SSE-NEXT: movswl %cx, %eax
-; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217
-; SSE-NEXT: shrl $16, %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %edx
-; SSE-NEXT: movswl %dx, %eax
-; SSE-NEXT: shrl $15, %edx
-; SSE-NEXT: sarl $4, %eax
-; SSE-NEXT: addl %edx, %eax
-; SSE-NEXT: leal (%rax,%rax,2), %edx
-; SSE-NEXT: shll $3, %edx
-; SSE-NEXT: subl %edx, %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: pextrw $1, %xmm0, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B
-; SSE-NEXT: movl %edx, %esi
-; SSE-NEXT: shrl $31, %esi
-; SSE-NEXT: sarl $23, %edx
-; SSE-NEXT: addl %esi, %edx
-; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
-; SSE-NEXT: movl %ecx, %edx
-; SSE-NEXT: shrl $31, %edx
-; SSE-NEXT: sarl $26, %ecx
-; SSE-NEXT: addl %edx, %ecx
-; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_srem_one:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,12827,45591,12375,0,0,0,0]
+; SSE2-NEXT: pmulhw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psraw $4, %xmm5
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psraw $10, %xmm5
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: psraw $7, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlw $15, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: paddw %xmm3, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,654,23,5423,u,u,u,u]
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: dont_fold_srem_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: shrl $15, %ecx
-; AVX-NEXT: sarl $4, %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $23, %ecx
-; AVX-NEXT: addl %esi, %ecx
-; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $26, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSE4-LABEL: dont_fold_srem_one:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4,5,6,7]
+; SSE4-NEXT: movq {{.*#+}} xmm3 = [0,12827,45591,12375,0,0,0,0]
+; SSE4-NEXT: pmulhw %xmm0, %xmm3
+; SSE4-NEXT: paddw %xmm2, %xmm3
+; SSE4-NEXT: movdqa %xmm3, %xmm2
+; SSE4-NEXT: psrlw $15, %xmm2
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5,6,7]
+; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,512,4096,64,u,u,u,u]
+; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3,4,5,6,7]
+; SSE4-NEXT: paddw %xmm2, %xmm3
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u]
+; SSE4-NEXT: psubw %xmm3, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1OR2-LABEL: dont_fold_srem_one:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7]
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,512,4096,64,u,u,u,u]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_srem_one:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7]
+; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
ret <4 x i16> %1
}
; Don't fold if the divisor is 2^15.
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
-; SSE-LABEL: dont_fold_urem_i16_smax:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: addl %eax, %ecx
-; SSE-NEXT: movzwl %cx, %ecx
-; SSE-NEXT: movswl %cx, %edx
-; SSE-NEXT: shrl $15, %ecx
-; SSE-NEXT: sarl $4, %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: leal 32767(%rax), %ecx
-; SSE-NEXT: testw %ax, %ax
-; SSE-NEXT: cmovnsl %eax, %ecx
-; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000
-; SSE-NEXT: addl %eax, %ecx
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movswl %ax, %ecx
-; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
-; SSE-NEXT: movl %ecx, %edx
-; SSE-NEXT: shrl $31, %edx
-; SSE-NEXT: sarl $26, %ecx
-; SSE-NEXT: addl %edx, %ecx
-; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_urem_i16_smax:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,65535,1,0,0,0,0,0]
+; SSE2-NEXT: pmullw %xmm0, %xmm2
+; SSE2-NEXT: movq {{.*#+}} xmm1 = [0,32767,45591,12375,0,0,0,0]
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: psraw $4, %xmm5
+; SSE2-NEXT: pandn %xmm5, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: psraw $2, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $15, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,32768,23,5423,u,u,u,u]
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: dont_fold_urem_i16_smax:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: shrl $15, %ecx
-; AVX-NEXT: sarl $4, %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: leal 32767(%rax), %ecx
-; AVX-NEXT: testw %ax, %ax
-; AVX-NEXT: cmovnsl %eax, %ecx
-; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $26, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSE4-LABEL: dont_fold_urem_i16_smax:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,65535,1,0,0,0,0,0]
+; SSE4-NEXT: pmullw %xmm0, %xmm1
+; SSE4-NEXT: movq {{.*#+}} xmm2 = [0,32767,45591,12375,0,0,0,0]
+; SSE4-NEXT: pmulhw %xmm0, %xmm2
+; SSE4-NEXT: paddw %xmm1, %xmm2
+; SSE4-NEXT: movdqa %xmm2, %xmm3
+; SSE4-NEXT: psrlw $15, %xmm3
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7]
+; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,4,4096,64,u,u,u,u]
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE4-NEXT: paddw %xmm4, %xmm2
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,32768,23,5423,u,u,u,u]
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1OR2-LABEL: dont_fold_urem_i16_smax:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u]
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7]
+; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,4,4096,64,u,u,u,u]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1OR2-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_urem_i16_smax:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u]
+; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7]
+; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
ret <4 x i16> %1
}
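
(For readers following the test delta: the signed-remainder identity these CHECK lines encode is the standard magic-number expansion. Below is a minimal standalone sketch, not LLVM code; the constants 12827, the shift of 23, and the divisor 654 are taken from the scalar sequence deleted above, and it exhaustively verifies the i16 case. An arithmetic right shift on signed values is assumed, as on all LLVM hosts.)

#include <cassert>
#include <cstdint>

// srem by 654 via multiply-high: q = (x * 12827) >> 23, plus the shifted-out
// sign bit so the floor division becomes truncation, then r = x - 654 * q.
int16_t srem654(int16_t x) {
  int32_t Prod = (int32_t)x * 12827;   // |x| * 12827 fits comfortably in i32
  int32_t Quot = (Prod >> 23) + (int32_t)((uint32_t)Prod >> 31);
  return (int16_t)(x - Quot * 654);
}

int main() {
  for (int X = INT16_MIN; X <= INT16_MAX; ++X)
    assert(srem654((int16_t)X) == (int16_t)X % 654);
}
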
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 3d0d73be9a589..3faa2a0720d4e 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -6,84 +6,62 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
-; SSE-LABEL: fold_urem_vec_1:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT: shrl $19, %ecx
-; SSE-NEXT: imull $124, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %edx
-; SSE-NEXT: imull $95, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT: shrl $17, %ecx
-; SSE-NEXT: imull $98, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $9, %edx
-; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: fold_urem_vec_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,0,0,64,0,128,0,0,0,0,0,0,0,0,0,0]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [44151,16913,2675,1373,u,u,u,u]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubw %xmm2, %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,32768,u,u,u,u]
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1024,8192,32768,128,u,u,u,u]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: fold_urem_vec_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT: shrl $19, %ecx
-; AVX-NEXT: imull $124, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $22, %edx
-; AVX-NEXT: imull $95, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT: shrl $17, %ecx
-; AVX-NEXT: imull $98, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $9, %edx
-; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSE4-LABEL: fold_urem_vec_1:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
+; SSE4-NEXT: pmulhuw %xmm0, %xmm1
+; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [44151,16913,2675,1373,u,u,u,u]
+; SSE4-NEXT: movdqa %xmm0, %xmm2
+; SSE4-NEXT: psubw %xmm1, %xmm2
+; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,32768,u,u,u,u]
+; SSE4-NEXT: paddw %xmm1, %xmm2
+; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,8192,32768,128,u,u,u,u]
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,124,98,1003,u,u,u,u]
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1OR2-LABEL: fold_urem_vec_1:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,16384,32768,u,u,u,u,u]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,8192,32768,128,u,u,u,u]
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: fold_urem_vec_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
ret <4 x i16> %1
}
@@ -91,18 +69,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psrlw $6, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u]
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -114,10 +92,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE2-LABEL: combine_urem_udiv:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE2-NEXT: pmulhuw %xmm0, %xmm1
; SSE2-NEXT: psrlw $6, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u]
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -125,7 +103,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
;
; SSE4-LABEL: combine_urem_udiv:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
; SSE4-NEXT: psrlw $6, %xmm1
; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
@@ -136,9 +114,9 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
;
; AVX-LABEL: combine_urem_udiv:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u]
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -152,92 +130,44 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE2-LABEL: dont_fold_urem_power_of_two:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pextrw $1, %xmm0, %eax
-; SSE2-NEXT: andl $31, %eax
-; SSE2-NEXT: pinsrw $1, %eax, %xmm1
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
-; SSE2-NEXT: andl $7, %eax
-; SSE2-NEXT: pinsrw $2, %eax, %xmm1
-; SSE2-NEXT: pextrw $3, %xmm0, %eax
-; SSE2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; SSE2-NEXT: shrl $22, %ecx
-; SSE2-NEXT: imull $95, %ecx, %ecx
-; SSE2-NEXT: subl %ecx, %eax
-; SSE2-NEXT: pinsrw $3, %eax, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: psrlw $6, %xmm1
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u]
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: dont_fold_urem_power_of_two:
; SSE4: # %bb.0:
-; SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
-; SSE4-NEXT: pand %xmm0, %xmm1
-; SSE4-NEXT: pextrw $1, %xmm0, %eax
-; SSE4-NEXT: andl $31, %eax
-; SSE4-NEXT: pinsrw $1, %eax, %xmm1
-; SSE4-NEXT: pextrw $2, %xmm0, %eax
-; SSE4-NEXT: andl $7, %eax
-; SSE4-NEXT: pinsrw $2, %eax, %xmm1
-; SSE4-NEXT: pextrw $3, %xmm0, %eax
-; SSE4-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; SSE4-NEXT: shrl $22, %ecx
-; SSE4-NEXT: imull $95, %ecx, %ecx
-; SSE4-NEXT: subl %ecx, %eax
-; SSE4-NEXT: pinsrw $3, %eax, %xmm1
-; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0]
+; SSE4-NEXT: pmulhuw %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: psrlw $6, %xmm2
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u]
+; SSE4-NEXT: psubw %xmm2, %xmm0
; SSE4-NEXT: retq
;
-; AVX1-LABEL: dont_fold_urem_power_of_two:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: andl $31, %eax
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX1-NEXT: shrl $22, %ecx
-; AVX1-NEXT: imull $95, %ecx, %ecx
-; AVX1-NEXT: subl %ecx, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: dont_fold_urem_power_of_two:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: andl $31, %eax
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX2-NEXT: shrl $22, %ecx
-; AVX2-NEXT: imull $95, %ecx, %ecx
-; AVX2-NEXT: subl %ecx, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: dont_fold_urem_power_of_two:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u]
+; AVX1OR2-NEXT: vpsrlw $6, %xmm1, %xmm2
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: dont_fold_urem_power_of_two:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
-; AVX512-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-NEXT: andl $31, %eax
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vpextrw $2, %xmm0, %eax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX512-NEXT: shrl $22, %ecx
-; AVX512-NEXT: imull $95, %ecx, %ecx
-; AVX512-NEXT: subl %ecx, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u]
+; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
ret <4 x i16> %1
@@ -245,98 +175,58 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; Don't fold if the divisor is one.
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
-; SSE-LABEL: dont_fold_urem_one:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $4, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; SSE-NEXT: shrl $25, %ecx
-; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT: shrl $26, %ecx
-; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_urem_one:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,51307,25645,12375,0,0,0,0]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubw %xmm2, %xmm3
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,0,32768,0,u,u,u,u]
+; SSE2-NEXT: paddw %xmm2, %xmm3
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,128,0,0,16,64,0,u,u,u,u,u,u,u,u]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u]
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: dont_fold_urem_one:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,51307,25645,12375,0,0,0,0]
+; SSE4-NEXT: pmulhuw %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm0, %xmm2
+; SSE4-NEXT: psubw %xmm1, %xmm2
+; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,0,32768,0,u,u,u,u]
+; SSE4-NEXT: paddw %xmm1, %xmm2
+; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,128,4096,64,u,u,u,u]
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1OR2-LABEL: dont_fold_urem_one:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX1OR2-NEXT: shrl $16, %ecx
-; AVX1OR2-NEXT: movl %eax, %edx
-; AVX1OR2-NEXT: subl %ecx, %edx
-; AVX1OR2-NEXT: movzwl %dx, %edx
-; AVX1OR2-NEXT: shrl %edx
-; AVX1OR2-NEXT: addl %ecx, %edx
-; AVX1OR2-NEXT: shrl $4, %edx
-; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX1OR2-NEXT: shll $3, %ecx
-; AVX1OR2-NEXT: subl %ecx, %edx
-; AVX1OR2-NEXT: addl %eax, %edx
-; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX1OR2-NEXT: shrl $25, %ecx
-; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; AVX1OR2-NEXT: subl %ecx, %eax
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX1OR2-NEXT: shrl $26, %ecx
-; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX1OR2-NEXT: subl %ecx, %eax
-; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u]
+; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,128,4096,64,u,u,u,u]
+; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
+; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: dont_fold_urem_one:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrw $2, %xmm0, %eax
-; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX512-NEXT: shrl $16, %ecx
-; AVX512-NEXT: movl %eax, %edx
-; AVX512-NEXT: subl %ecx, %edx
-; AVX512-NEXT: movzwl %dx, %edx
-; AVX512-NEXT: shrl %edx
-; AVX512-NEXT: addl %ecx, %edx
-; AVX512-NEXT: shrl $4, %edx
-; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX512-NEXT: shll $3, %ecx
-; AVX512-NEXT: subl %ecx, %edx
-; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B
-; AVX512-NEXT: shrl $25, %eax
-; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E
-; AVX512-NEXT: subl %eax, %ecx
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX512-NEXT: shrl $26, %ecx
-; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX512-NEXT: subl %ecx, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u]
+; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
ret <4 x i16> %1
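
(Similarly, the urem CHECK lines above encode the unsigned expansion with the subtract/shift/add fixup that is used when the magic multiplier needs 17 bits. Another minimal standalone sketch, again not LLVM code; 25645, the shifts by 1 and 4, and the divisor 23 match the scalar sequence deleted above.)

#include <cassert>
#include <cstdint>

// urem by 23: t = mulhu(x, 25645); q = (((x - t) >> 1) + t) >> 4;
// r = x - 23 * q. The full 17-bit magic is 2^16 + 25645; the (x - t) >> 1
// step recovers the missing top bit without a wider multiply.
uint16_t urem23(uint16_t x) {
  uint32_t T = ((uint32_t)x * 25645u) >> 16; // high half of the 16x16 multiply
  uint32_t Q = (((x - T) >> 1) + T) >> 4;
  return (uint16_t)(x - Q * 23);
}

int main() {
  for (uint32_t X = 0; X <= UINT16_MAX; ++X)
    assert(urem23((uint16_t)X) == X % 23);
}
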
diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
index 10a840218c864..1d9977e6d6287 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -6,7 +6,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X64-LABEL: test_udiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -26,7 +26,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -51,7 +51,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X64-LABEL: test_urem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -76,7 +76,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -106,7 +106,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
; X64-LABEL: test_sdiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u]
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -132,7 +132,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
; X86-NEXT: movdqa %xmm1, %xmm0
; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -163,7 +163,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
; X64-LABEL: test_srem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u]
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pmuludq %xmm1, %xmm2
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -193,7 +193,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: pmuludq %xmm2, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
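
(The v2i32 constants follow the same pattern at 32 bits: 613566757, i.e. 0x24924925, is the unsigned magic for 7, and 2454267027, i.e. 0x92492493, the signed one. As a rough sketch of the signed case; the add-back and shift steps fall outside the hunks quoted above, so this follows the textbook sequence rather than the exact test output.)

#include <cassert>
#include <cstdint>

// sdiv by 7: hi = mulhs(x, 0x92492493); q = hi + x (the magic, read as a
// signed value, is negative, so x is added back); then an arithmetic shift
// by 2 and a sign-bit round-up to truncate toward zero.
int32_t sdiv7(int32_t x) {
  int32_t Hi = (int32_t)(((int64_t)x * (int32_t)0x92492493) >> 32);
  int32_t Q = Hi + x;
  return (Q >> 2) + (int32_t)((uint32_t)Q >> 31);
}

int main() {
  for (int32_t X : {INT32_MIN, -100, -7, -1, 0, 6, 7, 100, INT32_MAX})
    assert(sdiv7(X) == X / 7);
}
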