[llvm] [X86] Add ISD::MULHS/MULHU v4i64/v8i64 lowering (PR #169819)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 27 07:07:12 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/169819
Avoid scalarisation of v4i64/v8i64 div-by-constant patterns by expanding ISD::MULHS/MULHU on AVX2+ targets
The generic ISD::MULHS/MULHU expansion provided by forceExpandMultiply is particularly useful as it matches very well with VPMULDQ/VPMULUDQ patterns.
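As a rough scalar sketch of the unsigned case (for illustration only - not necessarily the exact node sequence forceExpandMultiply builds), the expansion is the usual half-word decomposition, and each 32x32->64 partial product on masked/shifted operands is exactly what VPMULUDQ computes per lane:

  define i64 @mulhu64_sketch(i64 %a, i64 %b) {
    ; split both operands into 32-bit halves
    %aL = and i64 %a, 4294967295
    %aH = lshr i64 %a, 32
    %bL = and i64 %b, 4294967295
    %bH = lshr i64 %b, 32
    ; four 32x32->64 partial products (each maps onto a VPMULUDQ lane)
    %ll = mul i64 %aL, %bL
    %hl = mul i64 %aH, %bL
    %lh = mul i64 %aL, %bH
    %hh = mul i64 %aH, %bH
    ; combine the partial products, propagating carries into the high half
    %ll.hi = lshr i64 %ll, 32
    %t1 = add i64 %hl, %ll.hi
    %t1.lo = and i64 %t1, 4294967295
    %t1.hi = lshr i64 %t1, 32
    %t2 = add i64 %lh, %t1.lo
    %t2.hi = lshr i64 %t2, 32
    %r = add i64 %hh, %t1.hi
    %hi = add i64 %r, %t2.hi
    ret i64 %hi
  }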
I couldn't make v2i64 expansion worthwhile on x64 targets (although it could be worth it for 32-bit if there's a demand).
ISD::MULHS on AVX2 targets is only a marginal win - the gain from avoiding xmm<->gpr traffic and possible IMULQ bottlenecks is partly offset by the poor v4i64 SRA handling.
Fixes #37771
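For reference, the kind of pattern this stops scalarising is a straight v4i64 division by a non-power-of-two constant, as in the existing vector-idiv-sdiv-256.ll test:

  define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
    %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
    ret <4 x i64> %res
  }

With this patch the sdiv is rewritten in terms of ISD::MULHS and lowered to the VPMULDQ/VPMULUDQ sequences shown in the updated CHECK lines below, instead of four IMULQ round-trips through the GPRs.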
From 9e43856ba347fb1356764199de45a7804e02c683 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 27 Nov 2025 15:03:10 +0000
Subject: [PATCH] [X86] Add ISD::MULHS/MULHU v4i64/v8i64 lowering
Avoid scalarisation of v4i64/v8i64 div-by-constant patterns by expanding ISD::MULHS/MULHU on AVX2+ targets
The generic ISD::MULHS/MULHU expansion provided by forceExpandMultiply is particularly useful as it matches very well with VPMULDQ/VPMULUDQ patterns.
I couldn't make v2i64 expansion worthwhile on x64 targets (although it could be worth it for 32-bit if there's a demand).
ISD::MULHS on AVX2 targets is only a marginal win - the gain from avoiding xmm<->gpr traffic and possible IMULQ bottlenecks is partly offset by the poor v4i64 SRA handling.
Fixes #37771
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +
llvm/test/CodeGen/X86/srem-vector-lkk.ll | 155 ++++++------
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 131 +++++-----
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 228 +++++++++++-------
llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 219 ++++-------------
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 128 ++++------
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 214 +++-------------
8 files changed, 425 insertions(+), 663 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..28e6ccb7f3dca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11061,7 +11061,7 @@ void TargetLowering::forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl,
// Hacker's Delight (itself derived from Knuth's Algorithm M from section
// 4.3.1). If Signed is set, we can use arithmetic right shifts to propagate
// sign bits while calculating the Hi half.
- unsigned Bits = VT.getSizeInBits();
+ unsigned Bits = VT.getScalarSizeInBits();
unsigned HalfBits = Bits / 2;
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 84cca4a6eb269..20b29f31c9a2a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1687,6 +1687,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
if (HasInt256) {
+ setOperationAction(ISD::MULHS, MVT::v4i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i64, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
@@ -1940,6 +1942,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i64, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
@@ -29888,6 +29892,13 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG, dl);
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i64) {
+ SDValue Lo, Hi;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TLI.forceExpandMultiply(DAG, dl, IsSigned, Lo, Hi, A, B);
+ return Hi;
+ }
+
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
index 0fb6eb3c58893..60ac025c2ceaf 100644
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -622,90 +622,85 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
;
; AVX2-LABEL: dont_fold_srem_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $4, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax
-; AVX2-NEXT: shlq $3, %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $11, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $8, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,1681210440,18446744072402387656,1621997606]
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,4294967295,0,0,0]
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
+; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = [0,105075653,1493901669,3856996263]
+; AVX2-NEXT: vpmuludq %ymm5, %ymm4, %ymm6
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm7
+; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm7
+; AVX2-NEXT: vpmuludq %ymm5, %ymm7, %ymm8
+; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
+; AVX2-NEXT: vpaddq %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4],ymm8[5],ymm2[6],ymm8[7]
+; AVX2-NEXT: vpsrad $31, %ymm5, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm7, %ymm5
+; AVX2-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [9223372036854775808,36028797018963968,576460752303423488,4503599627370496]
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [1,654,23,5423]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: dont_fold_srem_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: addq %rcx, %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $4, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: leaq (%rdx,%rdx,2), %rax
-; AVX512-NEXT: shlq $3, %rax
-; AVX512-NEXT: subq %rax, %rdx
-; AVX512-NEXT: addq %rcx, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512-NEXT: vmovq %rdx, %xmm1
-; AVX512-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $11, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm2
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
-; AVX512-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $8, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpsraq $32, %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,105075653,1493901669,3856996263]
+; AVX512-NEXT: vpmullq %ymm2, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsraq $32, %ymm2, %ymm3
+; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1681210440,18446744072402387656,1621997606]
+; AVX512-NEXT: vpmullq %ymm6, %ymm5, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
+; AVX512-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX512-NEXT: vpsraq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpmuldq %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX512-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpsrlq $63, %ymm1, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,654,23,5423]
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 3d0d73be9a589..21e0b02335450 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -479,78 +479,73 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; AVX2-LABEL: dont_fold_urem_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $4, %rax
-; AVX2-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX2-NEXT: shlq $3, %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $12, %rdx
-; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $7, %rdx
-; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = [0,105075653,2987803337,3419025229]
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = [0,1681210440,1680639376,3243995213]
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = [0,0,2147483648,0]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [1,654,23,5423]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: dont_fold_urem_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rdx
-; AVX512-NEXT: movabsq $7218291159277650633, %rax # imm = 0x642C8590B21642C9
-; AVX512-NEXT: mulxq %rax, %rax, %rax
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: shrq %rcx
-; AVX512-NEXT: addq %rax, %rcx
-; AVX512-NEXT: shrq $4, %rcx
-; AVX512-NEXT: leaq (%rcx,%rcx,2), %rax
-; AVX512-NEXT: shlq $3, %rax
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: addq %rdx, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: movabsq $-4513890722074972339, %rax # imm = 0xC15B704DCBCA2F4D
-; AVX512-NEXT: mulxq %rax, %rax, %rax
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: shrq $12, %rax
-; AVX512-NEXT: imulq $5423, %rax, %rax # imm = 0x152F
-; AVX512-NEXT: subq %rax, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: movq %rax, %rdx
-; AVX512-NEXT: shrq %rdx
-; AVX512-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5
-; AVX512-NEXT: mulxq %rcx, %rcx, %rcx
-; AVX512-NEXT: shrq $7, %rcx
-; AVX512-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E
-; AVX512-NEXT: subq %rcx, %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,105075653,2987803337,3419025229]
+; AVX512-NEXT: vpmuludq %ymm3, %ymm2, %ymm4
+; AVX512-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1681210440,1680639376,3243995213]
+; AVX512-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm4, %ymm1, %ymm1
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512-NEXT: vpmuludq %ymm5, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,2147483648,0]
+; AVX512-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,654,23,5423]
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index fa5692aa9cef1..bde671cb4f4a8 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -45,42 +45,73 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_4i64:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757]
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2NOBW-NEXT: vpsrad $31, %ymm0, %ymm4
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm4, %ymm4
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm4, %ymm5
+; AVX2NOBW-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2NOBW-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrad $31, %ymm2, %ymm3
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
+; AVX2NOBW-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1227133513,1227133513,1227133513,1227133513]
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpsrad $31, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2NOBW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm4, %ymm2
+; AVX2NOBW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: vpsrad $1, %ymm0, %ymm1
+; AVX2NOBW-NEXT: vpsrlq $1, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2NOBW-NEXT: vpsrlq $63, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_4i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757]
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm4
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm4, %ymm5
+; AVX512BW-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX512BW-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsraq $32, %zmm2, %zmm3
+; AVX512BW-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1227133513,1227133513,1227133513,1227133513]
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm4, %ymm2
+; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpsrlq $63, %ymm0, %ymm1
+; AVX512BW-NEXT: vpsraq $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: retq
%res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
}
@@ -389,58 +420,79 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_rem7_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rsi
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rsi
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rsi
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rsi
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_4i64:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757]
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2NOBW-NEXT: vpsrad $31, %ymm0, %ymm4
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm4, %ymm4
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm4, %ymm5
+; AVX2NOBW-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2NOBW-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrad $31, %ymm2, %ymm3
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
+; AVX2NOBW-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1227133513,1227133513,1227133513,1227133513]
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm0, %ymm6
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm6, %ymm2
+; AVX2NOBW-NEXT: vpsrad $31, %ymm2, %ymm6
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4],ymm6[5],ymm2[6],ymm6[7]
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmuludq %ymm5, %ymm4, %ymm3
+; AVX2NOBW-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrad $1, %ymm1, %ymm2
+; AVX2NOBW-NEXT: vpsrlq $1, %ymm1, %ymm3
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2NOBW-NEXT: vpsrlq $63, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpsllq $3, %ymm1, %ymm2
+; AVX2NOBW-NEXT: vpsubq %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_4i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757]
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm4
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm4, %ymm5
+; AVX512BW-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX512BW-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpsraq $32, %zmm2, %zmm3
+; AVX512BW-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1227133513,1227133513,1227133513,1227133513]
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm0, %ymm6
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm6, %ymm2
+; AVX512BW-NEXT: vpsraq $32, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmuludq %ymm5, %ymm4, %ymm3
+; AVX512BW-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512BW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlq $63, %ymm1, %ymm2
+; AVX512BW-NEXT: vpsraq $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsllq $3, %ymm1, %ymm2
+; AVX512BW-NEXT: vpsubq %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: retq
%res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
}
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index b11756a5e3b4e..f8a3adf37852b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -9,73 +9,31 @@
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX: # %bb.0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rax
-; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: imulq %rcx
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: vpsraq $32, %zmm0, %zmm1
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX-NEXT: vpsrlq $32, %zmm1, %zmm4
+; AVX-NEXT: vpmuludq %zmm2, %zmm4, %zmm5
+; AVX-NEXT: vpsllq $32, %zmm5, %zmm5
+; AVX-NEXT: vpaddq %zmm5, %zmm3, %zmm3
+; AVX-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX-NEXT: vpsraq $32, %zmm2, %zmm3
+; AVX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm5 = [1227133513,1227133513,1227133513,1227133513,1227133513,1227133513,1227133513,1227133513]
+; AVX-NEXT: vpmuludq %zmm5, %zmm0, %zmm0
+; AVX-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX-NEXT: vpmuludq %zmm5, %zmm1, %zmm1
+; AVX-NEXT: vpmuludq %zmm5, %zmm4, %zmm2
+; AVX-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX-NEXT: vpsrlq $63, %zmm0, %zmm1
+; AVX-NEXT: vpsraq $1, %zmm0, %zmm0
+; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res
@@ -277,105 +235,34 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
; AVX: # %bb.0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: imulq %rsi
-; AVX-NEXT: movq %rdx, %rax
-; AVX-NEXT: shrq $63, %rax
-; AVX-NEXT: sarq %rdx
-; AVX-NEXT: addq %rax, %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
-; AVX-NEXT: vmovq %rdx, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: vpsraq $32, %zmm0, %zmm1
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX-NEXT: vpsrlq $32, %zmm1, %zmm4
+; AVX-NEXT: vpmuludq %zmm2, %zmm4, %zmm5
+; AVX-NEXT: vpsllq $32, %zmm5, %zmm5
+; AVX-NEXT: vpaddq %zmm5, %zmm3, %zmm3
+; AVX-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX-NEXT: vpsraq $32, %zmm2, %zmm3
+; AVX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm5 = [1227133513,1227133513,1227133513,1227133513,1227133513,1227133513,1227133513,1227133513]
+; AVX-NEXT: vpmuludq %zmm5, %zmm0, %zmm6
+; AVX-NEXT: vpaddq %zmm2, %zmm6, %zmm2
+; AVX-NEXT: vpsraq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX-NEXT: vpmuludq %zmm5, %zmm1, %zmm1
+; AVX-NEXT: vpmuludq %zmm5, %zmm4, %zmm3
+; AVX-NEXT: vpsllq $32, %zmm3, %zmm3
+; AVX-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpsrlq $63, %zmm1, %zmm2
+; AVX-NEXT: vpsraq $1, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpsllq $3, %zmm1, %zmm2
+; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index ca57359183312..37c416f18736a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -49,39 +49,25 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
;
; AVX2-LABEL: test_div7_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [613566756,613566756,613566756,613566756]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
@@ -423,59 +409,29 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
;
; AVX2-LABEL: test_rem7_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [613566756,613566756,613566756,613566756]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlq $2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $3, %ymm1, %ymm2
+; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index b8a131e628007..31d70041f9d0b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -9,73 +9,24 @@
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX: # %bb.0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm3
+; AVX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm4 = [613566756,613566756,613566756,613566756,613566756,613566756,613566756,613566756]
+; AVX-NEXT: vpmuludq %zmm4, %zmm0, %zmm5
+; AVX-NEXT: vpaddq %zmm2, %zmm5, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpmuludq %zmm4, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0
; AVX-NEXT: retq
%res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
@@ -295,113 +246,28 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
; AVX: # %bb.0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm3
+; AVX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
+; AVX-NEXT: vpbroadcastq {{.*#+}} zmm4 = [613566756,613566756,613566756,613566756,613566756,613566756,613566756,613566756]
+; AVX-NEXT: vpmuludq %zmm4, %zmm0, %zmm5
+; AVX-NEXT: vpaddq %zmm2, %zmm5, %zmm2
+; AVX-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX-NEXT: vpmuludq %zmm4, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpsubq %zmm1, %zmm0, %zmm2
+; AVX-NEXT: vpsrlq $1, %zmm2, %zmm2
+; AVX-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; AVX-NEXT: vpsrlq $2, %zmm1, %zmm1
+; AVX-NEXT: vpsllq $3, %zmm1, %zmm2
+; AVX-NEXT: vpsubq %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = urem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res