[llvm] [X86] Add ISD::MULHS/MULHU v4i64/v8i64 lowering (PR #169819)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 27 07:07:50 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-selectiondag
Author: Simon Pilgrim (RKSimon)
Changes
Avoid scalarisation of v4i64/v8i64 div-by-constant patterns by expanding ISD::MULHS/MULHU on AVX2+ targets.
The generic ISD::MULHS/MULHU expansion provided by forceExpandMultiply is particularly useful here, as it maps very well onto VPMULDQ/VPMULUDQ patterns.
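To make that mapping concrete, here is a minimal scalar sketch of the unsigned per-lane expansion (illustrative only, not code from the patch; the helper name and the __int128 check are assumptions): each 64-bit lane is split into 32-bit halves and the high half of the product is rebuilt from 32x32->64 partial products, which is exactly the operation VPMULUDQ supplies per lane.

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the per-lane MULHU expansion: rebuild the high 64 bits of a
// 64x64 multiply from four 32x32->64 partial products (Hacker's Delight /
// Knuth's Algorithm M). Each partial product is what VPMULUDQ computes per lane.
static uint64_t mulhu64_sketch(uint64_t a, uint64_t b) {
  uint64_t aLo = a & 0xffffffffu, aHi = a >> 32;
  uint64_t bLo = b & 0xffffffffu, bHi = b >> 32;
  uint64_t mid1 = aHi * bLo + ((aLo * bLo) >> 32); // carry in from the low product
  uint64_t mid2 = aLo * bHi + (mid1 & 0xffffffffu);
  return aHi * bHi + (mid1 >> 32) + (mid2 >> 32);
}

int main() {
  // Sanity check against a 128-bit reference (assumes a compiler with __int128).
  uint64_t a = 0x123456789abcdef0u, b = 0xfedcba9876543210u;
  assert(mulhu64_sketch(a, b) == (uint64_t)(((unsigned __int128)a * b) >> 64));
  return 0;
}
```

Per the comment in the patch, the signed variant of forceExpandMultiply follows the same structure but uses arithmetic right shifts to propagate sign bits while computing the Hi half.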
I couldn't make v2i64 expansion worthwhile on x64 targets (although it could be worth it for 32-bit targets if there's demand).
ISD::MULHS on AVX2 targets is only a marginal win: the benefit of avoiding xmm<->gpr traffic and possible IMULQ bottlenecks is partly offset by the poor v4i64 SRA handling.
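To sketch where that extra signed cost comes from (a scalar illustration using an equivalent formulation, not the patch's exact sequence, which propagates sign bits with arithmetic right shifts inside forceExpandMultiply): the signed high half can be derived from the unsigned one by subtracting sign-dependent correction terms, and it is that sign handling which runs into AVX2's weak v4i64 arithmetic shifts.

```cpp
#include <cassert>
#include <cstdint>

// Signed high half from the unsigned high half via the usual correction
// identity (wrapping uint64 arithmetic): when an operand is negative, its
// unsigned reinterpretation is larger by 2^64, so subtract the other operand
// once per negative input. The unsigned high half uses __int128 here for
// brevity; see the earlier sketch for the 32-bit-half decomposition.
static int64_t mulhs64_sketch(int64_t a, int64_t b) {
  uint64_t ua = (uint64_t)a, ub = (uint64_t)b;
  uint64_t hi = (uint64_t)(((unsigned __int128)ua * ub) >> 64);
  if (a < 0) hi -= ub;
  if (b < 0) hi -= ua;
  return (int64_t)hi;
}

int main() {
  // Check against a signed 128-bit reference (assumes __int128 support).
  int64_t a = -7, b = INT64_MAX / 3;
  assert(mulhs64_sketch(a, b) == (int64_t)(((__int128)a * b) >> 64));
  return 0;
}
```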
Fixes #37771
---
Patch is 52.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169819.diff
8 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+1-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+11)
- (modified) llvm/test/CodeGen/X86/srem-vector-lkk.ll (+75-80)
- (modified) llvm/test/CodeGen/X86/urem-vector-lkk.ll (+63-68)
- (modified) llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll (+140-88)
- (modified) llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll (+53-166)
- (modified) llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll (+42-86)
- (modified) llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll (+40-174)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..28e6ccb7f3dca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11061,7 +11061,7 @@ void TargetLowering::forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl,
// Hacker's Delight (itself derived from Knuth's Algorithm M from section
// 4.3.1). If Signed is set, we can use arithmetic right shifts to propagate
// sign bits while calculating the Hi half.
- unsigned Bits = VT.getSizeInBits();
+ unsigned Bits = VT.getScalarSizeInBits();
unsigned HalfBits = Bits / 2;
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 84cca4a6eb269..20b29f31c9a2a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1687,6 +1687,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
if (HasInt256) {
+ setOperationAction(ISD::MULHS, MVT::v4i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i64, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
@@ -1940,6 +1942,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i64, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
@@ -29888,6 +29892,13 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG, dl);
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i64) {
+ SDValue Lo, Hi;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TLI.forceExpandMultiply(DAG, dl, IsSigned, Lo, Hi, A, B);
+ return Hi;
+ }
+
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
index 0fb6eb3c58893..60ac025c2ceaf 100644
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -622,90 +622,85 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
;
; AVX2-LABEL: dont_fold_srem_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $4, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax
-; AVX2-NEXT: shlq $3, %rax
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: addq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $11, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: imulq %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq $8, %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,1681210440,18446744072402387656,1621997606]
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,4294967295,0,0,0]
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
+; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = [0,105075653,1493901669,3856996263]
+; AVX2-NEXT: vpmuludq %ymm5, %ymm4, %ymm6
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm7
+; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm7
+; AVX2-NEXT: vpmuludq %ymm5, %ymm7, %ymm8
+; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
+; AVX2-NEXT: vpaddq %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4],ymm8[5],ymm2[6],ymm8[7]
+; AVX2-NEXT: vpsrad $31, %ymm5, %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm7, %ymm5
+; AVX2-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7]
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [9223372036854775808,36028797018963968,576460752303423488,4503599627370496]
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [1,654,23,5423]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: dont_fold_srem_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: addq %rcx, %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $4, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: leaq (%rdx,%rdx,2), %rax
-; AVX512-NEXT: shlq $3, %rax
-; AVX512-NEXT: subq %rax, %rdx
-; AVX512-NEXT: addq %rcx, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512-NEXT: vmovq %rdx, %xmm1
-; AVX512-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $11, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm2
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
-; AVX512-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: imulq %rdx
-; AVX512-NEXT: movq %rdx, %rax
-; AVX512-NEXT: shrq $63, %rax
-; AVX512-NEXT: sarq $8, %rdx
-; AVX512-NEXT: addq %rax, %rdx
-; AVX512-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpsraq $32, %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,105075653,1493901669,3856996263]
+; AVX512-NEXT: vpmullq %ymm2, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsraq $32, %ymm2, %ymm3
+; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1681210440,18446744072402387656,1621997606]
+; AVX512-NEXT: vpmullq %ymm6, %ymm5, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
+; AVX512-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX512-NEXT: vpsraq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpmuldq %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
+; AVX512-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpsrlq $63, %ymm1, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,654,23,5423]
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 3d0d73be9a589..21e0b02335450 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -479,78 +479,73 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; AVX2-LABEL: dont_fold_urem_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $4, %rax
-; AVX2-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX2-NEXT: shlq $3, %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $12, %rdx
-; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $7, %rdx
-; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = [0,105075653,2987803337,3419025229]
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = [0,1681210440,1680639376,3243995213]
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = [0,0,2147483648,0]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [1,654,23,5423]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: dont_fold_urem_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rdx
-; AVX512-NEXT: movabsq $7218291159277650633, %rax # imm = 0x642C8590B21642C9
-; AVX512-NEXT: mulxq %rax, %rax, %rax
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: shrq %rcx
-; AVX512-NEXT: addq %rax, %rcx
-; AVX512-NEXT: shrq $4, %rcx
-; AVX512-NEXT: leaq (%rcx,%rcx,2), %rax
-; AVX512-NEXT: shlq $3, %rax
-; AVX512-NEXT: subq %rax, %rcx
-; AVX512-NEXT: addq %rdx, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: movabsq $-4513890722074972339, %rax # imm = 0xC15B704DCBCA2F4D
-; AVX512-NEXT: mulxq %rax, %rax, %rax
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: shrq $12, %rax
-; AVX512-NEXT: imulq $5423, %rax, %rax # imm = 0x152F
-; AVX512-NEXT: subq %rax, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: movq %rax, %rdx
-; AVX512-NEXT: shrq %rdx
-; AVX512-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5
-; AVX512-NEXT: mulxq %rcx, %rcx, %rcx
-; AVX512-NEXT: shrq $7, %rcx
-; AVX512-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E
-; AVX512-NEXT: subq %rcx, %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,105075653,2987803337,3419025229]
+; AVX512-NEXT: vpmuludq %ymm3, %ymm2, %ymm4
+; AVX512-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1681210440,1680639376,3243995213]
+; AVX512-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpaddq %ymm4, %ymm1, %ymm1
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512-NEXT: vpmuludq %ymm5, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,2147483648,0]
+; AVX512-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,654,23,5423]
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index fa5692aa9cef1..bde671cb4f4a8 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -45,42 +45,73 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: imulq %rcx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: sarq %rdx
-; AVX2-NEXT: addq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_4i64:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757]
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2NOBW-NEXT: vpsrad $31, %ymm0, %ymm4
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm4, %ymm4
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm4, %ymm5
+; AVX2NOBW-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2NOBW-NEXT: vpaddq %ymm5, %ymm3, %ymm3
+; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrad $31, %ymm2, %ymm3
+; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
+; AVX2NOBW-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX2NOBW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1227133513,1227133513,1227133513,1227133513]
+; AVX2NOBW-NEXT: vpmuludq %ymm5...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/169819