[llvm] fa0c433 - [X86] Attempt to use VPMADD52L/VPMULUDQ instead of VPMULLQ on slow VPMULLQ targets (or when VPMULLQ is unavailable) (#171760)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 19 09:08:30 PST 2025
Author: 黃國庭
Date: 2025-12-19T17:08:25Z
New Revision: fa0c4334bd522144da12a55ad690cded76bbebd1
URL: https://github.com/llvm/llvm-project/commit/fa0c4334bd522144da12a55ad690cded76bbebd1
DIFF: https://github.com/llvm/llvm-project/commit/fa0c4334bd522144da12a55ad690cded76bbebd1.diff
LOG: [X86] Attempt to use VPMADD52L/VPMULUDQ instead of VPMULLQ on slow VPMULLQ targets (or when VPMULLQ is unavailable) (#171760)
This pull request introduces a new tuning flag "TuningSlowPMULLQ" and
uses it to optimize 64-bit vector multiplication on Intel targets where
"VPMULLQ" is slow.
On recent Intel microarchitectures, the "VPMULLQ" instruction has a
high latency of 15 cycles. In contrast, the "VPMADD52LUQ" instruction
(available via AVX512IFMA) performs a similar operation with a latency
of only 4 cycles.
Reference data from uops.info (Ice Lake):
"VPMULLQ": Latency 15, TP 1.5
"VPMADD52LUQ": Latency 4, TP 0.5
Fixes #158854
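For context, a minimal scalar sketch (not part of the patch) of the per-lane
behaviour of VPMADD52LUQ illustrates why the rewrite is legal when both
operands and their product are known to fit in 52 bits: with a zero addend,
the low-52-bit multiply-accumulate is identical to a plain 64-bit multiply.
The helper name is made up for illustration; the constants mirror the masks
used in the test below.

// Per-lane model of VPMADD52LUQ: Acc + (low 52 bits of (A[51:0] * B[51:0])).
#include <cassert>
#include <cstdint>

static uint64_t madd52lo_lane(uint64_t Acc, uint64_t A, uint64_t B) {
  const uint64_t Mask52 = (1ULL << 52) - 1;
  unsigned __int128 Prod = (unsigned __int128)(A & Mask52) * (B & Mask52);
  return Acc + (uint64_t)(Prod & Mask52);
}

int main() {
  uint64_t A = 8589934591ULL; // 2^33 - 1, as masked in test_mul_52bit_fits
  uint64_t B = 524287ULL;     // 2^19 - 1, so the product stays below 2^52
  assert(madd52lo_lane(0, A, B) == A * B); // zero addend => plain multiply
  return 0;
}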
Added:
llvm/test/CodeGen/X86/slow-pmullq.ll
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8f29a64d58194..4a3dc17263402 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -485,6 +485,9 @@ def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
"PMULLD instruction is slow (compared to PMULLW/PMULHW and PMULUDQ)">;
+def TuningSlowPMULLQ : SubtargetFeature<"slow-pmullq", "IsPMULLQSlow", "true",
+ "PMULLQ instruction is slow">;
+
def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
"true",
"PMADDWD is slower than PMULLD">;
@@ -1065,7 +1068,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningSlowPMULLQ];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1094,7 +1098,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningSlowPMULLQ];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1291,7 +1296,8 @@ def ProcessorFeatures {
FeatureWAITPKG];
list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
TuningPreferMovmskOverVTest,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningSlowPMULLQ];
list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps];
list<SubtargetFeature> ADLTuning =
!listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 558728f12754b..68d5e6ce199bc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49986,6 +49986,40 @@ static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineMulToPMADD52(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ // Only optimize vXi64 when the standard PMULLQ instruction is slow.
+ if (VT.getScalarType() != MVT::i64 || !Subtarget.isPMULLQSlow())
+ return SDValue();
+ // Check hardware support:
+ // 512-bit vectors (v8i64) require AVX512-IFMA.
+ // 128/256-bit vectors (v2i64/v4i64) require either AVX512-IFMA + VLX, or
+ // AVX-IFMA.
+ bool Supported512 = (VT == MVT::v8i64) && Subtarget.hasIFMA();
+ bool SupportedSmall =
+ (VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ ((Subtarget.hasIFMA() && Subtarget.hasVLX()) || Subtarget.hasAVXIFMA());
+
+ if (!Supported512 && !SupportedSmall)
+ return SDValue();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ // Use KnownBits analysis to verify if the high bits are zero.
+ KnownBits Known0 = DAG.computeKnownBits(Op0);
+ KnownBits Known1 = DAG.computeKnownBits(Op1);
+ KnownBits KnownMul = KnownBits::mul(Known0, Known1, Op0 == Op1);
+ // If inputs and the result fit in 52 bits, VPMADD52L is safe to use.
+ // We pass a zero vector as the addend since we only need the multiply result.
+ if (Known0.countMaxActiveBits() <= 52 && Known1.countMaxActiveBits() <= 52 &&
+ KnownMul.countMaxActiveBits() <= 52) {
+ SDValue Zero = getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
+ return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Op0, Op1, Zero);
+ }
+ return SDValue();
+}
+
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -49998,6 +50032,9 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
return V;
+ if (SDValue V = combineMulToPMADD52(N, DL, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DL, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/slow-pmullq.ll b/llvm/test/CodeGen/X86/slow-pmullq.ll
new file mode 100644
index 0000000000000..501bdf1761366
--- /dev/null
+++ b/llvm/test/CodeGen/X86/slow-pmullq.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=cannonlake | FileCheck %s --check-prefix=CNL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=cannonlake -mattr=-avx512vl | FileCheck %s --check-prefix=NOVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512ifma,+avx512dq,+avx512vl,+slow-pmullq | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512ifma,+avx512dq,-avx512vl,+slow-pmullq | FileCheck %s --check-prefix=GENERIC-NOVLX
+
+; ============================================================================
+; Case 1: 52-bit Optimization (vpmadd52luq)
+; ============================================================================
+
+define <8 x i64> @test_mul_52bit_fits(<8 x i64> %a, <8 x i64> %b) {
+; CNL-LABEL: test_mul_52bit_fits:
+; CNL: # %bb.0:
+; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
+; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CNL-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_52bit_fits:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
+; NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_52bit_fits:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
+; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; GENERIC-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_52bit_fits:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
+; GENERIC-NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; GENERIC-NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; GENERIC-NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; GENERIC-NOVLX-NEXT: retq
+ %a_masked = and <8 x i64> %a, splat (i64 8589934591)
+ %b_masked = and <8 x i64> %b, splat (i64 524287)
+
+ %res = mul <8 x i64> %a_masked, %b_masked
+ ret <8 x i64> %res
+}
+
+; ============================================================================
+; Case 1.5: Non-constant test (using Logical Shift Right to clear high bits)
+; ============================================================================
+
+define <8 x i64> @test_mul_shift_high_bits(<8 x i64> %a, <8 x i64> %b) {
+; CNL-LABEL: test_mul_shift_high_bits:
+; CNL: # %bb.0:
+; CNL-NEXT: vpsrlq $31, %zmm0, %zmm2
+; CNL-NEXT: vpsrlq $45, %zmm1, %zmm1
+; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CNL-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_shift_high_bits:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpsrlq $31, %zmm0, %zmm2
+; NOVLX-NEXT: vpsrlq $45, %zmm1, %zmm1
+; NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_shift_high_bits:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlq $31, %zmm0, %zmm2
+; GENERIC-NEXT: vpsrlq $45, %zmm1, %zmm1
+; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; GENERIC-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_shift_high_bits:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpsrlq $31, %zmm0, %zmm2
+; GENERIC-NOVLX-NEXT: vpsrlq $45, %zmm1, %zmm1
+; GENERIC-NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; GENERIC-NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; GENERIC-NOVLX-NEXT: retq
+ %a_shifted = lshr <8 x i64> %a, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
+ %b_shifted = lshr <8 x i64> %b, <i64 45, i64 45, i64 45, i64 45, i64 45, i64 45, i64 45, i64 45>
+
+ %res = mul <8 x i64> %a_shifted, %b_shifted
+ ret <8 x i64> %res
+}
+
+; ============================================================================
+; Case 2: 32-bit Optimization (vpmuludq)
+; ============================================================================
+
+define <8 x i64> @test_mul_32bit_fits(<8 x i64> %a, <8 x i64> %b) {
+; CNL-LABEL: test_mul_32bit_fits:
+; CNL: # %bb.0:
+; CNL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_32bit_fits:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_32bit_fits:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_32bit_fits:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; GENERIC-NOVLX-NEXT: retq
+
+ %a_masked = and <8 x i64> %a, splat (i64 4294967295)
+ %b_masked = and <8 x i64> %b, splat (i64 4294967295)
+
+ %res = mul <8 x i64> %a_masked, %b_masked
+ ret <8 x i64> %res
+}
+
+; ============================================================================
+; Case 3: No Optimization (Full 64-bit)
+; ============================================================================
+
+define <8 x i64> @test_mul_full_64bit(<8 x i64> %a, <8 x i64> %b) {
+; CNL-LABEL: test_mul_full_64bit:
+; CNL: # %bb.0:
+; CNL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_full_64bit:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_full_64bit:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_full_64bit:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; GENERIC-NOVLX-NEXT: retq
+ %res = mul <8 x i64> %a, %b
+ ret <8 x i64> %res
+}
+
+; ============================================================================
+; Case 4: Vector Width Variety (Check 256-bit / YMM)
+; ============================================================================
+
+define <4 x i64> @test_mul_52bit_ymm(<4 x i64> %a, <4 x i64> %b) {
+; CNL-LABEL: test_mul_52bit_ymm:
+; CNL: # %bb.0:
+; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
+; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CNL-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_52bit_ymm:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8589934591,8589934591,8589934591,8589934591]
+; NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [524287,524287,524287,524287]
+; NOVLX-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_52bit_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
+; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
+; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; GENERIC-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_52bit_ymm:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8589934591,8589934591,8589934591,8589934591]
+; GENERIC-NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; GENERIC-NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [524287,524287,524287,524287]
+; GENERIC-NOVLX-NEXT: vpand %ymm2, %ymm1, %ymm1
+; GENERIC-NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; GENERIC-NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; GENERIC-NOVLX-NEXT: retq
+
+ %a_masked = and <4 x i64> %a, splat (i64 8589934591)
+ %b_masked = and <4 x i64> %b, splat (i64 524287)
+
+ %res = mul <4 x i64> %a_masked, %b_masked
+ ret <4 x i64> %res
+}
+
+; ============================================================================
+; Case 1.5: 32-bit Signed Optimization (vpmuldq)
+; ============================================================================
+
+define <8 x i64> @test_mul_32bit_signed(<8 x i32> %a, <8 x i32> %b) {
+; CNL-LABEL: test_mul_32bit_signed:
+; CNL: # %bb.0:
+; CNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; CNL-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
+; CNL-NEXT: retq
+;
+; NOVLX-LABEL: test_mul_32bit_signed:
+; NOVLX: # %bb.0:
+; NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; NOVLX-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
+; NOVLX-NEXT: retq
+;
+; GENERIC-LABEL: test_mul_32bit_signed:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; GENERIC-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
+; GENERIC-NEXT: retq
+;
+; GENERIC-NOVLX-LABEL: test_mul_32bit_signed:
+; GENERIC-NOVLX: # %bb.0:
+; GENERIC-NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; GENERIC-NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; GENERIC-NOVLX-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
+; GENERIC-NOVLX-NEXT: retq
+ %a_ = sext <8 x i32> %a to <8 x i64>
+ %b_ = sext <8 x i32> %b to <8 x i64>
+
+ %res = mul <8 x i64> %a_, %b_
+ ret <8 x i64> %res
+}