[llvm] [X86] Recognise VPMADD52L pattern with AVX512IFMA/AVXIFMA (#153787) (PR #156714)
Justin Riddell via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 21:15:33 PDT 2025
https://github.com/Arghnews updated https://github.com/llvm/llvm-project/pull/156714
>From d3581fa624b9b79a7d543b663184b5274fe9a7f2 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Wed, 3 Sep 2025 17:25:56 +0100
Subject: [PATCH 1/4] [X86] Recognise VPMADD52L pattern with AVX512IFMA/AVXIFMA
(#153787)
Match (X * Y) + Z in combineAdd. If the target supports VPMADD52L and the
multiply is known not to overflow 52 bits, rewrite using VPMADD52L
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 50 ++++++++
.../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 111 ++++++++++++++++++
2 files changed, 161 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 47cea933d0836..bd0ab5fe96630 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57966,6 +57966,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
Cmov.getOperand(3));
}
+static SDValue matchIntegerMultiplyAdd(SDNode *N, SelectionDAG &DAG,
+ SDValue Op0, SDValue Op1,
+ const SDLoc &DL, EVT VT,
+ const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+ if (!VT.isVector() || VT.getScalarType() != MVT::i64 ||
+ !Subtarget.hasAVX512() ||
+ (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()) ||
+ !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(X86ISD::VPMADD52L,
+ VT) ||
+ Op0.getValueType() != VT || Op1.getValueType() != VT)
+ return SDValue();
+
+ SDValue X, Y, Acc;
+ if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
+ return SDValue();
+
+ auto CheckMulOperand = [&DAG, &VT](const SDValue &M, SDValue &Xval,
+ SDValue &Yval) -> bool {
+ if (M.getOpcode() != ISD::MUL)
+ return false;
+ const SDValue A = M.getOperand(0);
+ const SDValue B = M.getOperand(1);
+ const APInt Top12Set = APInt::getHighBitsSet(64, 12);
+ if (A.getValueType() != VT || B.getValueType() != VT ||
+ !DAG.MaskedValueIsZero(A, Top12Set) ||
+ !DAG.MaskedValueIsZero(B, Top12Set) ||
+ !DAG.MaskedValueIsZero(M, Top12Set))
+ return false;
+ Xval = A;
+ Yval = B;
+ return true;
+ };
+
+ if (CheckMulOperand(Op0, X, Y)) {
+ Acc = Op1;
+ } else if (CheckMulOperand(Op1, X, Y)) {
+ Acc = Op0;
+ } else {
+ return SDValue();
+ }
+
+ return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -58069,6 +58114,11 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
+ if (SDValue node =
+ matchIntegerMultiplyAdd(N, DAG, Op0, Op1, DL, VT, Subtarget)) {
+ return node;
+ }
+
return combineAddOrSubToADCOrSBB(N, DL, DAG);
}
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
new file mode 100644
index 0000000000000..6a37b1b814cdc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
+
+; 67108863 == (1 << 26) - 1
+
+define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_combine_evex:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; X64-NEXT: vpandq %zmm3, %zmm0, %zmm0
+; X64-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; X64-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; X64-NOT: vpmul
+; X64-NOT: vpadd
+; X64-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
+; X64-NEXT: retq
+ %4 = and <8 x i64> %0, splat (i64 67108863)
+ %5 = and <8 x i64> %1, splat (i64 67108863)
+ %6 = and <8 x i64> %2, splat (i64 67108863)
+ %7 = mul nuw nsw <8 x i64> %5, %4
+ %8 = add nuw nsw <8 x i64> %7, %6
+ ret <8 x i64> %8
+}
+
+define dso_local <8 x i64> @fff(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+ %4 = and <8 x i64> %0, splat (i64 67108863)
+ %5 = and <8 x i64> %1, splat (i64 67108863)
+ %6 = and <8 x i64> %2, splat (i64 67108863)
+ %7 = mul nuw nsw <8 x i64> %5, %4
+ %8 = mul nuw nsw <8 x i64> %7, %6
+ %9 = add nuw nsw <8 x i64> %8, %7
+ ret <8 x i64> %9
+}
+
+define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_no_combine_evex:
+; X64: # %bb.0:
+; X64-NOT: vpmadd52
+; X64-NEXT: vpmullq %zmm0, %zmm1, %zmm0
+; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; X64-NEXT: retq
+ %4 = mul <8 x i64> %1, %0
+ %5 = add <8 x i64> %4, %2
+ ret <8 x i64> %5
+}
+
+define dso_local <4 x i64> @test_256_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
+; X64-LABEL: test_256_combine_evex:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; X64-NEXT: vpand %ymm3, %ymm0, %ymm0
+; X64-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-NOT: vpmul
+; X64-NOT: vpadd
+; X64-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0
+; X64-NEXT: retq
+ %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %7 = mul nuw nsw <4 x i64> %5, %4
+ %8 = add nuw nsw <4 x i64> %7, %6
+ ret <4 x i64> %8
+}
+
+define dso_local noundef <4 x i64> @test_256_no_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
+; X64-LABEL: test_256_no_combine_evex:
+; X64: # %bb.0:
+; X64-NOT: vpmadd52
+; X64-NEXT: vpmullq %ymm0, %ymm1, %ymm0
+; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %4 = mul <4 x i64> %1, %0
+ %5 = add <4 x i64> %4, %2
+ ret <4 x i64> %5
+}
+
+define dso_local <4 x i64> @test_256_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
+; X64-LABEL: test_256_combine_vex:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; X64-NEXT: vpand %ymm3, %ymm0, %ymm0
+; X64-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-NOT: vpmul
+; X64-NOT: vpadd
+; X64-NEXT: {vex} vpmadd52luq %ymm1, %ymm2, %ymm0
+; X64-NEXT: retq
+ %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+ %7 = mul nuw nsw <4 x i64> %5, %4
+ %8 = add nuw nsw <4 x i64> %7, %6
+ ret <4 x i64> %8
+}
+
+define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
+; X64-LABEL: test_256_no_combine_vex:
+; X64: # %bb.0:
+; X64-NOT: vpmadd52
+; X64-NEXT: vpmullq %ymm0, %ymm1, %ymm0
+; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %4 = mul <4 x i64> %1, %0
+ %5 = add <4 x i64> %4, %2
+ ret <4 x i64> %5
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="512" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
>From bfb34a28b8b2faf2242d3ce95419669de3d3076a Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Thu, 4 Sep 2025 19:44:09 +0100
Subject: [PATCH 2/4] Apply review feedback, simplify, add more tests
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 54 +++-----
.../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 120 ++++++++++++++++--
2 files changed, 126 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bd0ab5fe96630..b04ccb5a80aaf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27,6 +27,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -57966,47 +57967,28 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
Cmov.getOperand(3));
}
-static SDValue matchIntegerMultiplyAdd(SDNode *N, SelectionDAG &DAG,
- SDValue Op0, SDValue Op1,
- const SDLoc &DL, EVT VT,
- const X86Subtarget &Subtarget) {
+static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT, const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
- if (!VT.isVector() || VT.getScalarType() != MVT::i64 ||
- !Subtarget.hasAVX512() ||
- (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()) ||
- !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(X86ISD::VPMADD52L,
- VT) ||
- Op0.getValueType() != VT || Op1.getValueType() != VT)
+ if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
+ (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
+ return SDValue();
+
+ // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
+ if (!Subtarget.hasVLX() && VT.getSizeInBits() < 512)
return SDValue();
SDValue X, Y, Acc;
if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
return SDValue();
- auto CheckMulOperand = [&DAG, &VT](const SDValue &M, SDValue &Xval,
- SDValue &Yval) -> bool {
- if (M.getOpcode() != ISD::MUL)
- return false;
- const SDValue A = M.getOperand(0);
- const SDValue B = M.getOperand(1);
- const APInt Top12Set = APInt::getHighBitsSet(64, 12);
- if (A.getValueType() != VT || B.getValueType() != VT ||
- !DAG.MaskedValueIsZero(A, Top12Set) ||
- !DAG.MaskedValueIsZero(B, Top12Set) ||
- !DAG.MaskedValueIsZero(M, Top12Set))
- return false;
- Xval = A;
- Yval = B;
- return true;
- };
-
- if (CheckMulOperand(Op0, X, Y)) {
- Acc = Op1;
- } else if (CheckMulOperand(Op1, X, Y)) {
- Acc = Op0;
- } else {
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+ KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
+ if (KnownX.countMinLeadingZeros() < 12 ||
+ KnownY.countMinLeadingZeros() < 12 ||
+ KnownMul.countMinLeadingZeros() < 12)
return SDValue();
- }
return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
}
@@ -58114,10 +58096,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
- if (SDValue node =
- matchIntegerMultiplyAdd(N, DAG, Op0, Op1, DL, VT, Subtarget)) {
- return node;
- }
+ if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
+ return IFMA52;
return combineAddOrSubToADCOrSBB(N, DL, DAG);
}
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 6a37b1b814cdc..68822e10a656a 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
; 67108863 == (1 << 26) - 1
+; 4503599627370496 == (1 << 52)
+; 4503599627370495 == (1 << 52) - 1
define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
; X64-LABEL: test_512_combine_evex:
@@ -22,14 +24,16 @@ define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64
ret <8 x i64> %8
}
-define dso_local <8 x i64> @fff(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
- %4 = and <8 x i64> %0, splat (i64 67108863)
- %5 = and <8 x i64> %1, splat (i64 67108863)
- %6 = and <8 x i64> %2, splat (i64 67108863)
+define dso_local <8 x i64> @test_512_no_combine_evex_v2(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_no_combine_evex_v2:
+; X64-NOT: vpmadd52luq
+; X64: retq
+ %4 = and <8 x i64> %0, splat (i64 4503599627370495)
+ %5 = and <8 x i64> %1, splat (i64 4503599627370495)
+ %6 = and <8 x i64> %2, splat (i64 4503599627370495)
%7 = mul nuw nsw <8 x i64> %5, %4
- %8 = mul nuw nsw <8 x i64> %7, %6
- %9 = add nuw nsw <8 x i64> %8, %7
- ret <8 x i64> %9
+ %8 = add nuw nsw <8 x i64> %7, %6
+ ret <8 x i64> %8
}
define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
@@ -106,6 +110,100 @@ define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0
ret <4 x i64> %5
}
-attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="512" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+define i64 @scalar_no_ifma(i64 %a, i64 %b, i64 %acc) #0 {
+; X64-LABEL: scalar_no_ifma:
+; X64-NOT: vpmadd52
+; X64-NOT: vpmullq
+; X64: imulq
+; X64: ret
+entry:
+ %mul = mul i64 %a, %b
+ %res = add i64 %acc, %mul
+ ret i64 %res
+}
+
+define <8 x i64> @mixed_width_too_wide(<8 x i64> %a, <8 x i64> %b, <8 x i64> %acc) #0 {
+; X64-LABEL: mixed_width_too_wide:
+; X64-NOT: vpmadd52luq
+; X64: vpmullq
+; X64: ret
+entry:
+ ; 40-bit and 13-bit, product fits < 2^53 (NOT < 2^52)
+ %a40 = and <8 x i64> %a, splat (i64 1099511627775)
+ %b13 = and <8 x i64> %b, splat (i64 8191)
+ %mul = mul <8 x i64> %a40, %b13
+ %res = add <8 x i64> %acc, %mul
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @zext32_inputs_not_safe(<8 x i32> %ai32, <8 x i32> %bi32, <8 x i64> %acc) #0 {
+; X64-LABEL: zext32_inputs_not_safe:
+; X64: vpmul
+; X64-NOT: vpmadd52luq
+; X64: ret
+entry:
+ %a = zext <8 x i32> %ai32 to <8 x i64>
+ %b = zext <8 x i32> %bi32 to <8 x i64>
+ %mul = mul <8 x i64> %a, %b
+ %res = add <8 x i64> %acc, %mul
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
+; X64-LABEL: const_2pow51_times_2:
+; X64-NOT: vpmadd52luq
+; X64: vpaddq
+; X64: ret
+entry:
+ %a = insertelement <8 x i64> undef, i64 2251799813685248, i32 0 ; 2^51
+ %a.s = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> splat (i32 0)
+ %b = insertelement <8 x i64> undef, i64 2, i32 0
+ %b.s = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> splat (i32 0)
+ %mul = mul <8 x i64> %a.s, %b.s ; product = 2^52
+ %res = add <8 x i64> %acc, %mul ; needs full low-64 add
+ ret <8 x i64> %res
+}
+
+define <4 x i64> @safe_ifma_v4(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #1 {
+; X64-LABEL: safe_ifma_v4:
+; X64: vpmadd52luq
+; X64-NOT: vpmullq
+; X64: ret
+entry:
+ %a26 = and <4 x i64> %a, splat (i64 67108863)
+ %b26 = and <4 x i64> %b, splat (i64 67108863)
+ %mul = mul <4 x i64> %a26, %b26
+ %res = add <4 x i64> %acc, %mul
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @safe_ifma_v2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %acc) #1 {
+; X64-LABEL: safe_ifma_v2:
+; X64: vpmadd52luq
+; X64-NOT: vpmullq
+; X64: ret
+entry:
+ %a26 = and <2 x i64> %a, splat (i64 67108863)
+ %b26 = and <2 x i64> %b, splat (i64 67108863)
+ %mul = mul <2 x i64> %a26, %b26
+ %res = add <2 x i64> %acc, %mul
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @v4_no_vl_fallback(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #0 {
+; X64-LABEL: v4_no_vl_fallback:
+; X64-NOT: vpmadd52luq
+; X64: pmul
+; X64: ret
+entry:
+ %a26 = and <4 x i64> %a, splat (i64 67108863)
+ %b26 = and <4 x i64> %b, splat (i64 67108863)
+ %mul = mul <4 x i64> %a26, %b26
+ %res = add <4 x i64> %acc, %mul
+ ret <4 x i64> %res
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #3 = { "target-features"="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" }
>From 804203791b6d0765d86b085ace327f0af8f0cfe7 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Thu, 4 Sep 2025 20:03:41 +0100
Subject: [PATCH 3/4] Refactor test, remove deprecated undef
---
llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 68822e10a656a..3d655cff71198 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -155,12 +155,9 @@ define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
; X64: vpaddq
; X64: ret
entry:
- %a = insertelement <8 x i64> undef, i64 2251799813685248, i32 0 ; 2^51
- %a.s = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> splat (i32 0)
- %b = insertelement <8 x i64> undef, i64 2, i32 0
- %b.s = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> splat (i32 0)
- %mul = mul <8 x i64> %a.s, %b.s ; product = 2^52
- %res = add <8 x i64> %acc, %mul ; needs full low-64 add
+ ; product = 2^52
+ %mul = mul <8 x i64> splat(i64 2251799813685248), splat(i64 2)
+ %res = add <8 x i64> %acc, %mul ; needs full low-64 add
ret <8 x i64> %res
}
>From 6338a6149c2a36e8bb133d29d9648120168e66a6 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Fri, 5 Sep 2025 05:09:11 +0100
Subject: [PATCH 4/4] Implement SplitOpsAndApply
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++++++++++++-
llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll | 15 +++++++++++++--
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b04ccb5a80aaf..64f6b09984900 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4462,6 +4462,7 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned NumSubs = 1;
if ((CheckBWI && Subtarget.useBWIRegs()) ||
(!CheckBWI && Subtarget.useAVX512Regs())) {
+ // if (0) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -57967,6 +57968,8 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
Cmov.getOperand(3));
}
+// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
+// when the upper 12 bits of x, y and MUL(x, y) are known to be 0
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
EVT VT, const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
@@ -57990,7 +57993,16 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
KnownMul.countMinLeadingZeros() < 12)
return SDValue();
- return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
+ auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
+ ArrayRef<SDValue> SubOps) {
+ EVT SubVT = SubOps[0].getValueType();
+ assert(SubVT.getScalarSizeInBits() == 64);
+ return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[0] /*Acc*/,
+ SubOps[1] /*X*/, SubOps[2] /*Y*/);
+ };
+
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
+ /*CheckBWI*/ false);
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 3d655cff71198..93671f82d2646 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
; 67108863 == (1 << 26) - 1
@@ -200,7 +199,19 @@ entry:
ret <4 x i64> %res
}
+define <16 x i64> @v16_test_split(<16 x i64> %a, <16 x i64> %b, <16 x i64> %acc) #1 {
+; X64-LABEL: v16_test_split:
+; X64: vpmadd52luq
+; X64: vpmadd52luq
+; X64: ret
+entry:
+ %a26 = and <16 x i64> %a, splat (i64 67108863)
+ %b26 = and <16 x i64> %b, splat (i64 67108863)
+ %mul = mul <16 x i64> %a26, %b26
+ %res = add <16 x i64> %acc, %mul
+ ret <16 x i64> %res
+}
+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #3 = { "target-features"="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" }
More information about the llvm-commits
mailing list