[llvm] [X86] Recognise VPMADD52L pattern with AVX512IFMA/AVXIFMA (#153787) (PR #156714)

Justin Riddell via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 8 13:00:05 PDT 2025


https://github.com/Arghnews updated https://github.com/llvm/llvm-project/pull/156714

>From d3581fa624b9b79a7d543b663184b5274fe9a7f2 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Wed, 3 Sep 2025 17:25:56 +0100
Subject: [PATCH 1/8] [X86] Recognise VPMADD52L pattern with AVX512IFMA/AVXIFMA
 (#153787)

Match (X * Y) + Z in combineAdd. If the target supports IFMA and the
product is known not to overflow 52 bits, rewrite it using VPMADD52L.
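
A minimal sketch of the kind of IR this targets (operand names are
illustrative; the 26-bit masks are the same trick the tests below use to
guarantee the product fits in 52 bits):

  %xm  = and <8 x i64> %x, splat (i64 67108863)   ; (1 << 26) - 1
  %ym  = and <8 x i64> %y, splat (i64 67108863)
  %mul = mul nuw nsw <8 x i64> %xm, %ym           ; product fits in 52 bits
  %res = add nuw nsw <8 x i64> %mul, %z           ; becomes vpmadd52luq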
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  50 ++++++++
 .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 111 ++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 47cea933d0836..bd0ab5fe96630 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57966,6 +57966,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
                      Cmov.getOperand(3));
 }
 
+static SDValue matchIntegerMultiplyAdd(SDNode *N, SelectionDAG &DAG,
+                                       SDValue Op0, SDValue Op1,
+                                       const SDLoc &DL, EVT VT,
+                                       const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+  if (!VT.isVector() || VT.getScalarType() != MVT::i64 ||
+      !Subtarget.hasAVX512() ||
+      (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()) ||
+      !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(X86ISD::VPMADD52L,
+                                                            VT) ||
+      Op0.getValueType() != VT || Op1.getValueType() != VT)
+    return SDValue();
+
+  SDValue X, Y, Acc;
+  if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
+    return SDValue();
+
+  auto CheckMulOperand = [&DAG, &VT](const SDValue &M, SDValue &Xval,
+                                     SDValue &Yval) -> bool {
+    if (M.getOpcode() != ISD::MUL)
+      return false;
+    const SDValue A = M.getOperand(0);
+    const SDValue B = M.getOperand(1);
+    const APInt Top12Set = APInt::getHighBitsSet(64, 12);
+    if (A.getValueType() != VT || B.getValueType() != VT ||
+        !DAG.MaskedValueIsZero(A, Top12Set) ||
+        !DAG.MaskedValueIsZero(B, Top12Set) ||
+        !DAG.MaskedValueIsZero(M, Top12Set))
+      return false;
+    Xval = A;
+    Yval = B;
+    return true;
+  };
+
+  if (CheckMulOperand(Op0, X, Y)) {
+    Acc = Op1;
+  } else if (CheckMulOperand(Op1, X, Y)) {
+    Acc = Op0;
+  } else {
+    return SDValue();
+  }
+
+  return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
+}
+
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -58069,6 +58114,11 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                        Op0.getOperand(0), Op0.getOperand(2));
   }
 
+  if (SDValue node =
+          matchIntegerMultiplyAdd(N, DAG, Op0, Op1, DL, VT, Subtarget)) {
+    return node;
+  }
+
   return combineAddOrSubToADCOrSBB(N, DL, DAG);
 }
 
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
new file mode 100644
index 0000000000000..6a37b1b814cdc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
+
+; 67108863 == (1 << 26) - 1
+
+define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_combine_evex:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; X64-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; X64-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; X64-NEXT:    vpandq %zmm3, %zmm2, %zmm2
+; X64-NOT:     vpmul
+; X64-NOT:     vpadd
+; X64-NEXT:    vpmadd52luq %zmm1, %zmm2, %zmm0
+; X64-NEXT:    retq
+  %4 = and <8 x i64> %0, splat (i64 67108863)
+  %5 = and <8 x i64> %1, splat (i64 67108863)
+  %6 = and <8 x i64> %2, splat (i64 67108863)
+  %7 = mul nuw nsw <8 x i64> %5, %4
+  %8 = add nuw nsw <8 x i64> %7, %6
+  ret <8 x i64> %8
+}
+
+define dso_local <8 x i64> @fff(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+  %4 = and <8 x i64> %0, splat (i64 67108863)
+  %5 = and <8 x i64> %1, splat (i64 67108863)
+  %6 = and <8 x i64> %2, splat (i64 67108863)
+  %7 = mul nuw nsw <8 x i64> %5, %4
+  %8 = mul nuw nsw <8 x i64> %7, %6
+  %9 = add nuw nsw <8 x i64> %8, %7
+  ret <8 x i64> %9
+}
+
+define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_no_combine_evex:
+; X64:       # %bb.0:
+; X64-NOT:     vpmadd52
+; X64-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; X64-NEXT:    retq
+  %4 = mul <8 x i64> %1, %0
+  %5 = add <8 x i64> %4, %2
+  ret <8 x i64> %5
+}
+
+define dso_local <4 x i64> @test_256_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
+; X64-LABEL: test_256_combine_evex:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X64-NOT:     vpmul
+; X64-NOT:     vpadd
+; X64-NEXT:    vpmadd52luq %ymm1, %ymm2, %ymm0
+; X64-NEXT:    retq
+  %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %7 = mul nuw nsw <4 x i64> %5, %4
+  %8 = add nuw nsw <4 x i64> %7, %6
+  ret <4 x i64> %8
+}
+
+define dso_local noundef <4 x i64> @test_256_no_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
+; X64-LABEL: test_256_no_combine_evex:
+; X64:       # %bb.0:
+; X64-NOT:     vpmadd52
+; X64-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %4 = mul <4 x i64> %1, %0
+  %5 = add <4 x i64> %4, %2
+  ret <4 x i64> %5
+}
+
+define dso_local <4 x i64> @test_256_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
+; X64-LABEL: test_256_combine_vex:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X64-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; X64-NOT:     vpmul
+; X64-NOT:     vpadd
+; X64-NEXT:    {vex} vpmadd52luq %ymm1, %ymm2, %ymm0
+; X64-NEXT:    retq
+  %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
+  %7 = mul nuw nsw <4 x i64> %5, %4
+  %8 = add nuw nsw <4 x i64> %7, %6
+  ret <4 x i64> %8
+}
+
+define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
+; X64-LABEL: test_256_no_combine_vex:
+; X64:       # %bb.0:
+; X64-NOT:     vpmadd52
+; X64-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %4 = mul <4 x i64> %1, %0
+  %5 = add <4 x i64> %4, %2
+  ret <4 x i64> %5
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="512" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }

>From bfb34a28b8b2faf2242d3ce95419669de3d3076a Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Thu, 4 Sep 2025 19:44:09 +0100
Subject: [PATCH 2/8] Apply review feedback, simplify, add more tests

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  54 +++-----
 .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 120 ++++++++++++++++--
 2 files changed, 126 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bd0ab5fe96630..b04ccb5a80aaf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -57966,47 +57967,28 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
                      Cmov.getOperand(3));
 }
 
-static SDValue matchIntegerMultiplyAdd(SDNode *N, SelectionDAG &DAG,
-                                       SDValue Op0, SDValue Op1,
-                                       const SDLoc &DL, EVT VT,
-                                       const X86Subtarget &Subtarget) {
+static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
+                             EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
-  if (!VT.isVector() || VT.getScalarType() != MVT::i64 ||
-      !Subtarget.hasAVX512() ||
-      (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()) ||
-      !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(X86ISD::VPMADD52L,
-                                                            VT) ||
-      Op0.getValueType() != VT || Op1.getValueType() != VT)
+  if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
+      (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
+    return SDValue();
+
+  // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
+  if (!Subtarget.hasVLX() && VT.getSizeInBits() < 512)
     return SDValue();
 
   SDValue X, Y, Acc;
   if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
     return SDValue();
 
-  auto CheckMulOperand = [&DAG, &VT](const SDValue &M, SDValue &Xval,
-                                     SDValue &Yval) -> bool {
-    if (M.getOpcode() != ISD::MUL)
-      return false;
-    const SDValue A = M.getOperand(0);
-    const SDValue B = M.getOperand(1);
-    const APInt Top12Set = APInt::getHighBitsSet(64, 12);
-    if (A.getValueType() != VT || B.getValueType() != VT ||
-        !DAG.MaskedValueIsZero(A, Top12Set) ||
-        !DAG.MaskedValueIsZero(B, Top12Set) ||
-        !DAG.MaskedValueIsZero(M, Top12Set))
-      return false;
-    Xval = A;
-    Yval = B;
-    return true;
-  };
-
-  if (CheckMulOperand(Op0, X, Y)) {
-    Acc = Op1;
-  } else if (CheckMulOperand(Op1, X, Y)) {
-    Acc = Op0;
-  } else {
+  KnownBits KnownX = DAG.computeKnownBits(X);
+  KnownBits KnownY = DAG.computeKnownBits(Y);
+  KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
+  if (KnownX.countMinLeadingZeros() < 12 ||
+      KnownY.countMinLeadingZeros() < 12 ||
+      KnownMul.countMinLeadingZeros() < 12)
     return SDValue();
-  }
 
   return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
 }
@@ -58114,10 +58096,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                        Op0.getOperand(0), Op0.getOperand(2));
   }
 
-  if (SDValue node =
-          matchIntegerMultiplyAdd(N, DAG, Op0, Op1, DL, VT, Subtarget)) {
-    return node;
-  }
+  if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
+    return IFMA52;
 
   return combineAddOrSubToADCOrSBB(N, DL, DAG);
 }
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 6a37b1b814cdc..68822e10a656a 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
 
 ; 67108863 == (1 << 26) - 1
+; 4503599627370496 == (1 << 52)
+; 4503599627370495 == (1 << 52) - 1
 
 define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
 ; X64-LABEL: test_512_combine_evex:
@@ -22,14 +24,16 @@ define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64
   ret <8 x i64> %8
 }
 
-define dso_local <8 x i64> @fff(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
-  %4 = and <8 x i64> %0, splat (i64 67108863)
-  %5 = and <8 x i64> %1, splat (i64 67108863)
-  %6 = and <8 x i64> %2, splat (i64 67108863)
+define dso_local <8 x i64> @test_512_no_combine_evex_v2(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
+; X64-LABEL: test_512_no_combine_evex_v2:
+; X64-NOT:   vpmadd52luq
+; X64:       retq
+  %4 = and <8 x i64> %0, splat (i64 4503599627370495)
+  %5 = and <8 x i64> %1, splat (i64 4503599627370495)
+  %6 = and <8 x i64> %2, splat (i64 4503599627370495)
   %7 = mul nuw nsw <8 x i64> %5, %4
-  %8 = mul nuw nsw <8 x i64> %7, %6
-  %9 = add nuw nsw <8 x i64> %8, %7
-  ret <8 x i64> %9
+  %8 = add nuw nsw <8 x i64> %7, %6
+  ret <8 x i64> %8
 }
 
 define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
@@ -106,6 +110,100 @@ define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0
   ret <4 x i64> %5
 }
 
-attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="512" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+define i64 @scalar_no_ifma(i64 %a, i64 %b, i64 %acc) #0 {
+; X64-LABEL: scalar_no_ifma:
+; X64-NOT: vpmadd52
+; X64-NOT: vpmullq
+; X64:     imulq
+; X64:     ret
+entry:
+  %mul = mul i64 %a, %b
+  %res = add i64 %acc, %mul
+  ret i64 %res
+}
+
+define <8 x i64> @mixed_width_too_wide(<8 x i64> %a, <8 x i64> %b, <8 x i64> %acc) #0 {
+; X64-LABEL: mixed_width_too_wide:
+; X64-NOT:   vpmadd52luq
+; X64:       vpmullq
+; X64:       ret
+entry:
+  ; 40-bit and 13-bit, product fits < 2^53 (NOT < 2^52)
+  %a40 = and <8 x i64> %a, splat (i64 1099511627775)
+  %b13 = and <8 x i64> %b, splat (i64 8191)
+  %mul = mul <8 x i64> %a40, %b13
+  %res = add <8 x i64> %acc, %mul
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @zext32_inputs_not_safe(<8 x i32> %ai32, <8 x i32> %bi32, <8 x i64> %acc) #0 {
+; X64-LABEL: zext32_inputs_not_safe:
+; X64:       vpmul
+; X64-NOT:   vpmadd52luq
+; X64:       ret
+entry:
+  %a = zext <8 x i32> %ai32 to <8 x i64>
+  %b = zext <8 x i32> %bi32 to <8 x i64>
+  %mul = mul <8 x i64> %a, %b
+  %res = add <8 x i64> %acc, %mul
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
+; X64-LABEL: const_2pow51_times_2:
+; X64-NOT:   vpmadd52luq
+; X64:       vpaddq
+; X64:       ret
+entry:
+  %a = insertelement <8 x i64> undef, i64 2251799813685248, i32 0 ; 2^51
+  %a.s = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> splat (i32 0)
+  %b = insertelement <8 x i64> undef, i64 2, i32 0
+  %b.s = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> splat (i32 0)
+  %mul = mul <8 x i64> %a.s, %b.s        ; product = 2^52
+  %res = add <8 x i64> %acc, %mul        ; needs full low-64 add
+  ret <8 x i64> %res
+}
+
+define <4 x i64> @safe_ifma_v4(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #1 {
+; X64-LABEL: safe_ifma_v4:
+; X64:       vpmadd52luq
+; X64-NOT:   vpmullq
+; X64:       ret
+entry:
+  %a26 = and <4 x i64> %a, splat (i64 67108863)
+  %b26 = and <4 x i64> %b, splat (i64 67108863)
+  %mul = mul <4 x i64> %a26, %b26
+  %res = add <4 x i64> %acc, %mul
+  ret <4 x i64> %res
+}
+
+define <2 x i64> @safe_ifma_v2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %acc) #1 {
+; X64-LABEL: safe_ifma_v2:
+; X64:       vpmadd52luq
+; X64-NOT:   vpmullq
+; X64:       ret
+entry:
+  %a26 = and <2 x i64> %a, splat (i64 67108863)
+  %b26 = and <2 x i64> %b, splat (i64 67108863)
+  %mul = mul <2 x i64> %a26, %b26
+  %res = add <2 x i64> %acc, %mul
+  ret <2 x i64> %res
+}
+
+define <4 x i64> @v4_no_vl_fallback(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #0 {
+; X64-LABEL: v4_no_vl_fallback:
+; X64-NOT:   vpmadd52luq
+; X64:       pmul
+; X64:       ret
+entry:
+  %a26 = and <4 x i64> %a, splat (i64 67108863)
+  %b26 = and <4 x i64> %b, splat (i64 67108863)
+  %mul = mul <4 x i64> %a26, %b26
+  %res = add <4 x i64> %acc, %mul
+  ret <4 x i64> %res
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #3 = { "target-features"="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" }

>From 804203791b6d0765d86b085ace327f0af8f0cfe7 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Thu, 4 Sep 2025 20:03:41 +0100
Subject: [PATCH 3/8] Refactor test, remove deprecated undef

---
 llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 68822e10a656a..3d655cff71198 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -155,12 +155,9 @@ define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
 ; X64:       vpaddq
 ; X64:       ret
 entry:
-  %a = insertelement <8 x i64> undef, i64 2251799813685248, i32 0 ; 2^51
-  %a.s = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> splat (i32 0)
-  %b = insertelement <8 x i64> undef, i64 2, i32 0
-  %b.s = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> splat (i32 0)
-  %mul = mul <8 x i64> %a.s, %b.s        ; product = 2^52
-  %res = add <8 x i64> %acc, %mul        ; needs full low-64 add
+  ; product = 2^52
+  %mul = mul <8 x i64> splat(i64 2251799813685248), splat(i64 2)
+  %res = add <8 x i64> %acc, %mul    ; needs full low-64 add
   ret <8 x i64> %res
 }
 

>From 6338a6149c2a36e8bb133d29d9648120168e66a6 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Fri, 5 Sep 2025 05:09:11 +0100
Subject: [PATCH 4/8] Implement SplitOpsAndApply

---
 llvm/lib/Target/X86/X86ISelLowering.cpp        | 14 +++++++++++++-
 llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll | 15 +++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b04ccb5a80aaf..64f6b09984900 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4462,6 +4462,7 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
   unsigned NumSubs = 1;
   if ((CheckBWI && Subtarget.useBWIRegs()) ||
       (!CheckBWI && Subtarget.useAVX512Regs())) {
+    // if (0) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -57967,6 +57968,8 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
                      Cmov.getOperand(3));
 }
 
+// Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
+// when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
 static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
                              EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
@@ -57990,7 +57993,16 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
       KnownMul.countMinLeadingZeros() < 12)
     return SDValue();
 
-  return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Acc, X, Y);
+  auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
+                            ArrayRef<SDValue> SubOps) {
+    EVT SubVT = SubOps[0].getValueType();
+    assert(SubVT.getScalarSizeInBits() == 64);
+    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[0] /*Acc*/,
+                     SubOps[1] /*X*/, SubOps[2] /*Y*/);
+  };
+
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
+                          /*CheckBWI*/ false);
 }
 
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 3d655cff71198..93671f82d2646 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
 
 ; 67108863 == (1 << 26) - 1
@@ -200,7 +199,19 @@ entry:
   ret <4 x i64> %res
 }
 
+define <16 x i64> @v16_test_split(<16 x i64> %a, <16 x i64> %b, <16 x i64> %acc) #1 {
+; X64-LABEL: v16_test_split:
+; X64:       vpmadd52luq
+; X64:       vpmadd52luq
+; X64:       ret
+entry:
+  %a26 = and <16 x i64> %a, splat (i64 67108863)
+  %b26 = and <16 x i64> %b, splat (i64 67108863)
+  %mul = mul <16 x i64> %a26, %b26
+  %res = add <16 x i64> %acc, %mul
+  ret <16 x i64> %res
+}
+
 attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
 attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
 attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #3 = { "target-features"="+avx512dq,+avx512f,+avx512ifma,+avx512vl,-evex512" }

>From 1798672fac2e660d1359f761b6a0df9fa5720676 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Fri, 5 Sep 2025 15:06:27 +0100
Subject: [PATCH 5/8] Refactor and cleanup tests

Changed all constants to splats
Removed const_2pow51_times_2 as it folds away early
Removed unnecessary acc masks
Changed numbered vars to more readable names
Removed attributes and used RUN lines instead
Removed dso_local/noundef/local_unnamed_addr
---
 .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 252 +++++-------------
 1 file changed, 63 insertions(+), 189 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 93671f82d2646..f8078209cdaeb 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,217 +1,91 @@
-; RUN: llc < %s -O1 -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
 
 ; 67108863 == (1 << 26) - 1
 ; 4503599627370496 == (1 << 52)
 ; 4503599627370495 == (1 << 52) - 1
 
-define dso_local <8 x i64> @test_512_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
-; X64-LABEL: test_512_combine_evex:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
-; X64-NEXT:    vpandq %zmm3, %zmm0, %zmm0
-; X64-NEXT:    vpandq %zmm3, %zmm1, %zmm1
-; X64-NEXT:    vpandq %zmm3, %zmm2, %zmm2
-; X64-NOT:     vpmul
-; X64-NOT:     vpadd
-; X64-NEXT:    vpmadd52luq %zmm1, %zmm2, %zmm0
-; X64-NEXT:    retq
-  %4 = and <8 x i64> %0, splat (i64 67108863)
-  %5 = and <8 x i64> %1, splat (i64 67108863)
-  %6 = and <8 x i64> %2, splat (i64 67108863)
-  %7 = mul nuw nsw <8 x i64> %5, %4
-  %8 = add nuw nsw <8 x i64> %7, %6
-  ret <8 x i64> %8
-}
-
-define dso_local <8 x i64> @test_512_no_combine_evex_v2(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
-; X64-LABEL: test_512_no_combine_evex_v2:
-; X64-NOT:   vpmadd52luq
-; X64:       retq
-  %4 = and <8 x i64> %0, splat (i64 4503599627370495)
-  %5 = and <8 x i64> %1, splat (i64 4503599627370495)
-  %6 = and <8 x i64> %2, splat (i64 4503599627370495)
-  %7 = mul nuw nsw <8 x i64> %5, %4
-  %8 = add nuw nsw <8 x i64> %7, %6
-  ret <8 x i64> %8
+define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+  %x_masked = and <8 x i64> %x, splat (i64 67108863)
+  %y_masked = and <8 x i64> %y, splat (i64 67108863)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
 }
 
-define dso_local noundef <8 x i64> @test_512_no_combine_evex(<8 x i64> noundef %0, <8 x i64> noundef %1, <8 x i64> noundef %2) local_unnamed_addr #0 {
-; X64-LABEL: test_512_no_combine_evex:
-; X64:       # %bb.0:
-; X64-NOT:     vpmadd52
-; X64-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
-; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; X64-NEXT:    retq
-  %4 = mul <8 x i64> %1, %0
-  %5 = add <8 x i64> %4, %2
-  ret <8 x i64> %5
+define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+  %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
+  %y_masked = and <8 x i64> %y, splat (i64 3)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
 }
 
-define dso_local <4 x i64> @test_256_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
-; X64-LABEL: test_256_combine_evex:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
-; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; X64-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; X64-NOT:     vpmul
-; X64-NOT:     vpadd
-; X64-NEXT:    vpmadd52luq %ymm1, %ymm2, %ymm0
-; X64-NEXT:    retq
-  %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %7 = mul nuw nsw <4 x i64> %5, %4
-  %8 = add nuw nsw <4 x i64> %7, %6
-  ret <4 x i64> %8
+define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+  %x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
+  %y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
 }
 
-define dso_local noundef <4 x i64> @test_256_no_combine_evex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #1 {
-; X64-LABEL: test_256_no_combine_evex:
-; X64:       # %bb.0:
-; X64-NOT:     vpmadd52
-; X64-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; X64-NEXT:    retq
-  %4 = mul <4 x i64> %1, %0
-  %5 = add <4 x i64> %4, %2
-  ret <4 x i64> %5
+define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+  %mul = mul <8 x i64> %x, %y
+  %res = add <8 x i64> %mul, %z
+  ret <8 x i64> %res
 }
 
-define dso_local <4 x i64> @test_256_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
-; X64-LABEL: test_256_combine_vex:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
-; X64-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; X64-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; X64-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; X64-NOT:     vpmul
-; X64-NOT:     vpadd
-; X64-NEXT:    {vex} vpmadd52luq %ymm1, %ymm2, %ymm0
-; X64-NEXT:    retq
-  %4 = and <4 x i64> %0, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %5 = and <4 x i64> %1, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %6 = and <4 x i64> %2, <i64 67108863, i64 67108863, i64 67108863, i64 67108863>
-  %7 = mul nuw nsw <4 x i64> %5, %4
-  %8 = add nuw nsw <4 x i64> %7, %6
-  ret <4 x i64> %8
+define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+  %x_masked = and <4 x i64> %x, splat(i64 67108863)
+  %y_masked = and <4 x i64> %y, splat(i64 67108863)
+  %mul = mul nuw nsw <4 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <4 x i64> %z, %mul
+  ret <4 x i64> %res
 }
 
-define dso_local noundef <4 x i64> @test_256_no_combine_vex(<4 x i64> noundef %0, <4 x i64> noundef %1, <4 x i64> noundef %2) local_unnamed_addr #2 {
-; X64-LABEL: test_256_no_combine_vex:
-; X64:       # %bb.0:
-; X64-NOT:     vpmadd52
-; X64-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; X64-NEXT:    retq
-  %4 = mul <4 x i64> %1, %0
-  %5 = add <4 x i64> %4, %2
-  ret <4 x i64> %5
+define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+  %mul = mul <4 x i64> %x, %y
+  %res = add <4 x i64> %mul, %z
+  ret <4 x i64> %res
 }
 
-define i64 @scalar_no_ifma(i64 %a, i64 %b, i64 %acc) #0 {
-; X64-LABEL: scalar_no_ifma:
-; X64-NOT: vpmadd52
-; X64-NOT: vpmullq
-; X64:     imulq
-; X64:     ret
-entry:
-  %mul = mul i64 %a, %b
-  %res = add i64 %acc, %mul
-  ret i64 %res
+define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
+  %x_masked = and <2 x i64> %x, splat (i64 67108863)
+  %y_masked = and <2 x i64> %y, splat (i64 67108863)
+  %mul = mul <2 x i64> %x_masked, %y_masked
+  %res = add <2 x i64> %z, %mul
+  ret <2 x i64> %res
 }
 
-define <8 x i64> @mixed_width_too_wide(<8 x i64> %a, <8 x i64> %b, <8 x i64> %acc) #0 {
-; X64-LABEL: mixed_width_too_wide:
-; X64-NOT:   vpmadd52luq
-; X64:       vpmullq
-; X64:       ret
-entry:
-  ; 40-bit and 13-bit, product fits < 2^53 (NOT < 2^52)
-  %a40 = and <8 x i64> %a, splat (i64 1099511627775)
-  %b13 = and <8 x i64> %b, splat (i64 8191)
-  %mul = mul <8 x i64> %a40, %b13
-  %res = add <8 x i64> %acc, %mul
-  ret <8 x i64> %res
+; Sanity check we're not applying this here
+define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+  %mul = mul <1 x i64> %x, %y
+  %res = add <1 x i64> %mul, %z
+  ret <1 x i64> %res
 }
 
-define <8 x i64> @zext32_inputs_not_safe(<8 x i32> %ai32, <8 x i32> %bi32, <8 x i64> %acc) #0 {
-; X64-LABEL: zext32_inputs_not_safe:
-; X64:       vpmul
-; X64-NOT:   vpmadd52luq
-; X64:       ret
-entry:
-  %a = zext <8 x i32> %ai32 to <8 x i64>
-  %b = zext <8 x i32> %bi32 to <8 x i64>
-  %mul = mul <8 x i64> %a, %b
-  %res = add <8 x i64> %acc, %mul
-  ret <8 x i64> %res
+define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+  ; 40-bit and 13-bit, too wide
+  %x40 = and <8 x i64> %x, splat (i64 1099511627775)
+  %y13 = and <8 x i64> %y, splat (i64 8191)
+  %mul = mul <8 x i64> %x40, %y13
+  %res = add <8 x i64> %z, %mul
+  ret <8 x i64> %z
 }
 
-define <8 x i64> @const_2pow51_times_2(<8 x i64> %acc) #0 {
-; X64-LABEL: const_2pow51_times_2:
-; X64-NOT:   vpmadd52luq
-; X64:       vpaddq
-; X64:       ret
-entry:
-  ; product = 2^52
-  %mul = mul <8 x i64> splat(i64 2251799813685248), splat(i64 2)
-  %res = add <8 x i64> %acc, %mul    ; needs full low-64 add
+define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
+  %x = zext <8 x i32> %xi32 to <8 x i64>
+  %y = zext <8 x i32> %yi32 to <8 x i64>
+  %mul = mul <8 x i64> %x, %y
+  %res = add <8 x i64> %z, %mul
   ret <8 x i64> %res
 }
 
-define <4 x i64> @safe_ifma_v4(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #1 {
-; X64-LABEL: safe_ifma_v4:
-; X64:       vpmadd52luq
-; X64-NOT:   vpmullq
-; X64:       ret
-entry:
-  %a26 = and <4 x i64> %a, splat (i64 67108863)
-  %b26 = and <4 x i64> %b, splat (i64 67108863)
-  %mul = mul <4 x i64> %a26, %b26
-  %res = add <4 x i64> %acc, %mul
-  ret <4 x i64> %res
-}
-
-define <2 x i64> @safe_ifma_v2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %acc) #1 {
-; X64-LABEL: safe_ifma_v2:
-; X64:       vpmadd52luq
-; X64-NOT:   vpmullq
-; X64:       ret
-entry:
-  %a26 = and <2 x i64> %a, splat (i64 67108863)
-  %b26 = and <2 x i64> %b, splat (i64 67108863)
-  %mul = mul <2 x i64> %a26, %b26
-  %res = add <2 x i64> %acc, %mul
-  ret <2 x i64> %res
-}
-
-define <4 x i64> @v4_no_vl_fallback(<4 x i64> %a, <4 x i64> %b, <4 x i64> %acc) #0 {
-; X64-LABEL: v4_no_vl_fallback:
-; X64-NOT:   vpmadd52luq
-; X64:       pmul
-; X64:       ret
-entry:
-  %a26 = and <4 x i64> %a, splat (i64 67108863)
-  %b26 = and <4 x i64> %b, splat (i64 67108863)
-  %mul = mul <4 x i64> %a26, %b26
-  %res = add <4 x i64> %acc, %mul
-  ret <4 x i64> %res
-}
-
-define <16 x i64> @v16_test_split(<16 x i64> %a, <16 x i64> %b, <16 x i64> %acc) #1 {
-; X64-LABEL: v16_test_split:
-; X64:       vpmadd52luq
-; X64:       vpmadd52luq
-; X64:       ret
-entry:
-  %a26 = and <16 x i64> %a, splat (i64 67108863)
-  %b26 = and <16 x i64> %b, splat (i64 67108863)
-  %mul = mul <16 x i64> %a26, %b26
-  %res = add <16 x i64> %acc, %mul
+define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) {
+  %x_masked = and <16 x i64> %x, splat (i64 67108863)
+  %y_masked = and <16 x i64> %y, splat (i64 67108863)
+  %mul = mul <16 x i64> %x_masked, %y_masked
+  %res = add <16 x i64> %z, %mul
   ret <16 x i64> %res
 }
-
-attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,-avx512vl,+cmov,+crc32,+evex512,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512ifma,+avx512vl,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
-attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "target-features"="+avx,+avx2,+avx512dq,+avx512f,+avx512vl,+avxifma,+cmov,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }

>From 89bae92ae295e5b47f0580c77cb8e0f9cde1560e Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Fri, 5 Sep 2025 15:09:23 +0100
Subject: [PATCH 6/8] Refactor X86ISelLowering.cpp and add test checks

Address feedback:
Removed ValueTracking.h include (sorry, I think CLion auto-added it and I missed it)
Removed leftover comment // if (0) {
Added !Subtarget.hasAVXIFMA() guard so the combine is not missed for
128/256-bit vectors on AVXIFMA targets
Added assert message
Changed the X86ISD::VPMADD52L call to reflect the different X86 operand order
Added test CHECK lines from update_llc_test_checks.py
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  12 +-
 .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 266 ++++++++++++++++++
 2 files changed, 272 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 64f6b09984900..1102ee0b74fd4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27,7 +27,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -4462,7 +4461,6 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
   unsigned NumSubs = 1;
   if ((CheckBWI && Subtarget.useBWIRegs()) ||
       (!CheckBWI && Subtarget.useAVX512Regs())) {
-    // if (0) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -57978,7 +57976,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
     return SDValue();
 
   // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
-  if (!Subtarget.hasVLX() && VT.getSizeInBits() < 512)
+  if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
+      VT.getSizeInBits() < 512)
     return SDValue();
 
   SDValue X, Y, Acc;
@@ -57996,9 +57995,10 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
   auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
                             ArrayRef<SDValue> SubOps) {
     EVT SubVT = SubOps[0].getValueType();
-    assert(SubVT.getScalarSizeInBits() == 64);
-    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[0] /*Acc*/,
-                     SubOps[1] /*X*/, SubOps[2] /*Y*/);
+    assert(SubVT.getScalarSizeInBits() == 64 &&
+           "Unexpected element size, only supports 64bit size");
+    return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
+                     SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
   };
 
   return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index f8078209cdaeb..54e2eb049fd63 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
@@ -7,6 +8,27 @@
 ; 4503599627370495 == (1 << 52) - 1
 
 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_combine:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 67108863)
   %y_masked = and <8 x i64> %y, splat (i64 67108863)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -15,6 +37,27 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 }
 
 define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine_v2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
+; AVX-NEXT:    vpand %ymm7, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT:    vpand %ymm7, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_combine_v2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
   %y_masked = and <8 x i64> %y, splat (i64 3)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -23,6 +66,48 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }
 
 define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm7
+; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm8
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm9
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm6
+; AVX-NEXT:    vpsrlq $32, %ymm8, %ymm8
+; AVX-NEXT:    vpmuludq %ymm3, %ymm8, %ymm8
+; AVX-NEXT:    vpsrlq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm6, %ymm1, %ymm6
+; AVX-NEXT:    vpaddq %ymm6, %ymm8, %ymm6
+; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpsrlq $32, %ymm7, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vpsrlq $32, %ymm9, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_no_combine:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm4
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm3
+; AVX512-NEXT:    vpsrlq $32, %zmm4, %zmm4
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm4, %zmm4
+; AVX512-NEXT:    vpsrlq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
   %y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -31,12 +116,72 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }
 
 define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine_v2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm6, %ymm6
+; AVX-NEXT:    vpsrlq $32, %ymm3, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm1, %ymm7
+; AVX-NEXT:    vpaddq %ymm6, %ymm7, %ymm6
+; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vpsrlq $32, %ymm2, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_no_combine_v2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm4
+; AVX512-NEXT:    vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NEXT:    vpaddq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
   %mul = mul <8 x i64> %x, %y
   %res = add <8 x i64> %mul, %z
   ret <8 x i64> %res
 }
 
 define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; AVX-LABEL: test_256_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_256_combine:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512-NOVL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_256_combine:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
   %x_masked = and <4 x i64> %x, splat(i64 67108863)
   %y_masked = and <4 x i64> %y, splat(i64 67108863)
   %mul = mul nuw nsw <4 x i64> %x_masked, %y_masked
@@ -45,12 +190,50 @@ define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
 }
 
 define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; X64-LABEL: test_256_no_combine:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; X64-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; X64-NEXT:    vpsrlq $32, %ymm1, %ymm4
+; X64-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
+; X64-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; X64-NEXT:    vpsllq $32, %ymm3, %ymm3
+; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; X64-NEXT:    retq
   %mul = mul <4 x i64> %x, %y
   %res = add <4 x i64> %mul, %z
   ret <4 x i64> %res
 }
 
 define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
+; AVX-LABEL: test_128_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_128_combine:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX512-NOVL-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX512-NOVL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512-NOVL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_128_combine:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512VL-NEXT:    retq
   %x_masked = and <2 x i64> %x, splat (i64 67108863)
   %y_masked = and <2 x i64> %y, splat (i64 67108863)
   %mul = mul <2 x i64> %x_masked, %y_masked
@@ -60,6 +243,11 @@ define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
 
 ; Sanity check we're not applying this here
 define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+; X64-LABEL: test_scalar_no_ifma:
+; X64:       # %bb.0:
+; X64-NEXT:    imulq %rsi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdx), %rax
+; X64-NEXT:    retq
   %mul = mul <1 x i64> %x, %y
   %res = add <1 x i64> %mul, %z
   ret <1 x i64> %res
@@ -67,6 +255,16 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z)
 
 define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
   ; 40-bit and 13-bit, too wide
+; AVX-LABEL: test_mixed_width_too_wide:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %ymm5, %ymm1
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_mixed_width_too_wide:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512-NEXT:    retq
   %x40 = and <8 x i64> %x, splat (i64 1099511627775)
   %y13 = and <8 x i64> %y, splat (i64 8191)
   %mul = mul <8 x i64> %x40, %y13
@@ -75,6 +273,27 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64
 }
 
 define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
+; AVX-LABEL: test_zext32_inputs_not_safe:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpmuludq %ymm5, %ymm4, %ymm4
+; AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpaddq %ymm4, %ymm2, %ymm0
+; AVX-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zext32_inputs_not_safe:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
   %x = zext <8 x i32> %xi32 to <8 x i64>
   %y = zext <8 x i32> %yi32 to <8 x i64>
   %mul = mul <8 x i64> %x, %y
@@ -83,6 +302,53 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32,
 }
 
 define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) {
+; AVX-LABEL: test_1024_combine_split:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    .cfi_offset %rbp, -16
+; AVX-NEXT:    movq %rsp, %rbp
+; AVX-NEXT:    .cfi_def_cfa_register %rbp
+; AVX-NEXT:    andq $-32, %rsp
+; AVX-NEXT:    subq $32, %rsp
+; AVX-NEXT:    vmovdqa 112(%rbp), %ymm8
+; AVX-NEXT:    vmovdqa 80(%rbp), %ymm9
+; AVX-NEXT:    vmovdqa 48(%rbp), %ymm10
+; AVX-NEXT:    vmovdqa 16(%rbp), %ymm11
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm12, %ymm3
+; AVX-NEXT:    vpand %ymm2, %ymm12, %ymm2
+; AVX-NEXT:    vpand %ymm1, %ymm12, %ymm1
+; AVX-NEXT:    vpand %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vpand %ymm7, %ymm12, %ymm7
+; AVX-NEXT:    {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
+; AVX-NEXT:    vpand %ymm6, %ymm12, %ymm3
+; AVX-NEXT:    {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
+; AVX-NEXT:    vpand %ymm5, %ymm12, %ymm2
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
+; AVX-NEXT:    vpand %ymm4, %ymm12, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
+; AVX-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX-NEXT:    vmovdqa %ymm10, %ymm1
+; AVX-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX-NEXT:    vmovdqa %ymm8, %ymm3
+; AVX-NEXT:    movq %rbp, %rsp
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    .cfi_def_cfa %rsp, 8
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_1024_combine_split:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT:    vpandq %zmm6, %zmm2, %zmm2
+; AVX512-NEXT:    vpandq %zmm6, %zmm0, %zmm0
+; AVX512-NEXT:    vpmadd52luq %zmm2, %zmm0, %zmm4
+; AVX512-NEXT:    vpandq %zmm6, %zmm3, %zmm0
+; AVX512-NEXT:    vpandq %zmm6, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm0, %zmm1, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512-NEXT:    retq
   %x_masked = and <16 x i64> %x, splat (i64 67108863)
   %y_masked = and <16 x i64> %y, splat (i64 67108863)
   %mul = mul <16 x i64> %x_masked, %y_masked

>From 9a211314476c0769ffdaf10eacfcfef1c8d7a089 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Fri, 5 Sep 2025 16:42:41 +0100
Subject: [PATCH 7/8] Add nounwind to silence cfi noise

---
 llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 54e2eb049fd63..24028c124fc3f 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -301,14 +301,11 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32,
   ret <8 x i64> %res
 }
 
-define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) {
+define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind {
 ; AVX-LABEL: test_1024_combine_split:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    pushq %rbp
-; AVX-NEXT:    .cfi_def_cfa_offset 16
-; AVX-NEXT:    .cfi_offset %rbp, -16
 ; AVX-NEXT:    movq %rsp, %rbp
-; AVX-NEXT:    .cfi_def_cfa_register %rbp
 ; AVX-NEXT:    andq $-32, %rsp
 ; AVX-NEXT:    subq $32, %rsp
 ; AVX-NEXT:    vmovdqa 112(%rbp), %ymm8
@@ -334,7 +331,6 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i
 ; AVX-NEXT:    vmovdqa %ymm8, %ymm3
 ; AVX-NEXT:    movq %rbp, %rsp
 ; AVX-NEXT:    popq %rbp
-; AVX-NEXT:    .cfi_def_cfa %rsp, 8
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_1024_combine_split:

>From 74fd607f1d2f048038e6ccb788bfbe5a2d78f0d3 Mon Sep 17 00:00:00 2001
From: Justin Riddell <arghnews at hotmail.co.uk>
Date: Mon, 8 Sep 2025 20:58:09 +0100
Subject: [PATCH 8/8] Restrict combine to power-of-2 vector widths; add tests for odd sizes
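
The X86ISelLowering change below also makes the combine bail out when the
node's total vector width is under 128 bits or not a power of two. Pieces
produced later by legalization can still be matched, which is why the
<5 x i64> and <9 x i64> checks still contain vpmadd52luq for their
power-of-2 parts. A minimal sketch of the kind of whole-vector input the
new guard rejects (hypothetical function name, mirroring the new tests
rather than adding to them):

  define <3 x i64> @odd_width_sketch(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
    %xm = and <3 x i64> %x, splat (i64 67108863)  ; keep operands within 26 bits
    %ym = and <3 x i64> %y, splat (i64 67108863)
    %mul = mul <3 x i64> %xm, %ym                 ; 192-bit vector, not a power of two
    %res = add <3 x i64> %mul, %z
    ret <3 x i64> %res
  }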

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   4 +
 .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 206 ++++++++++++++++++
 2 files changed, 210 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1102ee0b74fd4..dde5feac0fca8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57980,6 +57980,10 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
       VT.getSizeInBits() < 512)
     return SDValue();
 
+  const auto TotalSize = VT.getSizeInBits();
+  if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
+    return SDValue();
+
   SDValue X, Y, Acc;
   if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 24028c124fc3f..c9f61207d3c9c 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -351,3 +351,209 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i
   %res = add <16 x i64> %z, %mul
   ret <16 x i64> %res
 }
+
+define <1 x i64> @test_not_i1(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+; X64-LABEL: test_not_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    andl $67108863, %edi # imm = 0x3FFFFFF
+; X64-NEXT:    imulq %rdi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdx), %rax
+; X64-NEXT:    retq
+  %x_masked = and <1 x i64> %x, splat (i64 67108863)
+  %y_masked = and <1 x i64> %x, splat (i64 67108863)
+  %mul = mul <1 x i64> %x_masked, %y_masked
+  %res = add <1 x i64> %mul, %z
+  ret <1 x i64> %res
+}
+
+define <3 x i64> @test_i3(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
+; AVX-LABEL: test_i3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_i3:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_i3:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %x_masked = and <3 x i64> %x, splat (i64 67108863)
+  %y_masked = and <3 x i64> %x, splat (i64 67108863)
+  %mul = mul <3 x i64> %x_masked, %y_masked
+  %res = add <3 x i64> %mul, %z
+  ret <3 x i64> %res
+}
+
+define <5 x i64> @test_i5(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
+; AVX-LABEL: test_i5:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm2
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX-NEXT:    vmovq %rcx, %xmm3
+; AVX-NEXT:    vmovq %r9, %xmm4
+; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVX-NEXT:    vpsllq $33, %xmm4, %xmm4
+; AVX-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX-NEXT:    vmovq %xmm1, 32(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_i5:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <5 x i64> %x, splat (i64 67108863)
+  %y_masked = and <5 x i64> %x, splat (i64 67108863)
+  %mul = mul <5 x i64> %x_masked, %y_masked
+  %res = add <5 x i64> %mul, %z
+  ret <5 x i64> %res
+}
+
+define <6 x i64> @test_i6(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
+; AVX-LABEL: test_i6:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm1
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm1
+; AVX-NEXT:    vmovq %r9, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 32(%rdi)
+; AVX-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_i6:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <6 x i64> %x, splat (i64 67108863)
+  %y_masked = and <6 x i64> %x, splat (i64 67108863)
+  %mul = mul <6 x i64> %x_masked, %y_masked
+  %res = add <6 x i64> %mul, %z
+  ret <6 x i64> %res
+}
+
+define <9 x i64> @test_i9(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) {
+; AVX-LABEL: test_i9:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovq %r9, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm3
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm4
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX-NEXT:    vmovq %rcx, %xmm5
+; AVX-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
+; AVX-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX-NEXT:    vpsrlq $32, %xmm5, %xmm6
+; AVX-NEXT:    vpmuludq %xmm6, %xmm5, %xmm6
+; AVX-NEXT:    vpsllq $33, %xmm6, %xmm6
+; AVX-NEXT:    vpmuludq %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm4
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm1, %ymm3
+; AVX-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX-NEXT:    vmovdqa %ymm4, (%rdi)
+; AVX-NEXT:    vmovq %xmm2, 64(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_i9:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vmovq %r8, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %rdx, %xmm1
+; AVX512-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovq %r9, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX512-NEXT:    vmovq %rcx, %xmm3
+; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX512-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVX512-NEXT:    vpsllq $33, %xmm4, %xmm4
+; AVX512-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX512-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; AVX512-NEXT:    vpmadd52luq %zmm0, %zmm0, %zmm2
+; AVX512-NEXT:    vmovq %xmm1, 64(%rdi)
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x_masked = and <9 x i64> %x, splat (i64 67108863)
+  %y_masked = and <9 x i64> %x, splat (i64 67108863)
+  %mul = mul <9 x i64> %x_masked, %y_masked
+  %res = add <9 x i64> %mul, %z
+  ret <9 x i64> %res
+}


