[clang] [X86][Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX/AVX512 IFMA madd52 intrinsics to be used in constexpr (PR #161056)

NagaChaitanya Vellanki via cfe-commits cfe-commits at lists.llvm.org
Sun Sep 28 03:08:41 PDT 2025


https://github.com/chaitanyav updated https://github.com/llvm/llvm-project/pull/161056

>From bc897b3e9806bbafdd67e8ec75d43847c3553454 Mon Sep 17 00:00:00 2001
From: NagaChaitanya Vellanki <pnagato at protonmail.com>
Date: Sun, 28 Sep 2025 00:29:57 -0700
Subject: [PATCH] [X86][Clang] VectorExprEvaluator::VisitCallExpr /
 InterpretBuiltin - Allow AVX/AVX512 IFMA madd52 intrinsics to be used in
 constexpr

Resolves #160498
---
 clang/include/clang/Basic/BuiltinsX86.td      |  42 +--
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  22 ++
 clang/lib/AST/ExprConstant.cpp                |  51 ++++
 clang/lib/Headers/avx512ifmaintrin.h          |  71 ++---
 clang/lib/Headers/avxifmaintrin.h             |  18 +-
 .../test/AST/ByteCode/x86-ifma-constexpr.cpp  |  36 +++
 clang/test/Sema/x86-ifma-constexpr.cpp        | 262 ++++++++++++++++++
 7 files changed, 443 insertions(+), 59 deletions(-)
 create mode 100644 clang/test/AST/ByteCode/x86-ifma-constexpr.cpp
 create mode 100644 clang/test/Sema/x86-ifma-constexpr.cpp

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 77e599587edc3..a5247629e255f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2101,27 +2101,6 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
   def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
 }
 
-let Features = "avx512ifma", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
-  def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
-}
-
 let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vcomisd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>, _Constant int, _Constant int)">;
   def vcomiss : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>, _Constant int, _Constant int)">;
@@ -3128,6 +3107,27 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in {
   def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">;
 }
 
+let Features = "avx512ifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+  def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+}
+
 let Features = "avx512dq", Attributes = [NoThrow, Const] in {
   def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">;
   def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 891344d4e6ed0..cf6a739cc5c60 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3564,6 +3564,28 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return F;
         });
 
+  // VPMADD52LUQ: Dst = A + ZeroExtend64(lo52(B[51:0] * C[51:0])). A is the
+  // 64-bit accumulator and is added unmasked (wrapping modulo 2^64).
+  case X86::BI__builtin_ia32_vpmadd52luq128:
+  case X86::BI__builtin_ia32_vpmadd52luq256:
+  case X86::BI__builtin_ia32_vpmadd52luq512:
+    return interp__builtin_elementwise_triop(
+        S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) {
+          // A 52-bit multiply keeps exactly the low 52 bits of the product.
+          return A + (B.trunc(52) * C.trunc(52)).zext(64);
+        });
+  // VPMADD52HUQ: Dst = A + ZeroExtend64(hi52(B[51:0] * C[51:0])), i.e. bits
+  // [103:52] of the 104-bit product are added to the 64-bit accumulator.
+  case X86::BI__builtin_ia32_vpmadd52huq128:
+  case X86::BI__builtin_ia32_vpmadd52huq256:
+  case X86::BI__builtin_ia32_vpmadd52huq512:
+    return interp__builtin_elementwise_triop(
+        S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) {
+          // mulhu on 52-bit operands yields the upper 52 bits of the product.
+          APInt Hi = llvm::APIntOps::mulhu(B.trunc(52), C.trunc(52));
+          return A + APSInt(Hi.zext(64), A.isUnsigned());
+        });
+
   case X86::BI__builtin_ia32_vpshldd128:
   case X86::BI__builtin_ia32_vpshldd256:
   case X86::BI__builtin_ia32_vpshldd512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b706b14945b6d..c9d8a2b01dd74 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11869,6 +11869,55 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case X86::BI__builtin_ia32_vpmadd52luq128:
+  case X86::BI__builtin_ia32_vpmadd52luq256:
+  case X86::BI__builtin_ia32_vpmadd52luq512: {
+    APValue A, B, C;
+    if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
+        !EvaluateAsRValue(Info, E->getArg(1), B) ||
+        !EvaluateAsRValue(Info, E->getArg(2), C))
+      return false;
+
+    unsigned ALen = A.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(ALen);
+
+    for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) {
+      APInt AElt = A.getVectorElt(EltNum).getInt();
+      APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52);
+      APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52);
+      // Dst = A + ZeroExtend64(lo52(B[51:0] * C[51:0])); the add wraps.
+      APInt ResElt = AElt + (BElt * CElt).zext(64);
+      ResultElements.push_back(APValue(APSInt(ResElt, /*isUnsigned=*/false)));
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+  case X86::BI__builtin_ia32_vpmadd52huq128:
+  case X86::BI__builtin_ia32_vpmadd52huq256:
+  case X86::BI__builtin_ia32_vpmadd52huq512: {
+    APValue A, B, C;
+    if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
+        !EvaluateAsRValue(Info, E->getArg(1), B) ||
+        !EvaluateAsRValue(Info, E->getArg(2), C))
+      return false;
+
+    unsigned ALen = A.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(ALen);
+
+    for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) {
+      APInt AElt = A.getVectorElt(EltNum).getInt();
+      APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52);
+      APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52);
+      // Dst = A + ZeroExtend64(hi52(B[51:0] * C[51:0])); mulhu on 52-bit
+      // operands yields bits [103:52] of the 104-bit product.
+      APInt ResElt = AElt + llvm::APIntOps::mulhu(BElt, CElt).zext(64);
+      ResultElements.push_back(APValue(APSInt(ResElt, /*isUnsigned=*/false)));
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   case clang::X86::BI__builtin_ia32_vprotbi:
   case clang::X86::BI__builtin_ia32_vprotdi:
   case clang::X86::BI__builtin_ia32_vprotqi:
diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h
index f01b322ce7787..6d800f25e5798 100644
--- a/clang/lib/Headers/avx512ifmaintrin.h
+++ b/clang/lib/Headers/avx512ifmaintrin.h
@@ -19,52 +19,55 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"),     \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
-                                                (__v8di) __Z);
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di)__X, (__v8di)__Y,
+                                                (__v8di)__Z);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
-                                   (__v8di)__W);
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X,
+                           __m512i __Y) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      __M, (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
-                                   (__v8di)_mm512_setzero_si512());
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
+                            __m512i __Z) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      __M, (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
+      (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
-                                                (__v8di) __Z);
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di)__X, (__v8di)__Y,
+                                                (__v8di)__Z);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
-                                   (__v8di)__W);
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X,
+                           __m512i __Y) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      __M, (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
-  return (__m512i)__builtin_ia32_selectq_512(__M,
-                                   (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
-                                   (__v8di)_mm512_setzero_si512());
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
+                            __m512i __Z) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      __M, (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
+      (__v8di)_mm512_setzero_si512());
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
index 5c782d2a5b865..1a9aaaf53affa 100644
--- a/clang/lib/Headers/avxifmaintrin.h
+++ b/clang/lib/Headers/avxifmaintrin.h
@@ -22,6 +22,14 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avxifma"),        \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 // must vex-encoding
 
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
@@ -55,7 +63,7 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
   return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
                                                 (__v2di)__Z);
@@ -92,7 +100,7 @@ _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
   return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
                                                 (__v4di)__Z);
@@ -129,7 +137,7 @@ _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
   return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
                                                 (__v2di)__Z);
@@ -166,12 +174,14 @@ _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
   return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
                                                 (__v4di)__Z);
 }
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
 
 #endif // __AVXIFMAINTRIN_H
diff --git a/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp b/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp
new file mode 100644
index 0000000000000..72dde7d27b3ec
--- /dev/null
+++ b/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++2a -fsyntax-only \
+// RUN:   -triple x86_64-unknown-unknown -target-feature +avxifma -ffreestanding \
+// RUN:   -verify %s
+
+// Test constexpr evaluation of X86 IFMA intrinsics with the ByteCode interpreter.
+
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+
+// Declare required IFMA builtin functions.
+extern "C" {
+__m128i __builtin_ia32_vpmadd52luq128(__m128i, __m128i, __m128i);
+__m128i __builtin_ia32_vpmadd52huq128(__m128i, __m128i, __m128i);
+}
+
+// Intrinsic wrapper functions.
+static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  return __builtin_ia32_vpmadd52luq128(__X, __Y, __Z);
+}
+
+// Simple test to check if IFMA intrinsics can be used in constexpr context
+constexpr bool test_basic_ifma() {
+  __m128i a = (__m128i){5ULL, 3ULL};
+  __m128i b = (__m128i){7ULL, 4ULL};
+  __m128i c = (__m128i){2ULL, 1ULL};
+
+  // Just test that we can call the intrinsic in constexpr context
+  __m128i result = _mm_madd52lo_epu64(a, b, c);
+  (void)result; // Suppress unused variable warning
+  return true;
+}
+
+// Basic test to verify constexpr evaluation works
+static_assert(test_basic_ifma(), "Basic IFMA constexpr test failed");
+
+// expected-no-diagnostics
diff --git a/clang/test/Sema/x86-ifma-constexpr.cpp b/clang/test/Sema/x86-ifma-constexpr.cpp
new file mode 100644
index 0000000000000..eb1fca13dfea5
--- /dev/null
+++ b/clang/test/Sema/x86-ifma-constexpr.cpp
@@ -0,0 +1,262 @@
+// RUN: %clang_cc1 -std=c++2a -fsyntax-only -triple x86_64-unknown-unknown \
+// RUN:   -target-feature +avxifma -ffreestanding -verify %s
+
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
+typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));
+
+typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
+typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
+typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
+
+extern "C" {
+__m128i __builtin_ia32_vpmadd52luq128(__m128i, __m128i, __m128i);
+__m128i __builtin_ia32_vpmadd52huq128(__m128i, __m128i, __m128i);
+__m256i __builtin_ia32_vpmadd52luq256(__m256i, __m256i, __m256i);
+__m256i __builtin_ia32_vpmadd52huq256(__m256i, __m256i, __m256i);
+__m512i __builtin_ia32_vpmadd52luq512(__m512i, __m512i, __m512i);
+__m512i __builtin_ia32_vpmadd52huq512(__m512i, __m512i, __m512i);
+}
+static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  return __builtin_ia32_vpmadd52luq128(__X, __Y, __Z);
+}
+
+static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  return __builtin_ia32_vpmadd52huq128(__X, __Y, __Z);
+}
+
+static constexpr __inline__ __m256i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  return __builtin_ia32_vpmadd52luq256(__X, __Y, __Z);
+}
+
+static constexpr __inline__ __m256i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  return __builtin_ia32_vpmadd52huq256(__X, __Y, __Z);
+}
+
+static constexpr __inline__ __m512i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return __builtin_ia32_vpmadd52luq512(__X, __Y, __Z);
+}
+
+static constexpr __inline__ __m512i __attribute__((__always_inline__, __nodebug__, __target__("avxifma")))
+_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return __builtin_ia32_vpmadd52huq512(__X, __Y, __Z);
+}
+
+#define TEST_CONSTEXPR(expr) static_assert(expr, "constexpr test failed")
+
+constexpr bool match_v2du(__m128i result, unsigned long long e0, unsigned long long e1) {
+  __v2du v = (__v2du)result;
+  return v[0] == e0 && v[1] == e1;
+}
+
+constexpr bool match_v4du(__m256i result, unsigned long long e0, unsigned long long e1,
+                          unsigned long long e2, unsigned long long e3) {
+  __v4du v = (__v4du)result;
+  return v[0] == e0 && v[1] == e1 && v[2] == e2 && v[3] == e3;
+}
+
+constexpr bool match_v8du(__m512i result, unsigned long long e0, unsigned long long e1,
+                          unsigned long long e2, unsigned long long e3,
+                          unsigned long long e4, unsigned long long e5,
+                          unsigned long long e6, unsigned long long e7) {
+  __v8du v = (__v8du)result;
+  return v[0] == e0 && v[1] == e1 && v[2] == e2 && v[3] == e3 &&
+         v[4] == e4 && v[5] == e5 && v[6] == e6 && v[7] == e7;
+}
+
+constexpr unsigned long long compute_madd52lo_manual(
+    unsigned long long a, unsigned long long b, unsigned long long c) {
+  constexpr unsigned long long mask52 = 0x000FFFFFFFFFFFFFULL;
+  return a + (((b & mask52) * (c & mask52)) & mask52); // a + lo52(b52 * c52)
+}
+
+constexpr unsigned long long compute_madd52hi_manual(
+    unsigned long long a, unsigned long long b, unsigned long long c) {
+  constexpr unsigned long long mask52 = 0x000FFFFFFFFFFFFFULL; // 104-bit product below
+  return a + (unsigned long long)(((unsigned __int128)(b & mask52) * (c & mask52)) >> 52);
+}
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){5, 3},
+    (__m128i)(__v2du){7, 4},
+    (__m128i)(__v2du){2, 1}),
+    19, 7));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){5, 3, 2, 6},
+    (__m256i)(__v4du){7, 4, 8, 3},
+    (__m256i)(__v4du){2, 1, 5, 4}),
+    19, 7, 42, 18));
+
+TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64(
+    (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+    (__m512i)(__v8du){8, 7, 6, 5, 4, 3, 2, 1},
+    (__m512i)(__v8du){2, 2, 2, 2, 2, 2, 2, 2}),
+    17, 16, 15, 14, 13, 12, 11, 10));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){0, 0}),
+    0, 0));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){0, 0, 0, 0}),
+    0, 0, 0, 0));
+
+TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64(
+    (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}),
+    0, 0, 0, 0, 0, 0, 0, 0));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){123, 456},
+    (__m128i)(__v2du){42, 73}),
+    5166, 33288));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){123, 456, 789, 321},
+    (__m256i)(__v4du){42, 73, 11, 99}),
+    5166, 33288, 8679, 31779));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){5, 3},
+    (__m128i)(__v2du){7, 4},
+    (__m128i)(__v2du){0, 0}),
+    5, 3));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){5, 3, 7, 2},
+    (__m256i)(__v4du){7, 4, 3, 8},
+    (__m256i)(__v4du){0, 0, 0, 0}),
+    5, 3, 7, 2));
+// Products of exactly 2^52 (or 2^53): the high half is 1 (or 2), the low half 0.
+TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){0x0008000000000000ULL, 0x0004000000000000ULL},
+    (__m128i)(__v2du){2, 4}),
+    1, 1));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){0x0008000000000000ULL, 0x0004000000000000ULL, 0x0008000000000000ULL, 0x0002000000000000ULL},
+    (__m256i)(__v4du){2, 4, 4, 8}),
+    1, 1, 2, 1));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){0x0008000000000000ULL, 0x0004000000000000ULL},
+    (__m128i)(__v2du){2, 4}),
+    0, 0));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){0x0008000000000000ULL, 0x0004000000000000ULL, 0x0008000000000000ULL, 0x0002000000000000ULL},
+    (__m256i)(__v4du){2, 4, 4, 8}),
+    0, 0, 0, 0));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL},
+    (__m128i)(__v2du){1, 2},
+    (__m128i)(__v2du){0, 1}),
+    compute_madd52lo_manual(0x000FFFFFFFFFFFFFULL, 1, 0),
+    compute_madd52lo_manual(0x000FFFFFFFFFFFFFULL, 2, 1)));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64(
+    (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL},
+    (__m128i)(__v2du){1, 2},
+    (__m128i)(__v2du){0, 1}),
+    compute_madd52hi_manual(0x000FFFFFFFFFFFFFULL, 1, 0),
+    compute_madd52hi_manual(0x000FFFFFFFFFFFFFULL, 2, 1)));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){123456789, 987654321},
+    (__m128i)(__v2du){1, 1}),
+    123456789, 987654321));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){123456789, 987654321, 555666777, 111222333},
+    (__m256i)(__v4du){1, 1, 1, 1}),
+    123456789, 987654321, 555666777, 111222333));
+
+TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64(
+    (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v8du){123456789, 987654321, 555666777, 111222333, 444555666, 777888999, 100200300, 999888777},
+    (__m512i)(__v8du){1, 1, 1, 1, 1, 1, 1, 1}),
+    123456789, 987654321, 555666777, 111222333, 444555666, 777888999, 100200300, 999888777));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){123, 456},
+    (__m128i)(__v2du){789, 321},
+    (__m128i)(__v2du){100, 200}),
+    compute_madd52lo_manual(123, 789, 100), compute_madd52lo_manual(456, 321, 200)));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){123, 456, 789, 321},
+    (__m256i)(__v4du){789, 321, 123, 456},
+    (__m256i)(__v4du){100, 200, 300, 400}),
+    compute_madd52lo_manual(123, 789, 100), compute_madd52lo_manual(456, 321, 200),
+    compute_madd52lo_manual(789, 123, 300), compute_madd52lo_manual(321, 456, 400)));
+
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){789, 321},
+    (__m128i)(__v2du){123, 456},
+    (__m128i)(__v2du){100, 200}),
+    compute_madd52lo_manual(789, 123, 100), compute_madd52lo_manual(321, 456, 200)));
+// Bits [63:52] of the multiplicands are ignored: every Y element acts as 2^51.
+TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64(
+    (__m128i)(__v2du){0, 0},
+    (__m128i)(__v2du){0x0018000000000000ULL, 0xFFF8000000000000ULL},
+    (__m128i)(__v2du){2, 6}),
+    1, 3));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64(
+    (__m256i)(__v4du){0, 0, 0, 0},
+    (__m256i)(__v4du){0x0018000000000000ULL, 0x0028000000000000ULL, 0x0048000000000000ULL, 0xFFF8000000000000ULL},
+    (__m256i)(__v4du){2, 4, 6, 8}),
+    1, 2, 3, 4));
+
+TEST_CONSTEXPR(match_v8du(_mm512_madd52hi_epu64(
+    (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+    (__m512i)(__v8du){0x0008000000000000ULL, 0x0008000000000000ULL, 0x0008000000000000ULL, 0x0008000000000000ULL,
+                      0x0008000000000000ULL, 0x0008000000000000ULL, 0x0008000000000000ULL, 0x0008000000000000ULL},
+    (__m512i)(__v8du){2, 4, 6, 8, 10, 12, 14, 16}),
+    2, 4, 6, 8, 10, 12, 14, 16));
+// Largest product: (2^52 - 1)^2 has high half 2^52 - 2; the accumulator adds 1.
+TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64(
+    (__m128i)(__v2du){1, 1},
+    (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL},
+    (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 2}),
+    0x000FFFFFFFFFFFFFULL, 2));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64(
+    (__m256i)(__v4du){1, 1, 1, 1},
+    (__m256i)(__v4du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL},
+    (__m256i)(__v4du){0x000FFFFFFFFFFFFFULL, 2, 1, 0}),
+    0x000FFFFFFFFFFFFFULL, 2, 1, 1));
+// The 64-bit accumulator sum is not masked to 52 bits: 1 + (2^52 - 1) == 2^52.
+TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64(
+    (__m128i)(__v2du){1, 1},
+    (__m128i)(__v2du){1, 1},
+    (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}),
+    0x0010000000000000ULL, 0x0010000000000000ULL));
+
+TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64(
+    (__m256i)(__v4du){1, 1, 1, 1},
+    (__m256i)(__v4du){1, 1, 1, 1},
+    (__m256i)(__v4du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}),
+    0x0010000000000000ULL, 0x0010000000000000ULL, 0x0010000000000000ULL, 0x0010000000000000ULL));
+
+// expected-no-diagnostics



More information about the cfe-commits mailing list