[clang] [X86][Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX/AVX512 IFMA madd52 intrinsics to be used in constexpr (PR #161056)
NagaChaitanya Vellanki via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 2 22:36:03 PDT 2025
https://github.com/chaitanyav updated https://github.com/llvm/llvm-project/pull/161056
>From 83f379bd8ea7b079411a68b1827bd7130cf5ec1e Mon Sep 17 00:00:00 2001
From: NagaChaitanya Vellanki <pnagato at protonmail.com>
Date: Sun, 28 Sep 2025 00:29:57 -0700
Subject: [PATCH] [X86][Clang] VectorExprEvaluator::VisitCallExpr /
InterpretBuiltin - Allow AVX/AVX512 IFMA madd52 intrinsics to be used in
constexpr
Resolves #160498
---
clang/include/clang/Basic/BuiltinsX86.td | 14 +-
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 15 ++
clang/lib/AST/ExprConstant.cpp | 46 ++++
clang/lib/Headers/avx512ifmaintrin.h | 63 +++--
clang/lib/Headers/avx512ifmavlintrin.h | 43 +++-
clang/lib/Headers/avxifmaintrin.h | 18 +-
clang/test/CodeGen/X86/avx512ifma-builtins.c | 224 +++++++++++++++++-
.../test/CodeGen/X86/avx512ifmavl-builtins.c | 212 +++++++++++++++--
clang/test/CodeGen/X86/avxifma-builtins.c | 158 +++++++++---
9 files changed, 667 insertions(+), 126 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index e98bee28c15be..9a0919739d2bd 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2101,24 +2101,18 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">;
}
-let Features = "avx512ifma", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512ifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
}
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
-}
-
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
-let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index a2e97fcafdfef..30801b6f9060f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3523,6 +3523,21 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return F;
});
+ case X86::BI__builtin_ia32_vpmadd52luq128:
+ case X86::BI__builtin_ia32_vpmadd52luq256:
+ case X86::BI__builtin_ia32_vpmadd52luq512:
+ return interp__builtin_elementwise_triop(
+ S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) {
+ return A + (B.trunc(52) * C.trunc(52)).trunc(52).zext(64);
+ });
+ case X86::BI__builtin_ia32_vpmadd52huq128:
+ case X86::BI__builtin_ia32_vpmadd52huq256:
+ case X86::BI__builtin_ia32_vpmadd52huq512:
+ return interp__builtin_elementwise_triop(
+ S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) {
+ return A + llvm::APIntOps::mulhu(B.trunc(52), C.trunc(52)).zext(64);
+ });
+
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b706b14945b6d..93926bcf02179 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11869,6 +11869,52 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_vpmadd52luq128:
+ case X86::BI__builtin_ia32_vpmadd52luq256:
+ case X86::BI__builtin_ia32_vpmadd52luq512: {
+ APValue A, B, C;
+ if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
+ !EvaluateAsRValue(Info, E->getArg(1), B) ||
+ !EvaluateAsRValue(Info, E->getArg(2), C))
+ return false;
+
+ unsigned ALen = A.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(ALen);
+
+ for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) {
+ APInt AElt = A.getVectorElt(EltNum).getInt();
+ APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52);
+ APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52);
+ APSInt ResElt(AElt + (BElt * CElt).trunc(52).zext(64), false);
+ ResultElements.push_back(APValue(ResElt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+ case X86::BI__builtin_ia32_vpmadd52huq128:
+ case X86::BI__builtin_ia32_vpmadd52huq256:
+ case X86::BI__builtin_ia32_vpmadd52huq512: {
+ APValue A, B, C;
+ if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
+ !EvaluateAsRValue(Info, E->getArg(1), B) ||
+ !EvaluateAsRValue(Info, E->getArg(2), C))
+ return false;
+
+ unsigned ALen = A.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(ALen);
+
+ for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) {
+ APInt AElt = A.getVectorElt(EltNum).getInt();
+ APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52);
+ APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52);
+ APSInt ResElt(AElt + llvm::APIntOps::mulhu(BElt, CElt).zext(64), false);
+ ResultElements.push_back(APValue(ResElt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
case clang::X86::BI__builtin_ia32_vprotbi:
case clang::X86::BI__builtin_ia32_vprotdi:
case clang::X86::BI__builtin_ia32_vprotqi:
diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h
index f01b322ce7787..625a8ff66dc60 100644
--- a/clang/lib/Headers/avx512ifmaintrin.h
+++ b/clang/lib/Headers/avx512ifmaintrin.h
@@ -15,54 +15,53 @@
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+ constexpr \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \
+ __min_vector_width__(512)))
+#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \
__min_vector_width__(512)))
+#endif
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
- (__v8di) __Z);
+_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+ return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di)__X, (__v8di)__Y,
+ (__v8di)__Z);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
- (__v8di)__W);
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52hi_epu64(
+ __m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ __M, (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), (__v8di)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
- (__v8di)_mm512_setzero_si512());
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52hi_epu64(
+ __mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ __M, (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
+ (__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
- (__v8di) __Z);
+_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+ return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di)__X, (__v8di)__Y,
+ (__v8di)__Z);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
- (__v8di)__W);
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52lo_epu64(
+ __m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ __M, (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), (__v8di)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
- (__v8di)_mm512_setzero_si512());
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52lo_epu64(
+ __mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ __M, (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
+ (__v8di)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
index a72b56113a12b..9cfac4dd01455 100644
--- a/clang/lib/Headers/avx512ifmavlintrin.h
+++ b/clang/lib/Headers/avx512ifmavlintrin.h
@@ -15,6 +15,16 @@
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128 \
+ constexpr __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512ifma,avx512vl"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ constexpr __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512ifma,avx512vl"), \
+ __min_vector_width__(256)))
+#else
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl"), \
@@ -23,22 +33,31 @@
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl"), \
__min_vector_width__(256)))
+#endif
-#define _mm_madd52hi_epu64(X, Y, Z) \
- ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
- (__v2di)(Z)))
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
-#define _mm256_madd52hi_epu64(X, Y, Z) \
- ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
- (__v4di)(Z)))
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
-#define _mm_madd52lo_epu64(X, Y, Z) \
- ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
- (__v2di)(Z)))
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
-#define _mm256_madd52lo_epu64(X, Y, Z) \
- ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
- (__v4di)(Z)))
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
index 5c782d2a5b865..6916be7b49b15 100644
--- a/clang/lib/Headers/avxifmaintrin.h
+++ b/clang/lib/Headers/avxifmaintrin.h
@@ -22,6 +22,14 @@
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(256)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR constexpr __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR constexpr __DEFAULT_FN_ATTRS128
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
// must vex-encoding
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
@@ -55,7 +63,7 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
@@ -92,7 +100,7 @@ _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
@@ -129,7 +137,7 @@ _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
@@ -166,12 +174,14 @@ _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#endif // __AVXIFMAINTRIN_H
diff --git a/clang/test/CodeGen/X86/avx512ifma-builtins.c b/clang/test/CodeGen/X86/avx512ifma-builtins.c
index eebefb0bad4ab..9e8c5464db383 100644
--- a/clang/test/CodeGen/X86/avx512ifma-builtins.c
+++ b/clang/test/CodeGen/X86/avx512ifma-builtins.c
@@ -3,50 +3,250 @@
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-
-
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m512i test_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: test_mm512_madd52hi_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
- return _mm512_madd52hi_epu64(__X, __Y, __Z);
+ return _mm512_madd52hi_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64(
+ (__m512i)(__v8du){100, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}),
+ 100, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52hi_epu64: high bits remain in accumulator");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0,
+ 0, 0, 0, 0},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0,
+ 0, 0, 0, 0}),
+ 0xFFFFFFFFFFFFEull, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52hi_epu64: max value multiply high bits");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64(
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull}),
+ 4503599627370495ull, 4503599627370496ull,
+ 4503599627370497ull, 4503599627370498ull,
+ 4503599627370499ull, 4503599627370500ull,
+ 4503599627370501ull, 4503599627370502ull),
+ "mm512_madd52hi_epu64: eight-lane high bits with carry");
+
__m512i test_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: test_mm512_mask_madd52hi_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
- return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y);
+ return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52hi_epu64(
+ (__m512i)(__v8du){111, 222, 333, 444, 555, 666,
+ 777, 888},
+ 0x00,
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80}),
+ 111, 222, 333, 444, 555, 666, 777, 888),
+ "mm512_mask_madd52hi_epu64: zero mask preserves __W");
+
+TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52hi_epu64(
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ 0xFF,
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80}),
+ 10, 20, 30, 40, 50, 60, 70, 80),
+ "mm512_mask_madd52hi_epu64: all mask bits set");
+
__m512i test_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: test_mm512_maskz_madd52hi_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
- return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
+ return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52hi_epu64(
+ 0x00,
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800}),
+ 0, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_maskz_madd52hi_epu64: zero mask");
+
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52hi_epu64(
+ 0xFF,
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800}),
+ 1, 2, 3, 4, 5, 6, 7, 8),
+ "mm512_maskz_madd52hi_epu64: all mask bits set");
+
__m512i test_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: test_mm512_madd52lo_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
- return _mm512_madd52lo_epu64(__X, __Y, __Z);
+ return _mm512_madd52lo_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}),
+ 50, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52lo_epu64: basic multiply-add low bits");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){100, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){20, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){30, 0, 0, 0, 0, 0, 0, 0}),
+ 700, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52lo_epu64: accumulator test");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0,
+ 0, 0, 0, 0},
+ (__m512i)(__v8du){1, 0, 0, 0, 0, 0, 0, 0}),
+ 0xFFFFFFFFFFFFFull, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52lo_epu64: max 52-bit value");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){0x1F000000000000ull, 0, 0, 0,
+ 0, 0, 0, 0},
+ (__m512i)(__v8du){2, 0, 0, 0, 0, 0, 0, 0}),
+ 0xE000000000000ull, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52lo_epu64: 52-bit truncation test");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}),
+ 21, 62, 123, 204, 305, 426, 567, 728),
+ "mm512_madd52lo_epu64: eight-lane computation");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0,
+ 0, 0, 0, 0},
+ (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}),
+ 4503599627370545ull, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_madd52lo_epu64: accumulator with large value");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800},
+ (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}),
+ 210, 620, 1230, 2040, 3050, 4260, 5670, 7280),
+ "mm512_madd52lo_epu64: eight-lane with larger values");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){0x1F000000000000ull,
+ 0x1F000000000000ull, 0, 0, 0,
+ 0, 0, 0},
+ (__m512i)(__v8du){2, 3, 0, 0, 0, 0, 0, 0}),
+ 0xE000000000000ull, 0xD000000000000ull, 0, 0, 0, 0,
+ 0, 0),
+ "mm512_madd52lo_epu64: two-lane truncation test");
+
+TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64(
+ (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0},
+ (__m512i)(__v8du){0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull},
+ (__m512i)(__v8du){1, 1, 1, 1, 1, 1, 1, 1}),
+ 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull),
+ "mm512_madd52lo_epu64: all lanes max value");
+
__m512i test_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
// CHECK-LABEL: test_mm512_mask_madd52lo_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
- return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y);
+ return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52lo_epu64(
+ (__m512i)(__v8du){111, 222, 333, 444, 555, 666,
+ 777, 888},
+ 0x00,
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80}),
+ 111, 222, 333, 444, 555, 666, 777, 888),
+ "mm512_mask_madd52lo_epu64: zero mask preserves __W");
+
+TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52lo_epu64(
+ (__m512i)(__v8du){1000, 2000, 3000, 4000, 5000,
+ 6000, 7000, 8000},
+ 0xFF,
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800},
+ (__m512i)(__v8du){20, 30, 40, 50, 60, 70, 80,
+ 90}),
+ 3000, 8000, 15000, 24000, 35000, 48000, 63000,
+ 80000),
+ "mm512_mask_madd52lo_epu64: all mask bits set");
+
__m512i test_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
// CHECK-LABEL: test_mm512_maskz_madd52lo_epu64
// CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}})
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
- return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
+ return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
+
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52lo_epu64(
+ 0x00,
+ (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70,
+ 80},
+ (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}),
+ 0, 0, 0, 0, 0, 0, 0, 0),
+ "mm512_maskz_madd52lo_epu64: zero mask");
+
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52lo_epu64(
+ 0xFF,
+ (__m512i)(__v8du){100, 200, 300, 400, 500, 600,
+ 700, 800},
+ (__m512i)(__v8du){20, 30, 40, 50, 60, 70, 80,
+ 90},
+ (__m512i)(__v8du){30, 40, 50, 60, 70, 80, 90,
+ 100}),
+ 700, 1400, 2300, 3400, 4700, 6200, 7900, 9800),
+ "mm512_maskz_madd52lo_epu64: all mask bits set");
diff --git a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c
index 89108fc037520..b41c821a9049b 100644
--- a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c
@@ -3,90 +3,266 @@
// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-
-
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: test_mm_madd52hi_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
- return _mm_madd52hi_epu64(__X, __Y, __Z);
+ return _mm_madd52hi_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64(
+ (__m128i)((__v2du){100, 0}),
+ (__m128i)((__v2du){10, 0}),
+ (__m128i)((__v2du){5, 0})),
+ 100, 0),
+ "mm_madd52hi_epu64: high bits remain in accumulator");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64(
+ (__m128i)((__v2du){0, 0}),
+ (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}),
+ (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0})),
+ 0xFFFFFFFFFFFFEull, 0),
+ "mm_madd52hi_epu64: max value multiply high bits");
+
__m128i test_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
// CHECK-LABEL: test_mm_mask_madd52hi_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
- return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y);
+ return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v2di(_mm_mask_madd52hi_epu64((__m128i)((__v2du){111, 222}),
+ 0x0,
+ (__m128i)((__v2du){1, 2}),
+ (__m128i)((__v2du){10, 20})),
+ 111, 222),
+ "mm_mask_madd52hi_epu64: zero mask preserves __W");
+
+TEST_CONSTEXPR(match_v2di(_mm_mask_madd52hi_epu64((__m128i)((__v2du){10, 20}),
+ 0x2,
+ (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL})),
+ 10, 0x100000000014ULL),
+ "mm_mask_madd52hi_epu64: mask 0x2");
+
__m128i test_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: test_mm_maskz_madd52hi_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
- return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
+ return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52hi_epu64(0x3,
+ (__m128i)((__v2du){1, 2}),
+ (__m128i)((__v2du){10, 20}),
+ (__m128i)((__v2du){100, 200})),
+ 1, 2),
+ "mm_maskz_madd52hi_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52hi_epu64(0x1,
+ (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m128i)((__v2du){0, 0})),
+ 0x1000000000000ULL, 0),
+ "mm_maskz_madd52hi_epu64: mask 0x1");
+
__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: test_mm256_madd52hi_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
- return _mm256_madd52hi_epu64(__X, __Y, __Z);
+ return _mm256_madd52hi_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64(
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){5, 6, 7, 8})),
+ 100, 200, 300, 400),
+ "mm256_madd52hi_epu64: accumulator preserved");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64(
+ (__m256i)((__v4du){0, 0, 0, 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0,
+ 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0,
+ 0})),
+ 0xFFFFFFFFFFFFEull, 0, 0, 0),
+ "mm256_madd52hi_epu64: single lane max value");
+
__m256i test_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_mask_madd52hi_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
- return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y);
+ return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52hi_epu64((__m256i)((__v4du){111, 222, 333, 444}),
+ 0x0,
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40})),
+ 111, 222, 333, 444),
+ "mm256_mask_madd52hi_epu64: zero mask preserves __W");
+
+TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52hi_epu64((__m256i)((__v4du){10, 20, 30, 40}),
+ 0xA,
+ (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL,
+ 0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL,
+ 0x1000000000000ULL, 0x1000000000000ULL})),
+ 10, 0x100000000014ULL, 30, 0x100000000028ULL),
+ "mm256_mask_madd52hi_epu64: mask 0xA");
+
__m256i test_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: test_mm256_maskz_madd52hi_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
- return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
+ return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52hi_epu64(0xF,
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){100, 200, 300, 400})),
+ 1, 2, 3, 4),
+ "mm256_maskz_madd52hi_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52hi_epu64(0x5,
+ (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL,
+ 0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL,
+ 0x1000000000000ULL, 0x1000000000000ULL}),
+ (__m256i)((__v4du){0, 0, 0, 0})),
+ 0x1000000000000ULL, 0, 0x1000000000000ULL, 0),
+ "mm256_maskz_madd52hi_epu64: mask 0x5");
+
__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: test_mm_madd52lo_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
- return _mm_madd52lo_epu64(__X, __Y, __Z);
+ return _mm_madd52lo_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64(
+ (__m128i)((__v2du){0, 0}),
+ (__m128i)((__v2du){10, 0}),
+ (__m128i)((__v2du){5, 0})),
+ 50, 0),
+ "mm_madd52lo_epu64: basic multiply-add low bits");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64(
+ (__m128i)((__v2du){100, 0}),
+ (__m128i)((__v2du){20, 0}),
+ (__m128i)((__v2du){30, 0})),
+ 700, 0),
+ "mm_madd52lo_epu64: accumulator test");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64(
+ (__m128i)((__v2du){1, 2}),
+ (__m128i)((__v2du){10, 20}),
+ (__m128i)((__v2du){2, 3})),
+ 21, 62),
+ "mm_madd52lo_epu64: two-lane computation");
+
__m128i test_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
// CHECK-LABEL: test_mm_mask_madd52lo_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
- return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y);
+ return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v2di(_mm_mask_madd52lo_epu64((__m128i)((__v2du){1000, 2000}),
+ 0x3,
+ (__m128i)((__v2du){100, 200}),
+ (__m128i)((__v2du){20, 30})),
+ 3000, 8000),
+ "mm_mask_madd52lo_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v2di(_mm_mask_madd52lo_epu64((__m128i)((__v2du){111, 222}),
+ 0x0,
+ (__m128i)((__v2du){1, 2}),
+ (__m128i)((__v2du){10, 20})),
+ 111, 222),
+ "mm_mask_madd52lo_epu64: zero mask preserves __W");
+
__m128i test_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
// CHECK-LABEL: test_mm_maskz_madd52lo_epu64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
// CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
- return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
+ return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52lo_epu64(0x3,
+ (__m128i)((__v2du){100, 200}),
+ (__m128i)((__v2du){20, 30}),
+ (__m128i)((__v2du){30, 40})),
+ 700, 1400),
+ "mm_maskz_madd52lo_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52lo_epu64(0x1,
+ (__m128i)((__v2du){100, 0}),
+ (__m128i)((__v2du){20, 0}),
+ (__m128i)((__v2du){30, 0})),
+ 700, 0),
+ "mm_maskz_madd52lo_epu64: first lane only");
+
__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: test_mm256_madd52lo_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
- return _mm256_madd52lo_epu64(__X, __Y, __Z);
+ return _mm256_madd52lo_epu64(__X, __Y, __Z);
}
+TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_epu64(
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){2, 3, 4, 5})),
+ 21, 62, 123, 204),
+ "mm256_madd52lo_epu64: four-lane computation");
+
__m256i test_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_mask_madd52lo_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
- return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y);
+ return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y);
}
+TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){1000, 2000, 3000, 4000}),
+ 0xF,
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){20, 30, 40, 50})),
+ 3000, 8000, 15000, 24000),
+ "mm256_mask_madd52lo_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){111, 222, 333, 444}),
+ 0x0,
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40})),
+ 111, 222, 333, 444),
+ "mm256_mask_madd52lo_epu64: zero mask preserves __W");
+
+TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){11, 22, 33, 44}),
+ 0x5,
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){10, 20, 30, 40})),
+ 1011, 22, 9033, 44),
+ "mm256_mask_madd52lo_epu64: mask 0x5");
+
__m256i test_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
// CHECK-LABEL: test_mm256_maskz_madd52lo_epu64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
// CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
- return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
+ return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z);
}
+
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52lo_epu64(0xF,
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){20, 30, 40, 50}),
+ (__m256i)((__v4du){30, 40, 50, 60})),
+ 700, 1400, 2300, 3400),
+ "mm256_maskz_madd52lo_epu64: all mask bits set");
+
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52lo_epu64(0x9,
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){5, 10, 15, 20})),
+ 150, 0, 0, 1200),
+ "mm256_maskz_madd52lo_epu64: mask 0x9");
diff --git a/clang/test/CodeGen/X86/avxifma-builtins.c b/clang/test/CodeGen/X86/avxifma-builtins.c
index aa151591ed143..6150f87f039fc 100644
--- a/clang/test/CodeGen/X86/avxifma-builtins.c
+++ b/clang/test/CodeGen/X86/avxifma-builtins.c
@@ -3,58 +3,140 @@
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-
-
#include <immintrin.h>
-
-__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-// CHECK-LABEL: test_mm_madd52hi_epu64
-// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
- return _mm_madd52hi_epu64(__X, __Y, __Z);
-}
-
-__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-// CHECK-LABEL: test_mm256_madd52hi_epu64
-// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
- return _mm256_madd52hi_epu64(__X, __Y, __Z);
-}
-
-__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-// CHECK-LABEL: test_mm_madd52lo_epu64
-// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
- return _mm_madd52lo_epu64(__X, __Y, __Z);
-}
-
-__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-// CHECK-LABEL: test_mm256_madd52lo_epu64
-// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
- return _mm256_madd52lo_epu64(__X, __Y, __Z);
-}
+#include "builtin_test_helpers.h"
__m128i test_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-// CHECK-LABEL: test_mm_madd52hi_avx_epu64
-// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+ // CHECK-LABEL: test_mm_madd52hi_avx_epu64
+ // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_madd52hi_avx_epu64(__X, __Y, __Z);
}
__m256i test_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-// CHECK-LABEL: test_mm256_madd52hi_avx_epu64
-// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+ // CHECK-LABEL: test_mm256_madd52hi_avx_epu64
+ // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_madd52hi_avx_epu64(__X, __Y, __Z);
}
__m128i test_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
-// CHECK-LABEL: test_mm_madd52lo_avx_epu64
-// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+ // CHECK-LABEL: test_mm_madd52lo_avx_epu64
+ // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_madd52lo_avx_epu64(__X, __Y, __Z);
}
__m256i test_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
-// CHECK-LABEL: test_mm256_madd52lo_avx_epu64
-// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+ // CHECK-LABEL: test_mm256_madd52lo_avx_epu64
+ // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_madd52lo_avx_epu64(__X, __Y, __Z);
}
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64(
+ (__m128i)((__v2du){0, 0}),
+ (__m128i)((__v2du){10, 0}),
+ (__m128i)((__v2du){5, 0})),
+ 50, 0),
+ "mm_madd52lo_avx_epu64: basic multiply-add low bits");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64(
+ (__m128i)((__v2du){100, 0}),
+ (__m128i)((__v2du){20, 0}),
+ (__m128i)((__v2du){30, 0})),
+ 700, 0),
+ "mm_madd52lo_avx_epu64: accumulator test");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64(
+ (__m128i)((__v2du){1, 2}),
+ (__m128i)((__v2du){10, 20}),
+ (__m128i)((__v2du){2, 3})),
+ 21, 62),
+ "mm_madd52lo_avx_epu64: two-lane computation");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64(
+ (__m128i)((__v2du){0, 0}),
+ (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}),
+ (__m128i)((__v2du){1, 0})),
+ 0xFFFFFFFFFFFFFull, 0),
+ "mm_madd52lo_avx_epu64: max 52-bit value");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64(
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){2, 3, 4, 5})),
+ 21, 62, 123, 204),
+ "mm256_madd52lo_avx_epu64: four-lane computation");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64(
+ (__m128i)((__v2du){100, 0}),
+ (__m128i)((__v2du){10, 0}),
+ (__m128i)((__v2du){5, 0})),
+ 100, 0),
+ "mm_madd52hi_avx_epu64: high bits remain in accumulator");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64(
+ (__m128i)((__v2du){0, 0}),
+ (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}),
+ (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0})),
+ 0xFFFFFFFFFFFFEull, 0),
+ "mm_madd52hi_avx_epu64: max value multiply high bits");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64(
+ (__m256i)((__v4du){100, 200, 300, 400}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){5, 6, 7, 8})),
+ 100, 200, 300, 400),
+ "mm256_madd52hi_avx_epu64: accumulator preserved");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64(
+ (__m256i)((__v4du){0, 0, 0, 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0,
+ 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0,
+ 0})),
+ 0xFFFFFFFFFFFFEull, 0, 0, 0),
+ "mm256_madd52hi_avx_epu64: single lane max value");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64(
+ (__m256i)((__v4du){0, 0, 0, 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0,
+ 0}),
+ (__m256i)((__v4du){1, 0, 0, 0})),
+ 0xFFFFFFFFFFFFFull, 0, 0, 0),
+ "mm256_madd52lo_avx_epu64: single lane max value");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64(
+ (__m256i)((__v4du){0, 0, 0, 0}),
+ (__m256i)((__v4du){0x1F000000000000ull, 0, 0,
+ 0}),
+ (__m256i)((__v4du){2, 0, 0, 0})),
+ 0xE000000000000ull, 0, 0, 0),
+ "mm256_madd52lo_avx_epu64: 52-bit truncation test");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64(
+ (__m128i)((__v2du){5, 10}),
+ (__m128i)((__v2du){100, 200}),
+ (__m128i)((__v2du){7, 8})),
+ 705, 1610),
+ "mm_madd52lo_avx_epu64: two-lane accumulator");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64(
+ (__m256i)((__v4du){1, 2, 3, 4}),
+ (__m256i)((__v4du){10, 20, 30, 40}),
+ (__m256i)((__v4du){2, 3, 4, 5})),
+ 21, 62, 123, 204),
+ "mm256_madd52lo_avx_epu64: four-lane computation");
+
+TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64(
+ (__m128i)((__v2du){50, 100}),
+ (__m128i)((__v2du){10, 20}),
+ (__m128i)((__v2du){5, 6})),
+ 50, 100),
+ "mm_madd52hi_avx_epu64: accumulator preserved");
+
+TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64(
+ (__m256i)((__v4du){0, 0, 0, 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull, 0, 0}),
+ (__m256i)((__v4du){0xFFFFFFFFFFFFFull,
+ 0xFFFFFFFFFFFFFull, 0, 0})),
+ 0xFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFEull, 0, 0),
+ "mm256_madd52hi_avx_epu64: two-lane max value");
More information about the cfe-commits
mailing list