[clang] [Headers][X86] Allow FMA intrinsics to be used in constexpr (PR #156385)

Tue Sep 2 06:08:42 PDT 2025

================
@@ -2501,125 +2501,124 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
                                              -(__v8df)(__m512d)(C), \
                                              (__mmask8)(U), (int)(R)))
 
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
+  return (__m512d)__builtin_elementwise_fma((__v8df)__A, (__v8df)__B,
+                                            (__v8df)__C);
+}
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) -1,
-                                                    _MM_FROUND_CUR_DIRECTION);
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
+  return (__m512d)__builtin_ia32_selectpd_512(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8df)__A, (__v8df)__B, (__v8df)__C),
+      (__v8df)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
-  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
-                                                    (__v8df) __B,
-                                                    (__v8df) __C,
-                                                    (__mmask8) __U,
-                                                    _MM_FROUND_CUR_DIRECTION);
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
+  return (__m512d)__builtin_ia32_selectpd_512(
+      (__mmask8)__U,
+      __builtin_elementwise_fma((__v8df)__A, (__v8df)__B, (__v8df)__C),
+      (__v8df)__C);
----------------
ckoparkar wrote:

Good idea to reuse the base version.

Doing this for the 128-bit and 256-bit intrinsics generates a lot of errors, whereas the 512-bit ones are fine. Is this expected, and is it the reason why the base intrinsic wasn't reused anywhere in `avx512vlintrin.h`? This is the error message I get for all of them:

`always_inline function '_mm_fmadd_pd' requires target feature 'evex512', but would be inlined into function '_mm_mask_fmadd_pd' that is compiled without support for 'evex512'`

I don't fully understand this, but https://github.com/llvm/llvm-project/issues/70002#issuecomment-1801398114 suggests that `-march=native` could cause it. However, I don't see the test using this flag; it is compiled with:

`// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s`.

https://github.com/llvm/llvm-project/pull/156385