[clang] [Clang][X86] Replace unnecessary `vfmadd*` builtins with `element_wise_fma` (PR #152545)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Aug 7 09:25:48 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-clang-codegen
Author: None (moorabbit)
<details>
<summary>Changes</summary>
The following intrinsics were replaced by `__builtin_elementwise_fma`:
- `__builtin_ia32_vfmaddps(256)`
- `__builtin_ia32_vfmaddpd(256)`
- `__builtin_ia32_vfmaddph(256)`
- `__builtin_ia32_vfmaddbf16(128 | 256 | 512)`
All `__builtin_ia32_vfmadd*` intrinsics are lowered to `__builtin_elementwise_fma`, so keeping them is an unnecessary indirection.
Fixes [#<!-- -->152461](https://github.com/llvm/llvm-project/issues/152461)
---
Patch is 59.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152545.diff
9 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.td (-24)
- (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (-9)
- (modified) clang/lib/Headers/avx10_2_512bf16intrin.h (+4-4)
- (modified) clang/lib/Headers/avx10_2bf16intrin.h (+8-8)
- (modified) clang/lib/Headers/avx512vlfp16intrin.h (+30-30)
- (modified) clang/lib/Headers/avx512vlintrin.h (+48-48)
- (modified) clang/lib/Headers/fma4intrin.h (+16-16)
- (modified) clang/lib/Headers/fmaintrin.h (+16-16)
- (modified) clang/test/CodeGen/target-builtin-noerror.c (+3-3)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index a4acc72fdc37d..609b720357e38 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -878,11 +878,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
-let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vfmaddps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def vfmaddpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
@@ -899,8 +894,6 @@ let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128
}
let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vfmaddps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
- def vfmaddpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
def vfmaddsubps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
def vfmaddsubpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
}
@@ -4140,14 +4133,6 @@ let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVecto
def vcvtps2phx512_mask : X86Builtin<"_Vector<16, _Float16>(_Vector<16, float>, _Vector<16, _Float16>, unsigned short, _Constant int)">;
}
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vfmaddph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vfmaddph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>, _Vector<16, _Float16>, _Vector<16, _Float16>)">;
-}
-
let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vfmaddph512_mask : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
def vfmaddph512_mask3 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Vector<32, _Float16>, _Vector<32, _Float16>, unsigned int, _Constant int)">;
@@ -5373,13 +5358,4 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
- def vfmaddbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vfmaddbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vfmaddbf16128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">;
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index e23d19d2f6b6b..b508709e4bbae 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1051,18 +1051,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmsubsd3_mask3:
return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
/*NegAcc*/ true);
- case X86::BI__builtin_ia32_vfmaddph:
- case X86::BI__builtin_ia32_vfmaddps:
- case X86::BI__builtin_ia32_vfmaddpd:
- case X86::BI__builtin_ia32_vfmaddph256:
- case X86::BI__builtin_ia32_vfmaddps256:
- case X86::BI__builtin_ia32_vfmaddpd256:
case X86::BI__builtin_ia32_vfmaddph512_mask:
case X86::BI__builtin_ia32_vfmaddph512_maskz:
case X86::BI__builtin_ia32_vfmaddph512_mask3:
- case X86::BI__builtin_ia32_vfmaddbf16128:
- case X86::BI__builtin_ia32_vfmaddbf16256:
- case X86::BI__builtin_ia32_vfmaddbf16512:
case X86::BI__builtin_ia32_vfmaddps512_mask:
case X86::BI__builtin_ia32_vfmaddps512_maskz:
case X86::BI__builtin_ia32_vfmaddps512_mask3:
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 75290d22ef259..a1613420eee2b 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -441,7 +441,7 @@ _mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
- return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
+ return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
(__v32bf)__C);
}
@@ -469,7 +469,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
- return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
+ return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
-(__v32bf)__C);
}
@@ -497,7 +497,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
- return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
+ return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
(__v32bf)__C);
}
@@ -527,7 +527,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
- return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
+ return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
-(__v32bf)__C);
}
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 66797ae00fe4f..8cf1f78f189aa 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -852,7 +852,7 @@ _mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
- return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
+ return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B,
(__v16bf)__C);
}
@@ -880,7 +880,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh(
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
- return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
+ return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B,
-(__v16bf)__C);
}
@@ -908,7 +908,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh(
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
- return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
+ return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B,
(__v16bf)__C);
}
@@ -938,7 +938,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh(
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
- return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
+ return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B,
-(__v16bf)__C);
}
@@ -969,7 +969,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh(
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A,
__m128bh __B,
__m128bh __C) {
- return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
+ return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B,
(__v8bf)__C);
}
@@ -997,7 +997,7 @@ _mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A,
__m128bh __B,
__m128bh __C) {
- return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
+ return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B,
-(__v8bf)__C);
}
@@ -1025,7 +1025,7 @@ _mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A,
__m128bh __B,
__m128bh __C) {
- return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
+ return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B,
(__v8bf)__C);
}
@@ -1053,7 +1053,7 @@ _mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A,
__m128bh __B,
__m128bh __C) {
- return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
+ return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B,
-(__v8bf)__C);
}
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index a12acb7d9a24a..ba27f0459ff25 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -1419,7 +1419,7 @@ _mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
__m128h __B,
__m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B,
(__v8hf)__C);
}
@@ -1429,7 +1429,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
__m128h __C) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
(__v8hf)__A);
}
@@ -1437,7 +1437,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
(__v8hf)__C);
}
@@ -1445,14 +1445,14 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
(__v8hf)_mm_setzero_ph());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
__m128h __B,
__m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B,
-(__v8hf)__C);
}
@@ -1476,7 +1476,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
(__v8hf)__C);
}
@@ -1484,7 +1484,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
(__v8hf)_mm_setzero_ph());
}
@@ -1492,14 +1492,14 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
(__v8hf)_mm_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
__m256h __B,
__m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B,
(__v16hf)__C);
}
@@ -1507,7 +1507,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
(__v16hf)__A);
}
@@ -1515,7 +1515,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
(__v16hf)__C);
}
@@ -1523,14 +1523,14 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
(__v16hf)_mm256_setzero_ph());
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
__m256h __B,
__m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B,
-(__v16hf)__C);
}
@@ -1538,7 +1538,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
(__v16hf)__A);
}
@@ -1546,7 +1546,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
(__v16hf)_mm256_setzero_ph());
}
@@ -1554,7 +1554,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
(__v16hf)__C);
}
@@ -1562,7 +1562,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
(__v16hf)_mm256_setzero_ph());
}
@@ -1570,7 +1570,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
(__v16hf)_mm256_setzero_ph());
}
@@ -1684,7 +1684,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
(__v8hf)__C);
}
@@ -1692,7 +1692,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
return (__m256h)__builtin_ia32_selectph_256(
(__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
(__v16hf)__C);
}
@@ -1715,7 +1715,7 @@ _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
__m128h __B,
__m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
+ return (__m128h)__builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B,
(__v8hf)__C);
}
@@ -1723,14 +1723,14 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectph_128(
(__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
+ __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
(__v8hf)__A);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152545
More information about the cfe-commits
mailing list