[clang] [Clang] Replace some x86 builtins with the generic __builtin_elementwise versions (PR #165682)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 30 08:40:14 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Nikolas Klauser (philnik777)
<details>
<summary>Changes</summary>
---
Full diff: https://github.com/llvm/llvm-project/pull/165682.diff
8 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.td (-26)
- (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+19-45)
- (modified) clang/lib/Headers/avx10_2_512bf16intrin.h (+1-1)
- (modified) clang/lib/Headers/avx10_2bf16intrin.h (+2-2)
- (modified) clang/lib/Headers/avx512vlfp16intrin.h (+2-2)
- (modified) clang/lib/Headers/avxintrin.h (+4-8)
- (modified) clang/lib/Headers/emmintrin.h (+2-3)
- (modified) clang/lib/Headers/xmmintrin.h (+5-8)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
- def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
- def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
- def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
- def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
}
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
}
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(Res, Ops[0]->getType());
}
- case X86::BI__builtin_ia32_sqrtss:
- case X86::BI__builtin_ia32_sqrtsd: {
- Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
- Function *F;
- if (Builder.getIsFPConstrained()) {
- CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
- F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
- A->getType());
- A = Builder.CreateConstrainedFPCall(F, {A});
- } else {
- F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
- A = Builder.CreateCall(F, {A});
- }
- return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
- }
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
- case X86::BI__builtin_ia32_sqrtpd256:
- case X86::BI__builtin_ia32_sqrtpd:
- case X86::BI__builtin_ia32_sqrtps256:
- case X86::BI__builtin_ia32_sqrtps:
- case X86::BI__builtin_ia32_sqrtph256:
- case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
- case X86::BI__builtin_ia32_vsqrtbf16256:
- case X86::BI__builtin_ia32_vsqrtbf16:
- case X86::BI__builtin_ia32_vsqrtbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
- if (Ops.size() == 2) {
- unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
- // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
- // otherwise keep the intrinsic.
- if (CC != 4) {
- Intrinsic::ID IID;
-
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_sqrtph512:
- IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
- break;
- case X86::BI__builtin_ia32_sqrtps512:
- IID = Intrinsic::x86_avx512_sqrt_ps_512;
- break;
- case X86::BI__builtin_ia32_sqrtpd512:
- IID = Intrinsic::x86_avx512_sqrt_pd_512;
- break;
- }
- return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtph512:
+ IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtps512:
+ IID = Intrinsic::x86_avx512_sqrt_ps_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtpd512:
+ IID = Intrinsic::x86_avx512_sqrt_pd_512;
+ break;
}
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
- return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
- return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
- return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
(__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
- return __builtin_ia32_sqrtph((__v8hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
}
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
/// A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
/// values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
- return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
__m128d __b) {
- __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
- return __extension__(__m128d){__c[0], __a[1]};
+ return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
}
/// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
- return __builtin_ia32_sqrtpd((__v2df)__a);
+ return __builtin_elementwise_sqrt(__a);
}
/// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
/// value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+ __a[0] = __builtin_elementwise_sqrt(__a[0]);
+ return __a;
}
/// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
/// values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
- return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the approximate reciprocal of the value stored in the
``````````
</details>
https://github.com/llvm/llvm-project/pull/165682
More information about the cfe-commits mailing list