[clang] [Clang][X86] Replace F16C `vcvtph2ps/256` intrinsics with `(convert|shuffle)vector` builtins (PR #152911)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Aug 11 06:59:41 PDT 2025
https://github.com/moorabbit updated https://github.com/llvm/llvm-project/pull/152911
>From ea2135f42f784a10663ed546217dba78999debff Mon Sep 17 00:00:00 2001
From: moorabbit <moorabbit at proton.me>
Date: Sun, 10 Aug 2025 07:29:05 -0400
Subject: [PATCH 1/2] [Clang][X86] Replace F16C `vcvtph2ps/256` intrinsics with __builtin_convertvector
The following intrinsics were replaced by a combination of `__builtin_shufflevector` and
`__builtin_convertvector`:
- `__builtin_ia32_vcvtph2ps`
- `__builtin_ia32_vcvtph2ps256`
Fixes #152749
---
clang/include/clang/Basic/BuiltinsX86.td | 8 --------
clang/lib/CodeGen/TargetBuiltins/X86.cpp | 2 --
clang/lib/Headers/emmintrin.h | 5 +++++
clang/lib/Headers/f16cintrin.h | 8 +++++---
clang/test/CodeGen/X86/f16c-builtins-constrained.c | 4 ++--
clang/test/CodeGen/X86/f16c-builtins.c | 4 ++--
6 files changed, 14 insertions(+), 17 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 3efc0be8fa698..fc1ee3be7889f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
-}
-
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
-}
-
let Features = "rdrnd", Attributes = [NoThrow] in {
def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b508709e4bbae..f8d451bd20fa3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
// f16c half2float intrinsics
- case X86::BI__builtin_ia32_vcvtph2ps:
- case X86::BI__builtin_ia32_vcvtph2ps256:
case X86::BI__builtin_ia32_vcvtph2ps_mask:
case X86::BI__builtin_ia32_vcvtph2ps256_mask:
case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 60d2000dfb809..9b5567396f60b 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -40,12 +40,17 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
+typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#else
+/* Use __fp16 when _Float16 is not supported. */
+typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
+typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
#endif
/* Define the default attributes for the functions in this file. */
diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h
index 94a662c1d93a8..98b25f5a56953 100644
--- a/clang/lib/Headers/f16cintrin.h
+++ b/clang/lib/Headers/f16cintrin.h
@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
+ __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
+ __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
return __r[0];
}
@@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+ __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
+ return __builtin_convertvector((__v4hf)__v, __v4sf);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+ return __builtin_convertvector((__v8hf)__a, __v8sf);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
index bbd4d8f83b53a..5716f09a677f5 100644
--- a/clang/test/CodeGen/X86/f16c-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -38,7 +38,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call {{.*}}<4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
return _mm_cvtph_ps(a);
}
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index 3c6d64c225b32..61ffa24e6f5c7 100644
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -35,7 +35,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
return _mm_cvtph_ps(a);
}
>From 98baf57b4a66b536bf7a6eb4ec5f3d14bd2d9378 Mon Sep 17 00:00:00 2001
From: moorabbit <moorabbit at proton.me>
Date: Mon, 11 Aug 2025 09:58:28 -0400
Subject: [PATCH 2/2] RKSimon's comment
---
clang/lib/Headers/emmintrin.h | 5 -----
clang/lib/Headers/f16cintrin.h | 13 +++++++------
clang/test/CodeGen/X86/f16c-builtins-constrained.c | 13 ++-----------
clang/test/CodeGen/X86/f16c-builtins.c | 13 ++-----------
4 files changed, 11 insertions(+), 33 deletions(-)
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 9b5567396f60b..60d2000dfb809 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -40,17 +40,12 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
-typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
-#else
-/* Use __fp16 when _Float16 is not supported. */
-typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
-typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
#endif
/* Define the default attributes for the functions in this file. */
diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h
index 98b25f5a56953..e425420f88eed 100644
--- a/clang/lib/Headers/f16cintrin.h
+++ b/clang/lib/Headers/f16cintrin.h
@@ -38,10 +38,7 @@
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
- __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
- __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
- __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
- return __r[0];
+ return (float)__builtin_bit_cast(__fp16, __a);
}
/// Converts a 32-bit single-precision float value to a 16-bit
@@ -110,8 +107,10 @@ _cvtsh_ss(unsigned short __a)
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
+ typedef __fp16 __v4fp16 __attribute__((__vector_size__(8)));
+
__v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
- return __builtin_convertvector((__v4hf)__v, __v4sf);
+ return __builtin_convertvector((__v4fp16)__v, __v4sf);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -155,7 +154,9 @@ _mm_cvtph_ps(__m128i __a)
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
- return __builtin_convertvector((__v8hf)__a, __v8sf);
+ typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16)));
+
+ return __builtin_convertvector((__v8fp16)__a, __v8sf);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
index 5716f09a677f5..50afea8e5fc1d 100644
--- a/clang/test/CodeGen/X86/f16c-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
@@ -8,17 +8,8 @@
float test_cvtsh_ss(unsigned short a) {
// CHECK-LABEL: test_cvtsh_ss
- // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
- // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[CONV:%.*]] = call {{.*}}float @llvm.experimental.constrained.fpext.f32.f16(half %{{.*}}, metadata !"fpexcept.strict")
+ // CHECK: ret float [[CONV]]
return _cvtsh_ss(a);
}
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index 61ffa24e6f5c7..1bee8364f2ef8 100644
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -8,17 +8,8 @@
float test_cvtsh_ss(unsigned short a) {
// CHECK-LABEL: test_cvtsh_ss
- // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
- // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // CHECK: fpext <4 x half> %{{.*}} to <4 x float>
- // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[CONV:%.*]] = fpext half %{{.*}} to float
+ // CHECK: ret float [[CONV]]
return _cvtsh_ss(a);
}
More information about the cfe-commits mailing list