[clang] [clang][x86] Add constexpr support for some basic SSE1 intrinsics (PR #111001)
Simon Pilgrim via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 3 07:42:37 PDT 2024
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/111001
This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles.
The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well we the need arises.
>From d3e47ffe0bc2b6080cf74846baf78a3e7ec5b1ca Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 3 Oct 2024 15:19:44 +0100
Subject: [PATCH] [clang][x86] Add constexpr support for some basic SSE1
intrinsics
This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles.
The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well we the need arises.
---
clang/lib/Headers/xmmintrin.h | 128 +++++++++++---------------
clang/test/CodeGen/X86/sse-builtins.c | 83 +++++++++++++++++
2 files changed, 139 insertions(+), 72 deletions(-)
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 9958e91bfceaa9..e755556f9cd319 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -48,6 +48,14 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
__min_vector_width__(128)))
#endif
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
+#endif
+
#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
@@ -75,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
/// of the lower 32 bits of both operands. The upper 96 bits are copied from
/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ss(__m128 __a, __m128 __b) {
__a[0] += __b[0];
return __a;
}
@@ -95,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a + (__v4sf)__b);
}
@@ -117,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
/// difference of the lower 32 bits of both operands. The upper 96 bits are
/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ss(__m128 __a, __m128 __b) {
__a[0] -= __b[0];
return __a;
}
@@ -138,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a - (__v4sf)__b);
}
@@ -160,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ss(__m128 __a, __m128 __b) {
__a[0] *= __b[0];
return __a;
}
@@ -180,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a * (__v4sf)__b);
}
@@ -202,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
/// lower 32 bits of both operands. The upper 96 bits are copied from the
/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ss(__m128 __a, __m128 __b) {
__a[0] /= __b[0];
return __a;
}
@@ -221,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a / (__v4sf)__b);
}
@@ -437,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
/// A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_and_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_and_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a & (__v4su)__b);
}
@@ -459,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// one's complement of the first operand and the values in the second
/// operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_andnot_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_andnot_ps(__m128 __a, __m128 __b) {
return (__m128)(~(__v4su)__a & (__v4su)__b);
}
@@ -477,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_or_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_or_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a | (__v4su)__b);
}
@@ -496,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_xor_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_xor_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a ^ (__v4su)__b);
}
@@ -1738,9 +1734,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
/// used in the extraction.
/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm_cvtss_f32(__m128 __a)
-{
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtss_f32(__m128 __a) {
return __a[0];
}
@@ -1931,9 +1926,8 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value provided in the source operand. The
/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ss(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ss(float __w) {
return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
@@ -1949,9 +1943,8 @@ _mm_set_ss(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set1_ps(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set1_ps(float __w) {
return __extension__ (__m128){ __w, __w, __w, __w };
}
@@ -1968,9 +1961,8 @@ _mm_set1_ps(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps1(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps1(float __w) {
return _mm_set1_ps(__w);
}
@@ -1995,9 +1987,8 @@ _mm_set_ps1(float __w)
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __w, __x, __y, __z };
}
@@ -2023,9 +2014,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w)
/// A single-precision floating-point value used to initialize bits [127:96]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setr_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setr_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __z, __y, __x, __w };
}
@@ -2038,9 +2028,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setzero_ps(void)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setzero_ps(void) {
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
@@ -2786,9 +2775,8 @@ void _mm_setcsr(unsigned int __i);
/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpackhi_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
@@ -2808,9 +2796,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
/// Bits [31:0] are written to bits [63:32] of the destination. \n
/// Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpacklo_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
@@ -2830,9 +2817,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
/// written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_move_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_move_ss(__m128 __a, __m128 __b) {
__a[0] = __b[0];
return __a;
}
@@ -2852,9 +2838,8 @@ _mm_move_ss(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
/// written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehl_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movehl_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
@@ -2873,9 +2858,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
/// written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movelh_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movelh_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 74f62e93c800b6..932d6f36b09b66 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -869,3 +869,86 @@ __m128 test_mm_xor_ps(__m128 A, __m128 B) {
// CHECK: xor <4 x i32>
return _mm_xor_ps(A, B);
}
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+
+void test_constexpr() {
+ constexpr __m128 k1 {+1.0f,+0.0f,+2.0f,+4.0f};
+ constexpr __m128 k2 {+8.0f,+4.0f,+2.0f,+1.0f};
+ constexpr __m128 k3 {-4.0f,-5.0f,+6.0f,+7.0f};
+ constexpr __m128 k4 {+0.0f,-0.0f,-0.0f,+0.0f};
+
+ constexpr __m128 v_mm_set_ss = _mm_set_ss(1.0f);
+ static_assert(v_mm_set_ss[0] == +1.0f && v_mm_set_ss[1] == +0.0f && v_mm_set_ss[2] == +0.0f && v_mm_set_ss[3] == +0.0f);
+
+ constexpr __m128 v_mm_set1_ps = _mm_set1_ps(2.0f);
+ static_assert(v_mm_set1_ps[0] == +2.0f && v_mm_set1_ps[1] == +2.0f && v_mm_set1_ps[2] == +2.0f && v_mm_set1_ps[3] == +2.0f);
+
+ constexpr __m128 v_mm_set_ps1 = _mm_set_ps1(-2.0f);
+ static_assert(v_mm_set_ps1[0] == -2.0f && v_mm_set_ps1[1] == -2.0f && v_mm_set_ps1[2] == -2.0f && v_mm_set_ps1[3] == -2.0f);
+
+ constexpr __m128 v_mm_set_ps = _mm_set_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_set_ps[0] == +3.0f && v_mm_set_ps[1] == +2.0f && v_mm_set_ps[2] == +1.0f && v_mm_set_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_setr_ps = _mm_setr_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_setr_ps[0] == +0.0f && v_mm_setr_ps[1] == +1.0f && v_mm_setr_ps[2] == +2.0f && v_mm_setr_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_setzero_ps = _mm_setzero_ps();
+ static_assert(v_mm_setzero_ps[0] == +0.0f && v_mm_setzero_ps[1] == +0.0f && v_mm_setzero_ps[2] == +0.0f && v_mm_setzero_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_add_ss = _mm_add_ss(k1, k2);
+ static_assert(v_mm_add_ss[0] == +9.0f && v_mm_add_ss[1] == +0.0f && v_mm_add_ss[2] == +2.0f && v_mm_add_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_add_ps = _mm_add_ps(k1, k2);
+ static_assert(v_mm_add_ps[0] == +9.0f && v_mm_add_ps[1] == +4.0f && v_mm_add_ps[2] == +4.0f && v_mm_add_ps[3] == +5.0f);
+
+ constexpr __m128 v_mm_sub_ss = _mm_sub_ss(k1, k2);
+ static_assert(v_mm_sub_ss[0] == -7.0f && v_mm_sub_ss[1] == +0.0f && v_mm_sub_ss[2] == +2.0f && v_mm_sub_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_sub_ps = _mm_sub_ps(k1, k2);
+ static_assert(v_mm_sub_ps[0] == -7.0f && v_mm_sub_ps[1] == -4.0f && v_mm_sub_ps[2] == +0.0f && v_mm_sub_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_mul_ss = _mm_mul_ss(k1, k2);
+ static_assert(v_mm_mul_ss[0] == +8.0f && v_mm_mul_ss[1] == +0.0f && v_mm_mul_ss[2] == +2.0f && v_mm_mul_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_mul_ps = _mm_mul_ps(k1, k2);
+ static_assert(v_mm_mul_ps[0] == +8.0f && v_mm_mul_ps[1] == +0.0f && v_mm_mul_ps[2] == +4.0f && v_mm_mul_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ss = _mm_div_ss(k1, k2);
+ static_assert(v_mm_div_ss[0] == +0.125f && v_mm_div_ss[1] == +0.0f && v_mm_div_ss[2] == +2.0f && v_mm_div_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ps = _mm_div_ps(k1, k2);
+ static_assert(v_mm_div_ps[0] == +0.125f && v_mm_div_ps[1] == +0.0f && v_mm_div_ps[2] == +1.0f && v_mm_div_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_and_ps = _mm_and_ps(k3, k4);
+ static_assert(v_mm_and_ps[0] == +0.0f && v_mm_and_ps[1] == +0.0f && v_mm_and_ps[2] == +0.0f && v_mm_and_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_andnot_ps = _mm_andnot_ps(k3, k4);
+ static_assert(v_mm_andnot_ps[0] == +0.0f && v_mm_andnot_ps[1] == +0.0f && v_mm_andnot_ps[2] == +0.0f && v_mm_andnot_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_or_ps = _mm_or_ps(k3, k4);
+ static_assert(v_mm_or_ps[0] == -4.0f && v_mm_or_ps[1] == -5.0f && v_mm_or_ps[2] == -6.0f && v_mm_or_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_xor_ps = _mm_xor_ps(k3, k4);
+ static_assert(v_mm_xor_ps[0] == -4.0f && v_mm_xor_ps[1] == +5.0f && v_mm_xor_ps[2] == -6.0f && v_mm_xor_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_unpackhi_ps = _mm_unpackhi_ps(k1, k2);
+ static_assert(v_mm_unpackhi_ps[0] == +2.0f && v_mm_unpackhi_ps[1] == +2.0f && v_mm_unpackhi_ps[2] == +4.0f && v_mm_unpackhi_ps[3] == +1.0f);
+
+ constexpr __m128 v_mm_unpacklo_ps = _mm_unpacklo_ps(k1, k2);
+ static_assert(v_mm_unpacklo_ps[0] == +1.0f && v_mm_unpacklo_ps[1] == +8.0f && v_mm_unpacklo_ps[2] == +0.0f && v_mm_unpacklo_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_move_ss = _mm_move_ss(k1, k2);
+ static_assert(v_mm_move_ss[0] == +8.0f && v_mm_move_ss[1] == +0.0f && v_mm_move_ss[2] == +2.0f && v_mm_move_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_movehl_ps = _mm_movehl_ps(k1, k2);
+ static_assert(v_mm_movehl_ps[0] == +2.0f && v_mm_movehl_ps[1] == +1.0f && v_mm_movehl_ps[2] == +2.0f && v_mm_movehl_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_movelh_ps = _mm_movelh_ps(k1, k2);
+ static_assert(v_mm_movelh_ps[0] == +1.0f && v_mm_movelh_ps[1] == +0.0f && v_mm_movelh_ps[2] == +8.0f && v_mm_movelh_ps[3] == +4.0f);
+
+ static_assert(_mm_cvtss_f32(k2) == +8.0f);
+}
+
+#endif
\ No newline at end of file
More information about the cfe-commits
mailing list