[clang] [X86] Add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr (PR #156822)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Sep 25 22:52:26 PDT 2025
https://github.com/whytolearn updated https://github.com/llvm/llvm-project/pull/156822
>From a81c4068096b960de65c3517f18d2d31004afbce Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 4 Sep 2025 15:52:57 +0800
Subject: [PATCH 1/6] Address issue #155395
VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395
---
clang/lib/Headers/avx2intrin.h | 27 ++++-----
clang/lib/Headers/avxintrin.h | 11 ++--
clang/lib/Headers/pmmintrin.h | 20 +++----
clang/lib/Headers/tmmintrin.h | 80 +++++++++++--------------
clang/test/CodeGen/X86/avx-builtins.c | 29 +++++++++
clang/test/CodeGen/X86/avx2-builtins.c | 63 +++++++++++++++++++
clang/test/CodeGen/X86/mmx-builtins.c | 48 +++++++++++++++
clang/test/CodeGen/X86/ssse3-builtins.c | 49 +++++++++++++++
8 files changed, 250 insertions(+), 77 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 384faa35d246f..f8fb808f7f29c 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -854,10 +854,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadd_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
}
/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
@@ -886,7 +885,7 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hadd_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
@@ -921,10 +920,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadds_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
}
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -957,10 +955,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hsub_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
}
/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
@@ -989,7 +986,7 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hsub_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
@@ -1025,7 +1022,7 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 26096da949447..976710a64e80e 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -703,7 +703,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
/// elements of a vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
/// both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_hadd_pd(__m256d __a, __m256d __b)
{
return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
@@ -726,9 +726,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
/// index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
/// both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hadd_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_hadd_ps(__m256 __a, __m256 __b) {
return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
}
@@ -749,7 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b)
/// odd-indexed elements of a vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the horizontal
/// differences of both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_hsub_pd(__m256d __a, __m256d __b)
{
return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
@@ -772,7 +771,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal
/// differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_hsub_ps(__m256 __a, __m256 __b)
{
return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index cd605df7fb52d..400b28bb877a1 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,9 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hadd_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_ps(__m128 __a, __m128 __b) {
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
@@ -174,9 +173,8 @@ _mm_moveldup_ps(__m128 __a)
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
/// and differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_addsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_addsub_pd(__m128d __a, __m128d __b) {
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}
@@ -197,9 +195,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
/// destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
/// both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hadd_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_pd(__m128d __a, __m128d __b) {
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}
@@ -220,9 +217,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
/// the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
/// differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_pd(__m128d __a, __m128d __b) {
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index f01c61afa8ea2..d79f7f6ea4091 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -204,10 +204,10 @@ _mm_abs_epi32(__m128i __a) {
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
/// both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phaddw128(
+ (__v8hi)__a, (__v8hi)__b);
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -227,10 +227,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b)
/// destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
/// both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi32(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -250,11 +249,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phaddw128(
- (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+ _mm_hadd_pi16(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_phaddw128(
+ (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -274,7 +272,7 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_phaddd128(
@@ -301,10 +299,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// sums of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadds_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadds_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -327,7 +324,7 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_phaddsw128(
@@ -351,10 +348,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b)
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
/// of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -374,10 +370,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b)
/// the destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
/// of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi32(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -397,7 +392,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_phsubw128(
@@ -421,7 +416,7 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_phsubd128(
@@ -448,10 +443,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// differences of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsubs_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -474,7 +468,7 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_phsubsw128(
@@ -509,10 +503,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -539,11 +532,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
- (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_pi16(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -560,7 +552,7 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b)
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
/// products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mulhrs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 4a048744faa61..f381faebededf 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1083,24 +1083,53 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_hadd_pd(A, B);
}
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+ constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+ constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+ constexpr __m256d result = _mm256_hadd_pd(a, b);
+ return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
__m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_hadd_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_hadd_ps(A, B);
}
+constexpr bool test_mm256_hadd_ps_constexpr() {
+ constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+ constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
+ constexpr __m256 result = _mm256_hadd_ps(a, b);
+ return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f,
+ 9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f);
+}
+TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr())
__m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_hsub_pd
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_hsub_pd(A, B);
}
+constexpr bool test_mm256_hsub_pd_constexpr() {
+ constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+ constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+ constexpr __m256d result = _mm256_hsub_pd(a, b);
+ return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0);
+}
+TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr())
__m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_hsub_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_hsub_ps(A, B);
}
+constexpr bool test_mm256_hsub_ps_constexpr() {
+ constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+ constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
+ constexpr __m256 result = _mm256_hsub_ps(a, b);
+ return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f,
+ 9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f);
+}
__m256i test_mm256_insert_epi8(__m256i x, char b) {
// CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index a39ce513837ea..02845b9417a1f 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -462,17 +462,48 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
return _mm256_hadd_epi16(a, b);
}
+constexpr bool test_mm256_hadd_epi16_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7,
+ 8,9,10,11,12,13,14,15,16);
+ constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+ 24,25,26,27,28,29,30,31,32);
+
+ constexpr __m256i result = _mm256_hadd_epi16(a, b);
+ return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr())
+
__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadd_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_hadd_epi32(a, b);
}
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+ constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+
+ constexpr __m256i result = _mm256_hadd_epi32(a, b);
+ return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+
__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadds_epi16
// CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hadds_epi16(a, b);
}
+constexpr bool test_mm256_hadds_epi16_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+ 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+ constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1);
+ constexpr __m256i result = _mm256_hadds_epi16(a, b);
+
+ return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+ 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr())
__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi16
@@ -480,18 +511,50 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
return _mm256_hsub_epi16(a, b);
}
+constexpr bool test_mm256_hsub_epi16_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7,
+ 8,9,10,11,12,13,14,15,16);
+ constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+ 24,25,26,27,28,29,30,31,32);
+
+ constexpr __m256i result = _mm256_hsub_epi16(a, b);
+ return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr())
+
__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_hsub_epi32(a, b);
}
+constexpr bool test_mm256_hsub_epi32_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+ constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+
+ constexpr __m256i result = _mm256_hsub_epi32(a, b);
+ return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr())
+
__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsubs_epi16
// CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hsubs_epi16(a, b);
}
+constexpr bool test_mm256_hsubs_epi16_constexpr() {
+ constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+ 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+ constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+ constexpr __m256i result3 = _mm256_hsubs_epi16(a, b);
+
+ return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+ 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr())
+
+
__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
// CHECK-LABEL: test_mm_i32gather_epi32
// CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 7bd2475399bf9..8da0e8c814879 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -309,36 +309,84 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
return _mm_hadd_pi16(a, b);
}
+constexpr bool test_mm_hadd_pi16_constexpr() {
+ constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+ constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+
+ constexpr __m64 result = _mm_hadd_pi16(a, b);
+ return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr())
__m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadd_pi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
return _mm_hadd_pi32(a, b);
}
+constexpr bool test_mm_hadd_pi32_constexpr() {
+ constexpr __m64 a = _mm_setr_pi32(1, 2);
+ constexpr __m64 b = _mm_setr_pi32(3, 4);
+
+ constexpr __m64 result = _mm_hadd_pi32(a, b);
+ return match_v2si(result,1+2,3+4);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr())
__m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadds_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
return _mm_hadds_pi16(a, b);
}
+constexpr bool test_mm_hadds_pi16_constexpr() {
+ constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+ constexpr __m64 b = _mm_setr_pi16(1,1,1,1);
+
+ constexpr __m64 result = _mm_hadds_pi16(a, b);
+ return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr())
__m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
return _mm_hsub_pi16(a, b);
}
+constexpr bool test_mm_hsub_pi16_constexpr() {
+ constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+ constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+
+ constexpr __m64 result = _mm_hsub_pi16(a, b);
+ return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr())
__m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
return _mm_hsub_pi32(a, b);
}
+constexpr bool test_mm_hsub_pi32_constexpr() {
+ constexpr __m64 a = _mm_setr_pi32(1, 2);
+ constexpr __m64 b = _mm_setr_pi32(3, 4);
+
+ constexpr __m64 result = _mm_hsub_pi32(a, b);
+ return match_v2si(result,1-2,3-4);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr())
__m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsubs_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
return _mm_hsubs_pi16(a, b);
}
+constexpr bool test_mm_hsubs_pi16_constexpr() {
+ constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+ constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1);
+
+ constexpr __m64 result = _mm_hsubs_pi16(a, b);
+ return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr())
__m64 test_mm_insert_pi16(__m64 a, int d) {
// CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 56ff73f08ab32..bd0ef43278217 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -60,36 +60,85 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hadd_epi16(a, b);
}
+constexpr bool test_mm_hadd_epi16_constexpr() {
+ constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24);
+
+ constexpr __m128i result = _mm_hadd_epi16(a, b);
+ return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr())
__m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hadd_epi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_hadd_epi32(a, b);
}
+constexpr bool test_mm_hadd_epi32_constexpr() {
+ constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+ constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+
+ constexpr __m128i result = _mm_hadd_epi32(a, b);
+ return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr())
__m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hadds_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hadds_epi16(a, b);
}
+constexpr bool test_mm_hadds_epi16_constexpr() {
+ constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+ constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
+ constexpr __m128i result = _mm_hadds_epi16(a, b);
+
+ return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr())
+
__m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsub_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsub_epi16(a, b);
}
+constexpr bool test_mm_hsub_epi16_constexpr() {
+ constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16);
+
+ constexpr __m128i result = _mm_hsub_epi16(a, b);
+ return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr())
__m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsub_epi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_hsub_epi32(a, b);
}
+constexpr bool test_mm_hsub_epi32_constexpr() {
+ constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+ constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+
+ constexpr __m128i result = _mm_hsub_epi32(a, b);
+ return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr())
__m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsubs_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsubs_epi16(a, b);
}
+constexpr bool test_mm_hsubs_epi16_constexpr() {
+ constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+ constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
+ constexpr __m128i result3 = _mm_hsubs_epi16(a, b);
+
+ return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr())
__m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_maddubs_epi16
>From 2fadf3fd261935e25adff5b26ad8ee0734746a26 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 4 Sep 2025 15:55:44 +0800
Subject: [PATCH 2/6] Address issue #155395 [Clang]
VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395
---
clang/lib/Headers/avx2intrin.h | 15 ++++-----
clang/lib/Headers/avxintrin.h | 15 ++++-----
clang/lib/Headers/pmmintrin.h | 4 +--
clang/lib/Headers/tmmintrin.h | 57 +++++++++++++++-------------------
4 files changed, 39 insertions(+), 52 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index f8fb808f7f29c..c39f94c7fc16b 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -886,9 +886,8 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) {
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+_mm256_hadd_epi32(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
}
/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
@@ -987,9 +986,8 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) {
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+_mm256_hsub_epi32(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
}
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -1023,9 +1021,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
}
/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 976710a64e80e..48d79063f9b61 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -704,8 +704,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
/// both operands.
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hadd_pd(__m256d __a, __m256d __b)
-{
+_mm256_hadd_pd(__m256d __a, __m256d __b) {
return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
}
@@ -726,8 +725,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
/// index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
/// both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hadd_ps(__m256 __a, __m256 __b) {
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
+ __m256 __b) {
return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
}
@@ -749,8 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) {
/// \returns A 256-bit vector of [4 x double] containing the horizontal
/// differences of both operands.
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_pd(__m256d __a, __m256d __b)
-{
+_mm256_hsub_pd(__m256d __a, __m256d __b) {
return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
}
@@ -771,9 +769,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the horizontal
/// differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
+ __m256 __b) {
return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
}
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 400b28bb877a1..67f2a7ffd1f56 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,8 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadd_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a,
+ __m128 __b) {
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index d79f7f6ea4091..b408c6a3404ec 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -206,8 +206,7 @@ _mm_abs_epi32(__m128i __a) {
/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_hadd_epi16(__m128i __a, __m128i __b) {
- return (__m128i)__builtin_ia32_phaddw128(
- (__v8hi)__a, (__v8hi)__b);
+ return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -249,8 +248,8 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) {
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
- _mm_hadd_pi16(__m64 __a, __m64 __b) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a,
+ __m64 __b) {
return __trunc64(__builtin_ia32_phaddw128(
(__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}
@@ -272,11 +271,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phaddd128(
- (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a,
+ __m64 __b) {
+ return __trunc64(__builtin_ia32_phaddd128(
+ (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}
/// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -324,11 +322,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) {
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phaddsw128(
- (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a,
+ __m64 __b) {
+ return __trunc64(__builtin_ia32_phaddsw128(
+ (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -392,11 +389,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) {
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phsubw128(
- (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a,
+ __m64 __b) {
+ return __trunc64(__builtin_ia32_phsubw128(
+ (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -416,11 +412,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phsubd128(
- (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a,
+ __m64 __b) {
+ return __trunc64(__builtin_ia32_phsubd128(
+ (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}
/// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -468,11 +463,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) {
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_phsubsw128(
- (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a,
+ __m64 __b) {
+ return __trunc64(__builtin_ia32_phsubsw128(
+ (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -553,9 +547,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) {
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+_mm_mulhrs_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
>From ed4a09fb51ab347b4778b81d1f8c511d31d106a7 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 11 Sep 2025 13:48:13 +0800
Subject: [PATCH 3/6] constexpr handling
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 243 +++++++++++++++++----
clang/lib/AST/ExprConstant.cpp | 266 +++++++++++++++++------
2 files changed, 407 insertions(+), 102 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8c2b71160f7f3..f6027c78935c3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -8,6 +8,7 @@
#include "../ExprConstShared.h"
#include "Boolean.h"
#include "EvalEmitter.h"
+#include "Floating.h"
#include "Interp.h"
#include "InterpBuiltinBitCast.h"
#include "PrimType.h"
@@ -19,6 +20,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SipHash.h"
+#include <cassert>
namespace clang {
namespace interp {
@@ -2736,6 +2738,141 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call,
+ uint32_t BuiltinID) {
+ assert(Call->getArg(0)->getType()->isVectorType() &&
+ Call->getArg(1)->getType()->isVectorType());
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ PrimType ElemT = *S.getContext().classify(VT->getElementType());
+ unsigned SourceLen = VT->getNumElements();
+ assert(SourceLen % 2 == 0 &&
+ Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+ PrimType DstElemT = *S.getContext().classify(
+ Call->getType()->castAs<VectorType>()->getElementType());
+ unsigned DstElem = 0;
+
+ bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_phaddw128 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddw256 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddd128 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddd256 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256);
+
+ bool IsSaturating = (BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phsubsw128 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_phsubsw256);
+
+ for (unsigned I = 0; I != SourceLen; I += 2) {
+ APSInt Elem1;
+ APSInt Elem2;
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ Elem1 = LHS.elem<T>(I).toAPSInt();
+ Elem2 = LHS.elem<T>(I+1).toAPSInt();
+ });
+ APSInt Result;
+ if (IsAdd) {
+ if (IsSaturating) {
+ Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ }else{
+ Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+ }
+ }else{
+ if (IsSaturating) {
+ Result =
+ APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ } else {
+ Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+ }
+ }
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+ { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
+ ++DstElem;
+ }
+ for (unsigned I = 0; I != SourceLen; I += 2) {
+ APSInt Elem1;
+ APSInt Elem2;
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ Elem1 = RHS.elem<T>(I).toAPSInt();
+ Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+ });
+ APSInt Result;
+ if (IsAdd) {
+ if (IsSaturating) {
+ Result =
+ APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ } else {
+ Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+ }
+ } else {
+ if (IsSaturating) {
+ Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ } else {
+ Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+ }
+ }
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+ { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
+ ++DstElem;
+ }
+ Dst.initializeAllElements();
+ return true;
+}
+
+static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const CallExpr *Call,
+ uint32_t BuiltinID) {
+ assert(Call->getArg(0)->getType()->isVectorType() &&
+ Call->getArg(1)->getType()->isVectorType());
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+ llvm::RoundingMode RM = getRoundingMode(FPO);
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ unsigned SourceLen = VT->getNumElements();
+ assert(SourceLen % 2 == 0 &&
+ Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+ SourceLen);
+ unsigned DstElem = 0;
+ bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_haddpd ||
+ BuiltinID == clang::X86::BI__builtin_ia32_haddpd256 ||
+ BuiltinID == clang::X86::BI__builtin_ia32_haddps ||
+ BuiltinID == clang::X86::BI__builtin_ia32_haddps256);
+ using T = Floating;
+ for (unsigned I = 0; I != SourceLen; I += 2) {
+ APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
+ APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
+
+ if (IsAdd) {
+ Elem1.add(Elem2, RM);
+ } else {
+ Elem1.subtract(Elem2, RM);
+ }
+ Dst.elem<T>(DstElem++) = Elem1;
+ }
+ for (unsigned I = 0; I != SourceLen; I += 2) {
+ APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
+ APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
+ if (IsAdd) {
+ Elem1.add(Elem2, RM);
+ } else {
+ Elem1.subtract(Elem2, RM);
+ }
+ Dst.elem<T>(DstElem++) = Elem1;
+ }
+ Dst.initializeAllElements();
+ return true;
+}
+
static bool interp__builtin_elementwise_fma(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
assert(Call->getNumArgs() == 3);
@@ -3356,49 +3493,73 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_elementwise_min:
return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID);
- case clang::X86::BI__builtin_ia32_pmuldq128:
- case clang::X86::BI__builtin_ia32_pmuldq256:
- case clang::X86::BI__builtin_ia32_pmuldq512:
- case clang::X86::BI__builtin_ia32_pmuludq128:
- case clang::X86::BI__builtin_ia32_pmuludq256:
- case clang::X86::BI__builtin_ia32_pmuludq512:
- return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
-
- case Builtin::BI__builtin_elementwise_fma:
- return interp__builtin_elementwise_fma(S, OpPC, Call);
-
- case X86::BI__builtin_ia32_selectb_128:
- case X86::BI__builtin_ia32_selectb_256:
- case X86::BI__builtin_ia32_selectb_512:
- case X86::BI__builtin_ia32_selectw_128:
- case X86::BI__builtin_ia32_selectw_256:
- case X86::BI__builtin_ia32_selectw_512:
- case X86::BI__builtin_ia32_selectd_128:
- case X86::BI__builtin_ia32_selectd_256:
- case X86::BI__builtin_ia32_selectd_512:
- case X86::BI__builtin_ia32_selectq_128:
- case X86::BI__builtin_ia32_selectq_256:
- case X86::BI__builtin_ia32_selectq_512:
- case X86::BI__builtin_ia32_selectph_128:
- case X86::BI__builtin_ia32_selectph_256:
- case X86::BI__builtin_ia32_selectph_512:
- case X86::BI__builtin_ia32_selectpbf_128:
- case X86::BI__builtin_ia32_selectpbf_256:
- case X86::BI__builtin_ia32_selectpbf_512:
- case X86::BI__builtin_ia32_selectps_128:
- case X86::BI__builtin_ia32_selectps_256:
- case X86::BI__builtin_ia32_selectps_512:
- case X86::BI__builtin_ia32_selectpd_128:
- case X86::BI__builtin_ia32_selectpd_256:
- case X86::BI__builtin_ia32_selectpd_512:
- return interp__builtin_select(S, OpPC, Call);
+ case clang::X86::BI__builtin_ia32_phaddw128:
+ case clang::X86::BI__builtin_ia32_phaddw256:
+ case clang::X86::BI__builtin_ia32_phaddd128:
+ case clang::X86::BI__builtin_ia32_phaddd256:
+ case clang::X86::BI__builtin_ia32_phaddsw128:
+ case clang::X86::BI__builtin_ia32_phaddsw256:
+ case clang::X86::BI__builtin_ia32_phsubw128:
+ case clang::X86::BI__builtin_ia32_phsubw256:
+ case clang::X86::BI__builtin_ia32_phsubd128:
+ case clang::X86::BI__builtin_ia32_phsubd256:
+ case clang::X86::BI__builtin_ia32_phsubsw128:
+ case clang::X86::BI__builtin_ia32_phsubsw256:
+
+ return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+ case clang::X86::BI__builtin_ia32_haddpd:
+ case clang::X86::BI__builtin_ia32_haddpd256:
+ case clang::X86::BI__builtin_ia32_haddps:
+ case clang::X86::BI__builtin_ia32_haddps256:
+ case clang::X86::BI__builtin_ia32_hsubpd:
+ case clang::X86::BI__builtin_ia32_hsubpd256:
+ case clang::X86::BI__builtin_ia32_hsubps:
+ case clang::X86::BI__builtin_ia32_hsubps256:
+ return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
- default:
- S.FFDiag(S.Current->getLocation(OpPC),
- diag::note_invalid_subexpr_in_const_expr)
- << S.Current->getRange(OpPC);
+ case clang::X86::BI__builtin_ia32_pmuldq128:
+ case clang::X86::BI__builtin_ia32_pmuldq256:
+ case clang::X86::BI__builtin_ia32_pmuldq512:
+ case clang::X86::BI__builtin_ia32_pmuludq128:
+ case clang::X86::BI__builtin_ia32_pmuludq256:
+ case clang::X86::BI__builtin_ia32_pmuludq512:
+ return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
+
+ case Builtin::BI__builtin_elementwise_fma:
+ return interp__builtin_elementwise_fma(S, OpPC, Call);
+
+ case X86::BI__builtin_ia32_selectb_128:
+ case X86::BI__builtin_ia32_selectb_256:
+ case X86::BI__builtin_ia32_selectb_512:
+ case X86::BI__builtin_ia32_selectw_128:
+ case X86::BI__builtin_ia32_selectw_256:
+ case X86::BI__builtin_ia32_selectw_512:
+ case X86::BI__builtin_ia32_selectd_128:
+ case X86::BI__builtin_ia32_selectd_256:
+ case X86::BI__builtin_ia32_selectd_512:
+ case X86::BI__builtin_ia32_selectq_128:
+ case X86::BI__builtin_ia32_selectq_256:
+ case X86::BI__builtin_ia32_selectq_512:
+ case X86::BI__builtin_ia32_selectph_128:
+ case X86::BI__builtin_ia32_selectph_256:
+ case X86::BI__builtin_ia32_selectph_512:
+ case X86::BI__builtin_ia32_selectpbf_128:
+ case X86::BI__builtin_ia32_selectpbf_256:
+ case X86::BI__builtin_ia32_selectpbf_512:
+ case X86::BI__builtin_ia32_selectps_128:
+ case X86::BI__builtin_ia32_selectps_256:
+ case X86::BI__builtin_ia32_selectps_512:
+ case X86::BI__builtin_ia32_selectpd_128:
+ case X86::BI__builtin_ia32_selectpd_256:
+ case X86::BI__builtin_ia32_selectpd_512:
+ return interp__builtin_select(S, OpPC, Call);
- return false;
+ default:
+ S.FFDiag(S.Current->getLocation(OpPC),
+ diag::note_invalid_subexpr_in_const_expr)
+ << S.Current->getRange(OpPC);
+
+ return false;
}
llvm_unreachable("Unhandled builtin ID");
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 66362d44976c4..774a3adf1a7ca 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -55,6 +55,7 @@
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "llvm/ADT/APFixedPoint.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -12105,6 +12106,145 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+
+ case clang::X86::BI__builtin_ia32_phaddw128:
+ case clang::X86::BI__builtin_ia32_phaddw256:
+ case clang::X86::BI__builtin_ia32_phaddd128:
+ case clang::X86::BI__builtin_ia32_phaddd256:
+ case clang::X86::BI__builtin_ia32_phaddsw128:
+ case clang::X86::BI__builtin_ia32_phaddsw256:
+
+ case clang::X86::BI__builtin_ia32_phsubw128:
+ case clang::X86::BI__builtin_ia32_phsubw256:
+ case clang::X86::BI__builtin_ia32_phsubd128:
+ case clang::X86::BI__builtin_ia32_phsubd256:
+ case clang::X86::BI__builtin_ia32_phsubsw128:
+ case clang::X86::BI__builtin_ia32_phsubsw256:{
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+
+ unsigned SourceLen = SourceLHS.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen);
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APSInt LHSA = SourceLHS.getVectorElt(EltNum).getInt();
+ APSInt LHSB = SourceLHS.getVectorElt(EltNum + 1).getInt();
+
+ switch (E->getBuiltinCallee()) {
+ case clang::X86::BI__builtin_ia32_phaddw128:
+ case clang::X86::BI__builtin_ia32_phaddw256:
+ case clang::X86::BI__builtin_ia32_phaddd128:
+ case clang::X86::BI__builtin_ia32_phaddd256:
+ ResultElements.push_back(
+ APValue(APSInt(LHSA+LHSB, DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phaddsw128:
+ case clang::X86::BI__builtin_ia32_phaddsw256:
+ ResultElements.push_back(APValue(APSInt(
+ LHSA.isSigned() ? LHSA.sadd_sat(LHSB) : LHSA.uadd_sat(LHSB),
+ DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phsubw128:
+ case clang::X86::BI__builtin_ia32_phsubw256:
+ case clang::X86::BI__builtin_ia32_phsubd128:
+ case clang::X86::BI__builtin_ia32_phsubd256:
+ ResultElements.push_back(APValue(APSInt(LHSA - LHSB, DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phsubsw128:
+ case clang::X86::BI__builtin_ia32_phsubsw256:
+ ResultElements.push_back(APValue(APSInt(
+ LHSA.isSigned() ? LHSA.ssub_sat(LHSB) : LHSA.usub_sat(LHSB),
+ DestUnsigned)));
+ break;
+ }
+ }
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APSInt RHSA = SourceRHS.getVectorElt(EltNum).getInt();
+ APSInt RHSB = SourceRHS.getVectorElt(EltNum + 1).getInt();
+
+ switch (E->getBuiltinCallee()) {
+ case clang::X86::BI__builtin_ia32_phaddw128:
+ case clang::X86::BI__builtin_ia32_phaddw256:
+ case clang::X86::BI__builtin_ia32_phaddd128:
+ case clang::X86::BI__builtin_ia32_phaddd256:
+ ResultElements.push_back(APValue(APSInt(RHSA + RHSB, DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phaddsw128:
+ case clang::X86::BI__builtin_ia32_phaddsw256:
+ ResultElements.push_back(APValue(
+ APSInt(RHSA.isSigned() ? RHSA.sadd_sat(RHSB) : RHSA.uadd_sat(RHSB),
+ DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phsubw128:
+ case clang::X86::BI__builtin_ia32_phsubw256:
+ case clang::X86::BI__builtin_ia32_phsubd128:
+ case clang::X86::BI__builtin_ia32_phsubd256:
+ ResultElements.push_back(APValue(APSInt(RHSA - RHSB, DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_phsubsw128:
+ case clang::X86::BI__builtin_ia32_phsubsw256:
+ ResultElements.push_back(APValue(
+ APSInt(RHSA.isSigned() ? RHSA.ssub_sat(RHSB) : RHSA.usub_sat(RHSB),
+ DestUnsigned)));
+ break;
+ }
+ }
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+ case clang::X86::BI__builtin_ia32_haddpd:
+ case clang::X86::BI__builtin_ia32_haddpd256:
+ case clang::X86::BI__builtin_ia32_haddps:
+ case clang::X86::BI__builtin_ia32_haddps256: {
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+ unsigned SourceLen = SourceLHS.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen);
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
+ APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
+ LHSA.add(LHSB, APFloat::rmNearestTiesToEven);
+ ResultElements.push_back(APValue(LHSA));
+ }
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
+ APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
+ RHSA.add(RHSB, APFloat::rmNearestTiesToEven);
+ ResultElements.push_back(APValue(RHSA));
+ }
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+ case clang::X86::BI__builtin_ia32_hsubpd:
+ case clang::X86::BI__builtin_ia32_hsubpd256:
+ case clang::X86::BI__builtin_ia32_hsubps:
+ case clang::X86::BI__builtin_ia32_hsubps256: {
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+ unsigned SourceLen = SourceLHS.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen);
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
+ APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
+ LHSA.subtract(LHSB, APFloat::rmNearestTiesToEven);
+ ResultElements.push_back(APValue(LHSA));
+ }
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
+ APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
+ RHSA.subtract(RHSB, APFloat::rmNearestTiesToEven);
+ ResultElements.push_back(APValue(RHSA));
+ }
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
}
}
@@ -12197,67 +12337,71 @@ bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
namespace {
class ArrayExprEvaluator
- : public ExprEvaluatorBase<ArrayExprEvaluator> {
- const LValue &This;
- APValue &Result;
- public:
-
- ArrayExprEvaluator(EvalInfo &Info, const LValue &This, APValue &Result)
- : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
-
- bool Success(const APValue &V, const Expr *E) {
- assert(V.isArray() && "expected array");
- Result = V;
- return true;
- }
-
- bool ZeroInitialization(const Expr *E) {
- const ConstantArrayType *CAT =
- Info.Ctx.getAsConstantArrayType(E->getType());
- if (!CAT) {
- if (E->getType()->isIncompleteArrayType()) {
- // We can be asked to zero-initialize a flexible array member; this
- // is represented as an ImplicitValueInitExpr of incomplete array
- // type. In this case, the array has zero elements.
- Result = APValue(APValue::UninitArray(), 0, 0);
- return true;
- }
- // FIXME: We could handle VLAs here.
- return Error(E);
- }
-
- Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
- if (!Result.hasArrayFiller())
- return true;
-
- // Zero-initialize all elements.
- LValue Subobject = This;
- Subobject.addArray(Info, E, CAT);
- ImplicitValueInitExpr VIE(CAT->getElementType());
- return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, &VIE);
- }
-
- bool VisitCallExpr(const CallExpr *E) {
- return handleCallExpr(E, Result, &This);
- }
- bool VisitInitListExpr(const InitListExpr *E,
- QualType AllocType = QualType());
- bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
- bool VisitCXXConstructExpr(const CXXConstructExpr *E);
- bool VisitCXXConstructExpr(const CXXConstructExpr *E,
- const LValue &Subobject,
- APValue *Value, QualType Type);
- bool VisitStringLiteral(const StringLiteral *E,
- QualType AllocType = QualType()) {
- expandStringLiteral(Info, E, Result, AllocType);
- return true;
- }
- bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
- bool VisitCXXParenListOrInitListExpr(const Expr *ExprToVisit,
- ArrayRef<Expr *> Args,
- const Expr *ArrayFiller,
- QualType AllocType = QualType());
- };
+ :
+ public
+ ExprEvaluatorBase<ArrayExprEvaluator> {
+ const LValue &This;
+ APValue & Result;
+
+ public:
+ ArrayExprEvaluator(EvalInfo & Info, const LValue &This,
+ APValue &Result)
+ : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
+
+ bool Success(const APValue &V, const Expr *E) {
+ assert(V.isArray() && "expected array");
+ Result = V;
+ return true;
+ }
+
+ bool ZeroInitialization(const Expr *E) {
+ const ConstantArrayType *CAT =
+ Info.Ctx.getAsConstantArrayType(E->getType());
+ if (!CAT) {
+ if (E->getType()->isIncompleteArrayType()) {
+ // We can be asked to zero-initialize a flexible array member;
+ // this is represented as an ImplicitValueInitExpr of
+ // incomplete array type. In this case, the array has zero
+ // elements.
+ Result = APValue(APValue::UninitArray(), 0, 0);
+ return true;
+ }
+ // FIXME: We could handle VLAs here.
+ return Error(E);
+ }
+
+ Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
+ if (!Result.hasArrayFiller())
+ return true;
+
+ // Zero-initialize all elements.
+ LValue Subobject = This;
+ Subobject.addArray(Info, E, CAT);
+ ImplicitValueInitExpr VIE(CAT->getElementType());
+ return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject,
+ &VIE);
+ }
+
+ bool VisitCallExpr(const CallExpr *E) {
+ return handleCallExpr(E, Result, &This);
+ }
+ bool VisitInitListExpr(const InitListExpr *E,
+ QualType AllocType = QualType());
+ bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
+ bool VisitCXXConstructExpr(const CXXConstructExpr *E);
+ bool VisitCXXConstructExpr(const CXXConstructExpr *E,
+ const LValue &Subobject, APValue *Value,
+ QualType Type);
+ bool VisitStringLiteral(const StringLiteral *E,
+ QualType AllocType = QualType()) {
+ expandStringLiteral(Info, E, Result, AllocType);
+ return true;
+ }
+ bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
+ bool VisitCXXParenListOrInitListExpr(
+ const Expr *ExprToVisit, ArrayRef<Expr *> Args,
+ const Expr *ArrayFiller, QualType AllocType = QualType());
+ };
} // end anonymous namespace
static bool EvaluateArray(const Expr *E, const LValue &This,
>From df6242e4b74e8170cd28a2f9663aa974a4b0b12b Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Sat, 13 Sep 2025 20:58:15 +0800
Subject: [PATCH 4/6] adjust unit test #146940
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 162 ++++++++++++-----------
clang/test/CodeGen/X86/avx-builtins.c | 39 ++----
clang/test/CodeGen/X86/avx2-builtins.c | 87 ++++--------
clang/test/CodeGen/X86/mmx-builtins.c | 54 +-------
clang/test/CodeGen/X86/ssse3-builtins.c | 54 +-------
5 files changed, 128 insertions(+), 268 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index f6027c78935c3..9d5d70698b8d3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2739,9 +2739,9 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
}
static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call,
- uint32_t BuiltinID) {
+ const InterpFrame *Frame,
+ const CallExpr *Call,
+ uint32_t BuiltinID) {
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
@@ -2752,7 +2752,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
PrimType ElemT = *S.getContext().classify(VT->getElementType());
unsigned SourceLen = VT->getNumElements();
assert(SourceLen % 2 == 0 &&
- Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+ Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+ SourceLen);
PrimType DstElemT = *S.getContext().classify(
Call->getType()->castAs<VectorType>()->getElementType());
unsigned DstElem = 0;
@@ -2774,16 +2775,17 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
APSInt Elem2;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
Elem1 = LHS.elem<T>(I).toAPSInt();
- Elem2 = LHS.elem<T>(I+1).toAPSInt();
+ Elem2 = LHS.elem<T>(I + 1).toAPSInt();
});
APSInt Result;
if (IsAdd) {
- if (IsSaturating) {
- Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
- }else{
- Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
- }
- }else{
+ if (IsSaturating) {
+ Result =
+ APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ } else {
+ Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+ }
+ } else {
if (IsSaturating) {
Result =
APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
@@ -2812,7 +2814,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
}
} else {
if (IsSaturating) {
- Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+ Result =
+ APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
} else {
Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
}
@@ -2826,15 +2829,15 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
}
static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call,
- uint32_t BuiltinID) {
+ const InterpFrame *Frame,
+ const CallExpr *Call,
+ uint32_t BuiltinID) {
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();
-
+
FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
llvm::RoundingMode RM = getRoundingMode(FPO);
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
@@ -2851,7 +2854,6 @@ static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
for (unsigned I = 0; I != SourceLen; I += 2) {
APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-
if (IsAdd) {
Elem1.add(Elem2, RM);
} else {
@@ -3495,71 +3497,71 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_phaddw128:
case clang::X86::BI__builtin_ia32_phaddw256:
- case clang::X86::BI__builtin_ia32_phaddd128:
- case clang::X86::BI__builtin_ia32_phaddd256:
- case clang::X86::BI__builtin_ia32_phaddsw128:
- case clang::X86::BI__builtin_ia32_phaddsw256:
- case clang::X86::BI__builtin_ia32_phsubw128:
- case clang::X86::BI__builtin_ia32_phsubw256:
- case clang::X86::BI__builtin_ia32_phsubd128:
- case clang::X86::BI__builtin_ia32_phsubd256:
- case clang::X86::BI__builtin_ia32_phsubsw128:
- case clang::X86::BI__builtin_ia32_phsubsw256:
-
- return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
- case clang::X86::BI__builtin_ia32_haddpd:
- case clang::X86::BI__builtin_ia32_haddpd256:
- case clang::X86::BI__builtin_ia32_haddps:
- case clang::X86::BI__builtin_ia32_haddps256:
- case clang::X86::BI__builtin_ia32_hsubpd:
- case clang::X86::BI__builtin_ia32_hsubpd256:
- case clang::X86::BI__builtin_ia32_hsubps:
- case clang::X86::BI__builtin_ia32_hsubps256:
- return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
-
- case clang::X86::BI__builtin_ia32_pmuldq128:
- case clang::X86::BI__builtin_ia32_pmuldq256:
- case clang::X86::BI__builtin_ia32_pmuldq512:
- case clang::X86::BI__builtin_ia32_pmuludq128:
- case clang::X86::BI__builtin_ia32_pmuludq256:
- case clang::X86::BI__builtin_ia32_pmuludq512:
- return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
-
- case Builtin::BI__builtin_elementwise_fma:
- return interp__builtin_elementwise_fma(S, OpPC, Call);
-
- case X86::BI__builtin_ia32_selectb_128:
- case X86::BI__builtin_ia32_selectb_256:
- case X86::BI__builtin_ia32_selectb_512:
- case X86::BI__builtin_ia32_selectw_128:
- case X86::BI__builtin_ia32_selectw_256:
- case X86::BI__builtin_ia32_selectw_512:
- case X86::BI__builtin_ia32_selectd_128:
- case X86::BI__builtin_ia32_selectd_256:
- case X86::BI__builtin_ia32_selectd_512:
- case X86::BI__builtin_ia32_selectq_128:
- case X86::BI__builtin_ia32_selectq_256:
- case X86::BI__builtin_ia32_selectq_512:
- case X86::BI__builtin_ia32_selectph_128:
- case X86::BI__builtin_ia32_selectph_256:
- case X86::BI__builtin_ia32_selectph_512:
- case X86::BI__builtin_ia32_selectpbf_128:
- case X86::BI__builtin_ia32_selectpbf_256:
- case X86::BI__builtin_ia32_selectpbf_512:
- case X86::BI__builtin_ia32_selectps_128:
- case X86::BI__builtin_ia32_selectps_256:
- case X86::BI__builtin_ia32_selectps_512:
- case X86::BI__builtin_ia32_selectpd_128:
- case X86::BI__builtin_ia32_selectpd_256:
- case X86::BI__builtin_ia32_selectpd_512:
- return interp__builtin_select(S, OpPC, Call);
+ case clang::X86::BI__builtin_ia32_phaddd128:
+ case clang::X86::BI__builtin_ia32_phaddd256:
+ case clang::X86::BI__builtin_ia32_phaddsw128:
+ case clang::X86::BI__builtin_ia32_phaddsw256:
+ case clang::X86::BI__builtin_ia32_phsubw128:
+ case clang::X86::BI__builtin_ia32_phsubw256:
+ case clang::X86::BI__builtin_ia32_phsubd128:
+ case clang::X86::BI__builtin_ia32_phsubd256:
+ case clang::X86::BI__builtin_ia32_phsubsw128:
+ case clang::X86::BI__builtin_ia32_phsubsw256:
+ return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+
+ case clang::X86::BI__builtin_ia32_haddpd:
+ case clang::X86::BI__builtin_ia32_haddpd256:
+ case clang::X86::BI__builtin_ia32_haddps:
+ case clang::X86::BI__builtin_ia32_haddps256:
+ case clang::X86::BI__builtin_ia32_hsubpd:
+ case clang::X86::BI__builtin_ia32_hsubpd256:
+ case clang::X86::BI__builtin_ia32_hsubps:
+ case clang::X86::BI__builtin_ia32_hsubps256:
+ return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+
+ case clang::X86::BI__builtin_ia32_pmuldq128:
+ case clang::X86::BI__builtin_ia32_pmuldq256:
+ case clang::X86::BI__builtin_ia32_pmuldq512:
+ case clang::X86::BI__builtin_ia32_pmuludq128:
+ case clang::X86::BI__builtin_ia32_pmuludq256:
+ case clang::X86::BI__builtin_ia32_pmuludq512:
+ return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
+
+ case Builtin::BI__builtin_elementwise_fma:
+ return interp__builtin_elementwise_fma(S, OpPC, Call);
+
+ case X86::BI__builtin_ia32_selectb_128:
+ case X86::BI__builtin_ia32_selectb_256:
+ case X86::BI__builtin_ia32_selectb_512:
+ case X86::BI__builtin_ia32_selectw_128:
+ case X86::BI__builtin_ia32_selectw_256:
+ case X86::BI__builtin_ia32_selectw_512:
+ case X86::BI__builtin_ia32_selectd_128:
+ case X86::BI__builtin_ia32_selectd_256:
+ case X86::BI__builtin_ia32_selectd_512:
+ case X86::BI__builtin_ia32_selectq_128:
+ case X86::BI__builtin_ia32_selectq_256:
+ case X86::BI__builtin_ia32_selectq_512:
+ case X86::BI__builtin_ia32_selectph_128:
+ case X86::BI__builtin_ia32_selectph_256:
+ case X86::BI__builtin_ia32_selectph_512:
+ case X86::BI__builtin_ia32_selectpbf_128:
+ case X86::BI__builtin_ia32_selectpbf_256:
+ case X86::BI__builtin_ia32_selectpbf_512:
+ case X86::BI__builtin_ia32_selectps_128:
+ case X86::BI__builtin_ia32_selectps_256:
+ case X86::BI__builtin_ia32_selectps_512:
+ case X86::BI__builtin_ia32_selectpd_128:
+ case X86::BI__builtin_ia32_selectpd_256:
+ case X86::BI__builtin_ia32_selectpd_512:
+ return interp__builtin_select(S, OpPC, Call);
- default:
- S.FFDiag(S.Current->getLocation(OpPC),
- diag::note_invalid_subexpr_in_const_expr)
- << S.Current->getRange(OpPC);
+ default:
+ S.FFDiag(S.Current->getLocation(OpPC),
+ diag::note_invalid_subexpr_in_const_expr)
+ << S.Current->getRange(OpPC);
- return false;
+ return false;
}
llvm_unreachable("Unhandled builtin ID");
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 9857b84c94112..4e21cfea41553 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1083,53 +1083,34 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_hadd_pd(A, B);
}
-constexpr bool test_mm256_hadd_epi32_constexpr() {
- constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
- constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
- constexpr __m256d result = _mm256_hadd_pd(a, b);
- return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), 3.0, 11.0, 7.0, 15.0));
__m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_hadd_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_hadd_ps(A, B);
}
-constexpr bool test_mm256_hadd_ps_constexpr() {
- constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
- constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
- constexpr __m256 result = _mm256_hadd_ps(a, b);
- return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f,
- 9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f);
-}
-TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr())
+TEST_CONSTEXPR(match_m256(_mm256_hadd_ps(
+    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
+    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}),
+    3.0f, 7.0f, 19.0f, 23.0f, 11.0f, 15.0f, 27.0f, 31.0f));
__m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_hsub_pd
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_hsub_pd(A, B);
}
-constexpr bool test_mm256_hsub_pd_constexpr() {
- constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
- constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
- constexpr __m256d result = _mm256_hsub_pd(a, b);
- return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0);
-}
-TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr())
+TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), -1.0,-1.0,-1.0,-1.0));
__m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_hsub_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_hsub_ps(A, B);
}
-constexpr bool test_mm256_hsub_ps_constexpr() {
- constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
- constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
- constexpr __m256 result = _mm256_hsub_ps(a, b);
- return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f,
- 9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f);
-}
+TEST_CONSTEXPR(match_m256(_mm256_hsub_ps(
+    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
+    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}),
+    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
__m256i test_mm256_insert_epi8(__m256i x, char b) {
// CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index c34594cf78a8e..a9095de4fe373 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -461,99 +461,60 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
// CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hadd_epi16(a, b);
}
-
-constexpr bool test_mm256_hadd_epi16_constexpr() {
- constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7,
- 8,9,10,11,12,13,14,15,16);
- constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
- 24,25,26,27,28,29,30,31,32);
-
- constexpr __m256i result = _mm256_hadd_epi16(a, b);
- return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hadd_epi16(
+    (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+    (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}),
+    3,7,11,15,35,39,43,47,19,23,27,31,51,55,59,63));
__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadd_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_hadd_epi32(a, b);
}
-
-constexpr bool test_mm256_hadd_epi32_constexpr() {
- constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
- constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
-
- constexpr __m256i result = _mm256_hadd_epi32(a, b);
- return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
+    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
+    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+    30, 70, 20, 60, 110, 150, 100, 140));
__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadds_epi16
// CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hadds_epi16(a, b);
}
-constexpr bool test_mm256_hadds_epi16_constexpr() {
- constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
- constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1);
- constexpr __m256i result = _mm256_hadds_epi16(a, b);
-
- return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hadds_epi16(
+    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
+    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767}),
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767));
__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi16
// CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hsub_epi16(a, b);
}
-
-constexpr bool test_mm256_hsub_epi16_constexpr() {
- constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7,
- 8,9,10,11,12,13,14,15,16);
- constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
- 24,25,26,27,28,29,30,31,32);
-
- constexpr __m256i result = _mm256_hsub_epi16(a, b);
- return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32);
-}
-TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16(
+ (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+ (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}),
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_hsub_epi32(a, b);
}
-
-constexpr bool test_mm256_hsub_epi32_constexpr() {
- constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
- constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
-
- constexpr __m256i result = _mm256_hsub_epi32(a, b);
- return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75);
-}
-TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr())
+TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
+    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
+    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+    -10, -10, -10, -10, -10, -10, -10, -10));
__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsubs_epi16
// CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hsubs_epi16(a, b);
}
-
-constexpr bool test_mm256_hsubs_epi16_constexpr() {
- constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
- constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
- constexpr __m256i result3 = _mm256_hsubs_epi16(a, b);
-
- return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr())
-
+TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16(
+    (__m256i)(__v16hi){32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1},
+    (__m256i)(__v16hi){32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1, 32767, -1}),
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767));
__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
// CHECK-LABEL: test_mm_i32gather_epi32
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 70f521e380dd4..944af98ffcadc 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -309,84 +309,42 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
return _mm_hadd_pi16(a, b);
}
-constexpr bool test_mm_hadd_pi16_constexpr() {
- constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
- constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
-
- constexpr __m64 result = _mm_hadd_pi16(a, b);
- return match_v4si(result,1+2,3+4,5+6,7+8);
-}
-TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hadd_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),3,7,11,15));
__m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadd_pi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
return _mm_hadd_pi32(a, b);
}
-constexpr bool test_mm_hadd_pi32_constexpr() {
- constexpr __m64 a = _mm_setr_pi32(1, 2);
- constexpr __m64 b = _mm_setr_pi32(3, 4);
-
- constexpr __m64 result = _mm_hadd_pi32(a, b);
- return match_v2si(result,1+2,3+4);
-}
-TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr())
+TEST_CONSTEXPR(match_v2si(_mm_hadd_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),3,7));
__m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadds_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
return _mm_hadds_pi16(a, b);
}
-constexpr bool test_mm_hadds_pi16_constexpr() {
- constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
- constexpr __m64 b = _mm_setr_pi16(1,1,1,1);
-
- constexpr __m64 result = _mm_hadds_pi16(a, b);
- return match_v4si(result,32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){32767, 32767, 32767, 32767}),32767, 32767, 32767, 32767));
__m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
return _mm_hsub_pi16(a, b);
}
-constexpr bool test_mm_hsub_pi16_constexpr() {
- constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
- constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
-
- constexpr __m64 result = _mm_hsub_pi16(a, b);
- return match_v4si(result,1-2,3-4,5-6,7-8);
-}
-TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),-1,-1,-1,-1));
__m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
return _mm_hsub_pi32(a, b);
}
-constexpr bool test_mm_hsub_pi32_constexpr() {
- constexpr __m64 a = _mm_setr_pi32(1, 2);
- constexpr __m64 b = _mm_setr_pi32(3, 4);
-
- constexpr __m64 result = _mm_hsub_pi32(a, b);
- return match_v2si(result,1-2,3-4);
-}
-TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr())
+TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),-1,-1));
__m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsubs_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
return _mm_hsubs_pi16(a, b);
}
-constexpr bool test_mm_hsubs_pi16_constexpr() {
- constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
- constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1);
-
- constexpr __m64 result = _mm_hsubs_pi16(a, b);
- return match_v4si(result,32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, -1, 32767, -1},(__m64)(__v4hi){32767, -1, 32767, -1}), 32767, 32767, 32767, 32767));
__m64 test_mm_insert_pi16(__m64 a, int d) {
// CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index bd0ef43278217..61c7ee31af96f 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -60,42 +60,21 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hadd_epi16(a, b);
}
-constexpr bool test_mm_hadd_epi16_constexpr() {
- constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24);
-
- constexpr __m128i result = _mm_hadd_epi16(a, b);
- return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24);
-}
-TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hadd_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){17,18,19,20,21,22,23,24}), 3,7,11,15,35,39,43,47));
__m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hadd_epi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_hadd_epi32(a, b);
}
-constexpr bool test_mm_hadd_epi32_constexpr() {
- constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
- constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
-
- constexpr __m128i result = _mm_hadd_epi32(a, b);
- return match_v4si(result,1+2,3+4,5+6,7+8);
-}
-TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_v4si(_mm_hadd_epi32((__m128i)(__v4si){1,2,3,4}, (__m128i)(__v4si){5,6,7,8}), 3,7,11,15));
__m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hadds_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hadds_epi16(a, b);
}
-constexpr bool test_mm_hadds_epi16_constexpr() {
- constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
- constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
- constexpr __m128i result = _mm_hadds_epi16(a, b);
-
- return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}, (__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}), 32767,32767,32767,32767,32767,32767,32767,32767));
__m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
@@ -103,42 +82,21 @@ __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsub_epi16(a, b);
}
-constexpr bool test_mm_hsub_epi16_constexpr() {
- constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16);
-
- constexpr __m128i result = _mm_hsub_epi16(a, b);
- return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16);
-}
-TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){9,10,11,12,13,14,15,16}), -1,-1,-1,-1,-1,-1,-1,-1));
__m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsub_epi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_hsub_epi32(a, b);
}
-constexpr bool test_mm_hsub_epi32_constexpr() {
- constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
- constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
-
- constexpr __m128i result = _mm_hsub_epi32(a, b);
- return match_v4si(result,1-2,3-4,5-6,7-8);
-}
-TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr())
+TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,2,1}, (__m128i)(__v4si){8,7,6,5}), 1,1,1,1));
__m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsubs_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsubs_epi16(a, b);
}
-constexpr bool test_mm_hsubs_epi16_constexpr() {
- constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
- constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
- constexpr __m128i result3 = _mm_hsubs_epi16(a, b);
-
- return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, -1, 32767, -1, 32767, -1, 32767, -1},(__m128i)(__v8hi){32767, -1, 32767, -1, 32767, -1, 32767, -1}), 32767,32767,32767,32767,32767,32767,32767,32767));
__m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_maddubs_epi16
>From f91aa214f83bc2d459d43daa1c1563cf0c86b76a Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Fri, 26 Sep 2025 13:42:41 +0800
Subject: [PATCH 5/6] adjust test case and function
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 145 +++++++++--------------
clang/lib/AST/ExprConstant.cpp | 126 ++++++++++----------
clang/test/CodeGen/X86/avx-builtins.c | 8 +-
clang/test/CodeGen/X86/avx2-builtins.c | 30 ++---
clang/test/CodeGen/X86/mmx-builtins.c | 8 +-
clang/test/CodeGen/X86/ssse3-builtins.c | 8 +-
6 files changed, 143 insertions(+), 182 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index a0ea1404db182..04c62bcc238bb 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -8,7 +8,6 @@
#include "../ExprConstShared.h"
#include "Boolean.h"
#include "EvalEmitter.h"
-#include "Floating.h"
#include "Interp.h"
#include "InterpBuiltinBitCast.h"
#include "PrimType.h"
@@ -20,7 +19,6 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SipHash.h"
-#include <cassert>
namespace clang {
namespace interp {
@@ -2744,100 +2742,56 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call,
- uint32_t BuiltinID) {
+static bool interp_builtin_horizontal_int_binop(
+ InterpState &S, CodePtr OpPC, const CallExpr *Call,
+ llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
+ assert(Call->getNumArgs() == 2);
+
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ assert(VT->getElementType()->isIntegralOrEnumerationType());
+ PrimType ElemT = *S.getContext().classify(VT->getElementType());
+ bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();
- const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
- PrimType ElemT = *S.getContext().classify(VT->getElementType());
unsigned SourceLen = VT->getNumElements();
assert(SourceLen % 2 == 0 &&
Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
SourceLen);
- PrimType DstElemT = *S.getContext().classify(
- Call->getType()->castAs<VectorType>()->getElementType());
unsigned DstElem = 0;
- bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_phaddw128 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddw256 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddd128 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddd256 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256);
-
- bool IsSaturating = (BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phsubsw128 ||
- BuiltinID == clang::X86::BI__builtin_ia32_phsubsw256);
-
for (unsigned I = 0; I != SourceLen; I += 2) {
- APSInt Elem1;
- APSInt Elem2;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
- Elem1 = LHS.elem<T>(I).toAPSInt();
- Elem2 = LHS.elem<T>(I + 1).toAPSInt();
+ APSInt Elem1 = LHS.elem<T>(I).toAPSInt();
+ APSInt Elem2 = LHS.elem<T>(I + 1).toAPSInt();
+ Dst.elem<T>(DstElem) =
+ static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
});
- APSInt Result;
- if (IsAdd) {
- if (IsSaturating) {
- Result =
- APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
- } else {
- Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
- }
- } else {
- if (IsSaturating) {
- Result =
- APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
- } else {
- Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
- }
- }
- INT_TYPE_SWITCH_NO_BOOL(DstElemT,
- { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
++DstElem;
}
for (unsigned I = 0; I != SourceLen; I += 2) {
- APSInt Elem1;
- APSInt Elem2;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
- Elem1 = RHS.elem<T>(I).toAPSInt();
- Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+ APSInt Elem1 = RHS.elem<T>(I).toAPSInt();
+ APSInt Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+ Dst.elem<T>(DstElem) =
+ static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
});
- APSInt Result;
- if (IsAdd) {
- if (IsSaturating) {
- Result =
- APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
- } else {
- Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
- }
- } else {
- if (IsSaturating) {
- Result =
- APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
- } else {
- Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
- }
- }
- INT_TYPE_SWITCH_NO_BOOL(DstElemT,
- { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
++DstElem;
}
Dst.initializeAllElements();
return true;
}
-static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call,
- uint32_t BuiltinID) {
+static bool interp_builtin_horizontal_fp_binop(
+ InterpState &S, CodePtr OpPC, const CallExpr *Call,
+ llvm::function_ref<APFloat(const APFloat &, const APFloat &,
+ llvm::RoundingMode)>
+ Fn) {
+ assert(Call->getNumArgs() == 2);
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
@@ -2852,30 +2806,18 @@ static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
SourceLen);
unsigned DstElem = 0;
- bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_haddpd ||
- BuiltinID == clang::X86::BI__builtin_ia32_haddpd256 ||
- BuiltinID == clang::X86::BI__builtin_ia32_haddps ||
- BuiltinID == clang::X86::BI__builtin_ia32_haddps256);
- using T = Floating;
for (unsigned I = 0; I != SourceLen; I += 2) {
+ using T = PrimConv<PT_Float>::T;
APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
- if (IsAdd) {
- Elem1.add(Elem2, RM);
- } else {
- Elem1.subtract(Elem2, RM);
- }
- Dst.elem<T>(DstElem++) = Elem1;
+ Dst.elem<T>(DstElem++) =
+ static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
}
for (unsigned I = 0; I != SourceLen; I += 2) {
+ using T = PrimConv<PT_Float>::T;
APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
- if (IsAdd) {
- Elem1.add(Elem2, RM);
- } else {
- Elem1.subtract(Elem2, RM);
- }
- Dst.elem<T>(DstElem++) = Elem1;
+ Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
}
Dst.initializeAllElements();
return true;
@@ -3596,25 +3538,48 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_phaddw256:
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
+ return interp_builtin_horizontal_int_binop(
+ S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {return LHS + RHS;});
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:
+ return interp_builtin_horizontal_int_binop(
+ S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
+ return LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS);
+ });
case clang::X86::BI__builtin_ia32_phsubw128:
case clang::X86::BI__builtin_ia32_phsubw256:
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
+ return interp_builtin_horizontal_int_binop(
+ S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:
- return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
-
+ return interp_builtin_horizontal_int_binop(
+ S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
+ return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
+ });
case clang::X86::BI__builtin_ia32_haddpd:
case clang::X86::BI__builtin_ia32_haddpd256:
case clang::X86::BI__builtin_ia32_haddps:
case clang::X86::BI__builtin_ia32_haddps256:
+ return interp_builtin_horizontal_fp_binop(
+ S, OpPC, Call,
+ [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+ APFloat F = LHS;
+ F.add(RHS, RM);
+ return F;
+ });
case clang::X86::BI__builtin_ia32_hsubpd:
case clang::X86::BI__builtin_ia32_hsubpd256:
case clang::X86::BI__builtin_ia32_hsubps:
case clang::X86::BI__builtin_ia32_hsubps256:
- return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+ return interp_builtin_horizontal_fp_binop(
+ S, OpPC, Call,
+ [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+ APFloat F = LHS;
+ F.subtract(RHS, RM);
+ return F;
+ });
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 026894381e778..01e80a22fbb8d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12332,71 +12332,67 @@ bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
namespace {
class ArrayExprEvaluator
- :
- public
- ExprEvaluatorBase<ArrayExprEvaluator> {
- const LValue &This;
- APValue & Result;
-
- public:
- ArrayExprEvaluator(EvalInfo & Info, const LValue &This,
- APValue &Result)
- : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
-
- bool Success(const APValue &V, const Expr *E) {
- assert(V.isArray() && "expected array");
- Result = V;
- return true;
- }
-
- bool ZeroInitialization(const Expr *E) {
- const ConstantArrayType *CAT =
- Info.Ctx.getAsConstantArrayType(E->getType());
- if (!CAT) {
- if (E->getType()->isIncompleteArrayType()) {
- // We can be asked to zero-initialize a flexible array member;
- // this is represented as an ImplicitValueInitExpr of
- // incomplete array type. In this case, the array has zero
- // elements.
- Result = APValue(APValue::UninitArray(), 0, 0);
- return true;
- }
- // FIXME: We could handle VLAs here.
- return Error(E);
- }
-
- Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
- if (!Result.hasArrayFiller())
- return true;
-
- // Zero-initialize all elements.
- LValue Subobject = This;
- Subobject.addArray(Info, E, CAT);
- ImplicitValueInitExpr VIE(CAT->getElementType());
- return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject,
- &VIE);
- }
-
- bool VisitCallExpr(const CallExpr *E) {
- return handleCallExpr(E, Result, &This);
- }
- bool VisitInitListExpr(const InitListExpr *E,
- QualType AllocType = QualType());
- bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
- bool VisitCXXConstructExpr(const CXXConstructExpr *E);
- bool VisitCXXConstructExpr(const CXXConstructExpr *E,
- const LValue &Subobject, APValue *Value,
- QualType Type);
- bool VisitStringLiteral(const StringLiteral *E,
- QualType AllocType = QualType()) {
- expandStringLiteral(Info, E, Result, AllocType);
- return true;
- }
- bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
- bool VisitCXXParenListOrInitListExpr(
- const Expr *ExprToVisit, ArrayRef<Expr *> Args,
- const Expr *ArrayFiller, QualType AllocType = QualType());
- };
+ : public ExprEvaluatorBase<ArrayExprEvaluator> {
+ const LValue &This;
+ APValue &Result;
+ public:
+
+ ArrayExprEvaluator(EvalInfo &Info, const LValue &This, APValue &Result)
+ : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
+
+ bool Success(const APValue &V, const Expr *E) {
+ assert(V.isArray() && "expected array");
+ Result = V;
+ return true;
+ }
+
+ bool ZeroInitialization(const Expr *E) {
+ const ConstantArrayType *CAT =
+ Info.Ctx.getAsConstantArrayType(E->getType());
+ if (!CAT) {
+ if (E->getType()->isIncompleteArrayType()) {
+ // We can be asked to zero-initialize a flexible array member; this
+ // is represented as an ImplicitValueInitExpr of incomplete array
+ // type. In this case, the array has zero elements.
+ Result = APValue(APValue::UninitArray(), 0, 0);
+ return true;
+ }
+ // FIXME: We could handle VLAs here.
+ return Error(E);
+ }
+
+ Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
+ if (!Result.hasArrayFiller())
+ return true;
+
+ // Zero-initialize all elements.
+ LValue Subobject = This;
+ Subobject.addArray(Info, E, CAT);
+ ImplicitValueInitExpr VIE(CAT->getElementType());
+ return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, &VIE);
+ }
+
+ bool VisitCallExpr(const CallExpr *E) {
+ return handleCallExpr(E, Result, &This);
+ }
+ bool VisitInitListExpr(const InitListExpr *E,
+ QualType AllocType = QualType());
+ bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
+ bool VisitCXXConstructExpr(const CXXConstructExpr *E);
+ bool VisitCXXConstructExpr(const CXXConstructExpr *E,
+ const LValue &Subobject,
+ APValue *Value, QualType Type);
+ bool VisitStringLiteral(const StringLiteral *E,
+ QualType AllocType = QualType()) {
+ expandStringLiteral(Info, E, Result, AllocType);
+ return true;
+ }
+ bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
+ bool VisitCXXParenListOrInitListExpr(const Expr *ExprToVisit,
+ ArrayRef<Expr *> Args,
+ const Expr *ArrayFiller,
+ QualType AllocType = QualType());
+ };
} // end anonymous namespace
static bool EvaluateArray(const Expr *E, const LValue &This,
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 3c9e184b8ba3f..0ff5a83505607 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1102,7 +1102,7 @@ __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_hsub_pd(A, B);
}
-TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), -1.0,-1.0,-1.0,-1.0));
+TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 4.0, 3.0}, (__m256d){10.0, 6.0, 16.0, 8.0}), -1.0,4.0,1.0,8.0));
__m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_hsub_ps
@@ -1110,9 +1110,9 @@ __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
return _mm256_hsub_ps(A, B);
}
-TEST_CONSTEXPR(_mm256_hsub_ps(
- (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
- (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f},
- -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))
+TEST_CONSTEXPR(match_m256(_mm256_hsub_ps(
+ (__m256){1.0f, 2.0f, 4.0f, 3.0f, 5.0f, 7.0f, 7.0f, 5.0f},
+ (__m256){9.0f, 6.0f, 11.0f, 8.0f, 13.0f, 17.0f, 15.0f, 11.0f}),
+ -1.0f, 1.0f, 3.0f, 3.0f, -2.0f, 2.0f, -4.0f, 4.0f));
__m256i test_mm256_insert_epi8(__m256i x, char b) {
// CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index a993e2546b165..8837a8c0e52fa 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -476,8 +476,8 @@ __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
}
TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
(__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
- (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75})
- 30,70,110,150,20,60,100,140))
+ (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+ 30,70,20,60,110,150,100,140));
__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hadds_epi16
@@ -485,9 +485,9 @@ __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
return _mm256_hadds_epi16(a, b);
}
TEST_CONSTEXPR(match_v16hi( _mm256_hadds_epi16(
- (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
- (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767)))
+ (__m256i)(__v16hi){32767, 32767, 1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+ (__m256i)(__v16hi){19,20,21,22,23,24,25,26,27,28,29,30,31,32, 32767, 5}),
+ 32767, 3,7,11, 39,43,47,51, 15,19,23,27, 55,59,63, 32767));
__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi16
@@ -495,9 +495,9 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
return _mm256_hsub_epi16(a, b);
}
TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16(
- (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
- (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}),
- -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
+ (__m256i)(__v16hi){2,1,1,2,5,3,3,5,7,4,4,7,9,5,5,9},
+ (__m256i)(__v16hi){10,5,5,10,12,6,6,12,21,14,14,21,24,16,16,24}),
+ 1,-1,2,-2,5,-5,6,-6,3,-3,4,-4,7,-7,8,-8));
__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsub_epi32
@@ -505,19 +505,19 @@ __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
return _mm256_hsub_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
- (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
- (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75})
- -10,-10,-10,-10,-10,-10,-10,-10))
+ (__m256i)(__v8si){10, 20, 30,50,60,90,100,140},
+ (__m256i)(__v8si){200,150,260,200,420,350,800,740}),
+ -10,-20,50,60,-30,-40,70,60));
__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_hsubs_epi16
// CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_hsubs_epi16(a, b);
}
-TEST_CONSTEXPR(match_v16hi( _mm256_hsubs_epi16(
- (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
- (__m256i)(__v16hi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
- 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767)))
+TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16(
+ (__m256i)(__v16hi){32726, -100, 3, 2, 6, 4, 8, 5,15,10 ,21, 14, 27, 18, 100, 90},
+ (__m256i)(__v16hi){40, 20, 100, 70, 200,150, 100,40, 1000,900,300,150, 500,300, 1, 1}),
+ 32767, 1, 2, 3, 20, 30, 50, 60, 5, 7, 9, 10, 100, 150, 200, 0));
__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
// CHECK-LABEL: test_mm_i32gather_epi32
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index b4c86b29e5260..7c1709f81f665 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -325,28 +325,28 @@ __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
return _mm_hadds_pi16(a, b);
}
-TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){32767, 32767, 32767, 32767}),32767, 32767, 32767, 32767));
+TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 1,3},(__m64)(__v4hi){-1,3, 40, 60}),32767, 4, 2,100));
__m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
return _mm_hsub_pi16(a, b);
}
-TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),-1,-1,-1,-1));
+TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,4,3},(__m64)(__v4hi){10,5,0,-10}),-1,1,5,10));
__m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
return _mm_hsub_pi32(a, b);
}
-TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),-1,-1));
+TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){4,3}),-1,1));
__m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsubs_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
return _mm_hsubs_pi16(a, b);
}
-TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){-4,-5,-6,-7}),32767, 32767, 32767, 32767));
+TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 5, -32767},(__m64)(__v4hi){4,5,10,5}),0,32767,-1,5));
__m64 test_mm_insert_pi16(__m64 a, int d) {
// CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 61c7ee31af96f..d5b64df3f57a9 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -74,7 +74,7 @@ __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hadds_epi16(a, b);
}
-TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}, (__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}), 32767,32767,32767,32767,32767,32767,32767,32767));
+TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,-1,2,-3,3,1,4}, (__m128i)(__v8hi){2,6,1,9,-4,16,7,8}), 32767, 1,0,5,8,10,12,15));
__m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
@@ -82,21 +82,21 @@ __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsub_epi16(a, b);
}
-TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){9,10,11,12,13,14,15,16}), -1,-1,-1,-1,-1,-1,-1,-1));
+TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){20,15,16,12,9,6,4,2}, (__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 5,4,3,2,1,0,-1,-2));
__m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsub_epi32
// CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_hsub_epi32(a, b);
}
-TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,2,1}, (__m128i)(__v4si){8,7,6,5}), 1,1,1,1))
+TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,1,1}, (__m128i)(__v4si){7,5,10,5}), 1,0,2,5))
__m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_hsubs_epi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_hsubs_epi16(a, b);
}
-TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},(__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}), 32767,32767,32767,32767,32767,32767,32767,32767));
+TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, -15,16,12,9,6,4,2},(__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 32767,4,3,2,1,0,-1,-2));
__m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_maddubs_epi16
>From 4f5fb878426471f92c3b488495ae73bd87897690 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Fri, 26 Sep 2025 13:48:02 +0800
Subject: [PATCH 6/6] undo the unintentional formatting of the code
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 04c62bcc238bb..5a097e5a4f7a0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2746,7 +2746,7 @@ static bool interp_builtin_horizontal_int_binop(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
assert(Call->getNumArgs() == 2);
-
+
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
@@ -2810,8 +2810,7 @@ static bool interp_builtin_horizontal_fp_binop(
using T = PrimConv<PT_Float>::T;
APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
- Dst.elem<T>(DstElem++) =
- static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
+ Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
}
for (unsigned I = 0; I != SourceLen; I += 2) {
using T = PrimConv<PT_Float>::T;
@@ -3539,7 +3538,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
return interp_builtin_horizontal_int_binop(
- S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {return LHS + RHS;});
+ S, OpPC, Call,
+ [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; });
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:
return interp_builtin_horizontal_int_binop(
@@ -3551,7 +3551,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
return interp_builtin_horizontal_int_binop(
- S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
+ S, OpPC, Call,
+ [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:
return interp_builtin_horizontal_int_binop(
More information about the cfe-commits
mailing list