[clang] [X86] Add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr (PR #156822)

via cfe-commits cfe-commits at lists.llvm.org
Mon Oct 6 23:04:56 PDT 2025


https://github.com/whytolearn updated https://github.com/llvm/llvm-project/pull/156822

>From a81c4068096b960de65c3517f18d2d31004afbce Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 4 Sep 2025 15:52:57 +0800
Subject: [PATCH 01/12] Address issue 155395
 VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
 PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395

---
 clang/lib/Headers/avx2intrin.h          | 27 ++++-----
 clang/lib/Headers/avxintrin.h           | 11 ++--
 clang/lib/Headers/pmmintrin.h           | 20 +++----
 clang/lib/Headers/tmmintrin.h           | 80 +++++++++++--------------
 clang/test/CodeGen/X86/avx-builtins.c   | 29 +++++++++
 clang/test/CodeGen/X86/avx2-builtins.c  | 63 +++++++++++++++++++
 clang/test/CodeGen/X86/mmx-builtins.c   | 48 +++++++++++++++
 clang/test/CodeGen/X86/ssse3-builtins.c | 49 +++++++++++++++
 8 files changed, 250 insertions(+), 77 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 384faa35d246f..f8fb808f7f29c 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -854,10 +854,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadd_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
@@ -886,7 +885,7 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
@@ -921,10 +920,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadds_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -957,10 +955,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hsub_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
@@ -989,7 +986,7 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
@@ -1025,7 +1022,7 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 26096da949447..976710a64e80e 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -703,7 +703,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
 ///    elements of a vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
 ///    both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hadd_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
@@ -726,9 +726,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hadd_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 
+_mm256_hadd_ps(__m256 __a, __m256 __b) {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
@@ -749,7 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b)
 ///    odd-indexed elements of a vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the horizontal
 ///    differences of both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
@@ -772,7 +771,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
 ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal
 ///    differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index cd605df7fb52d..400b28bb877a1 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,9 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hadd_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 
+_mm_hadd_ps(__m128 __a, __m128 __b) {
   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
 
@@ -174,9 +173,8 @@ _mm_moveldup_ps(__m128 __a)
 ///    A 128-bit vector of [2 x double] containing the right source operand.
 /// \returns A 128-bit vector of [2 x double] containing the alternating sums
 ///    and differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_addsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_addsub_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 }
 
@@ -197,9 +195,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
 ///    destination.
 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hadd_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 }
 
@@ -220,9 +217,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
 ///    the destination.
 /// \returns A 128-bit vector of [2 x double] containing the horizontal
 ///    differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 }
 
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index f01c61afa8ea2..d79f7f6ea4091 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -204,10 +204,10 @@ _mm_abs_epi32(__m128i __a) {
 ///    destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddw128(
+      (__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -227,10 +227,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi32(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -250,11 +249,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+ _mm_hadd_pi16(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_phaddw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -274,7 +272,7 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phaddd128(
@@ -301,10 +299,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadds_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadds_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -327,7 +324,7 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phaddsw128(
@@ -351,10 +348,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -374,10 +370,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi32(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -397,7 +392,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubw128(
@@ -421,7 +416,7 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubd128(
@@ -448,10 +443,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsubs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -474,7 +468,7 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubsw128(
@@ -509,10 +503,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -539,11 +532,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
-                                                 (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_pi16(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                               (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -560,7 +552,7 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b)
 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 4a048744faa61..f381faebededf 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1083,24 +1083,53 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hadd_pd(A, B);
 }
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+    constexpr __m256d result = _mm256_hadd_pd(a, b);
+    return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
 
 __m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hadd_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hadd_ps(A, B);
 }
+constexpr bool test_mm256_hadd_ps_constexpr() {
+    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
+    constexpr __m256 result = _mm256_hadd_ps(a, b);
+    return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f,
+                             9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f);
+}
+TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr())
 
 __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_hsub_pd
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
+constexpr bool test_mm256_hsub_pd_constexpr() {
+    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+    constexpr __m256d result = _mm256_hsub_pd(a, b);
+    return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0);
+}
+TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr())
 
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hsub_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hsub_ps(A, B);
 }
+constexpr bool test_mm256_hsub_ps_constexpr() {
+    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
+    constexpr __m256 result = _mm256_hsub_ps(a, b);
+    return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f,
+                             9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f);
+}
 
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
   // CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index a39ce513837ea..02845b9417a1f 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -462,17 +462,48 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
   return _mm256_hadd_epi16(a, b);
 }
 
+constexpr bool test_mm256_hadd_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
+        8,9,10,11,12,13,14,15,16);
+    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+        24,25,26,27,28,29,30,31,32);
+    
+    constexpr __m256i result = _mm256_hadd_epi16(a, b);
+    return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr())
+
 __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hadd_epi32(a, b);
 }
 
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+    
+    constexpr __m256i result = _mm256_hadd_epi32(a, b);
+    return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadds_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hadds_epi16(a, b);
 }
+constexpr bool test_mm256_hadds_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 
+        1, 1, 1, 1, 1, 1, 1);
+    constexpr __m256i result = _mm256_hadds_epi16(a, b);
+
+    return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr())
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi16
@@ -480,18 +511,50 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   return _mm256_hsub_epi16(a, b);
 }
 
+constexpr bool test_mm256_hsub_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
+        8,9,10,11,12,13,14,15,16);
+    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+        24,25,26,27,28,29,30,31,32);
+    
+    constexpr __m256i result = _mm256_hsub_epi16(a, b);
+    return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr())
+
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hsub_epi32(a, b);
 }
 
+constexpr bool test_mm256_hsub_epi32_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+    
+    constexpr __m256i result = _mm256_hsub_epi32(a, b);
+    return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr())
+
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsubs_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsubs_epi16(a, b);
 }
 
+constexpr bool test_mm256_hsubs_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    constexpr __m256i result3 = _mm256_hsubs_epi16(a, b);
+
+    return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr())
+
+
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
   // CHECK-LABEL: test_mm_i32gather_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 7bd2475399bf9..8da0e8c814879 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -309,36 +309,84 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
+constexpr bool test_mm_hadd_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+    
+    constexpr __m64 result = _mm_hadd_pi16(a, b);
+    return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr())
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
+constexpr bool test_mm_hadd_pi32_constexpr() {
+    constexpr __m64 a = _mm_setr_pi32(1, 2);
+    constexpr __m64 b = _mm_setr_pi32(3, 4);
+    
+    constexpr __m64 result = _mm_hadd_pi32(a, b);
+    return match_v2si(result,1+2,3+4);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr())
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
+constexpr bool test_mm_hadds_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+    constexpr __m64 b = _mm_setr_pi16(1,1,1,1);
+    
+    constexpr __m64 result = _mm_hadds_pi16(a, b);
+    return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr())
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
+constexpr bool test_mm_hsub_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+    
+    constexpr __m64 result = _mm_hsub_pi16(a, b);
+    return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr())
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
+constexpr bool test_mm_hsub_pi32_constexpr() {
+    constexpr __m64 a = _mm_setr_pi32(1, 2);
+    constexpr __m64 b = _mm_setr_pi32(3, 4);
+    
+    constexpr __m64 result = _mm_hsub_pi32(a, b);
+    return match_v2si(result,1-2,3-4);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr())
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
+constexpr bool test_mm_hsubs_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+    constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1);
+    
+    constexpr __m64 result = _mm_hsubs_pi16(a, b);
+    return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr())
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 56ff73f08ab32..bd0ef43278217 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -60,36 +60,85 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadd_epi16(a, b);
 }
+constexpr bool test_mm_hadd_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24);
+    
+    constexpr __m128i result = _mm_hadd_epi16(a, b);
+    return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr())
 
 __m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hadd_epi32(a, b);
 }
+constexpr bool test_mm_hadd_epi32_constexpr() {
+    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+    
+    constexpr __m128i result = _mm_hadd_epi32(a, b);
+    return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr())
 
 __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadds_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
+constexpr bool test_mm_hadds_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
+    constexpr __m128i result = _mm_hadds_epi16(a, b);
+
+    return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr())
+
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
+constexpr bool test_mm_hsub_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16);
+    
+    constexpr __m128i result = _mm_hsub_epi16(a, b);
+    return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr())
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
+constexpr bool test_mm_hsub_epi32_constexpr() {
+    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+    
+    constexpr __m128i result = _mm_hsub_epi32(a, b);
+    return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr())
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
+constexpr bool test_mm_hsubs_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
+    constexpr __m128i result3 = _mm_hsubs_epi16(a, b);
+
+    return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr())
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16

>From 2fadf3fd261935e25adff5b26ad8ee0734746a26 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 4 Sep 2025 15:55:44 +0800
Subject: [PATCH 02/12] Address issue 155395 [Clang]
 VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
 PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395

---
 clang/lib/Headers/avx2intrin.h | 15 ++++-----
 clang/lib/Headers/avxintrin.h  | 15 ++++-----
 clang/lib/Headers/pmmintrin.h  |  4 +--
 clang/lib/Headers/tmmintrin.h  | 57 +++++++++++++++-------------------
 4 files changed, 39 insertions(+), 52 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index f8fb808f7f29c..c39f94c7fc16b 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -886,9 +886,8 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+_mm256_hadd_epi32(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
@@ -987,9 +986,8 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+_mm256_hsub_epi32(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -1023,9 +1021,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 976710a64e80e..48d79063f9b61 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -704,8 +704,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
 ///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hadd_pd(__m256d __a, __m256d __b)
-{
+_mm256_hadd_pd(__m256d __a, __m256d __b) {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
 
@@ -726,8 +725,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 
-_mm256_hadd_ps(__m256 __a, __m256 __b) {
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
+                                                                   __m256 __b) {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
@@ -749,8 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) {
 /// \returns A 256-bit vector of [4 x double] containing the horizontal
 ///    differences of both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_pd(__m256d __a, __m256d __b)
-{
+_mm256_hsub_pd(__m256d __a, __m256d __b) {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
@@ -771,9 +769,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
 ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal
 ///    differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
+                                                                   __m256 __b) {
   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 400b28bb877a1..67f2a7ffd1f56 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,8 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 
-_mm_hadd_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a,
+                                                                  __m128 __b) {
   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
 
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index d79f7f6ea4091..b408c6a3404ec 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -206,8 +206,7 @@ _mm_abs_epi32(__m128i __a) {
 ///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadd_epi16(__m128i __a, __m128i __b) {
-  return (__m128i)__builtin_ia32_phaddw128(
-      (__v8hi)__a, (__v8hi)__b);
+  return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -249,8 +248,8 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) {
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
- _mm_hadd_pi16(__m64 __a, __m64 __b) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a,
+                                                                   __m64 __b) {
   return __trunc64(__builtin_ia32_phaddw128(
       (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
@@ -272,11 +271,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddd128(
-        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phaddd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -324,11 +322,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) {
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddsw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a,
+                                                                    __m64 __b) {
+  return __trunc64(__builtin_ia32_phaddsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -392,11 +389,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) {
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phsubw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -416,11 +412,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubd128(
-        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phsubd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -468,11 +463,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) {
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubsw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a,
+                                                                    __m64 __b) {
+  return __trunc64(__builtin_ia32_phsubsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -553,9 +547,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) {
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+_mm_mulhrs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit

>From ed4a09fb51ab347b4778b81d1f8c511d31d106a7 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 11 Sep 2025 13:48:13 +0800
Subject: [PATCH 03/12] constexpr deal

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 243 +++++++++++++++++----
 clang/lib/AST/ExprConstant.cpp           | 266 +++++++++++++++++------
 2 files changed, 407 insertions(+), 102 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8c2b71160f7f3..f6027c78935c3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -8,6 +8,7 @@
 #include "../ExprConstShared.h"
 #include "Boolean.h"
 #include "EvalEmitter.h"
+#include "Floating.h"
 #include "Interp.h"
 #include "InterpBuiltinBitCast.h"
 #include "PrimType.h"
@@ -19,6 +20,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SipHash.h"
+#include <cassert>
 
 namespace clang {
 namespace interp {
@@ -2736,6 +2738,141 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
+                                        const InterpFrame *Frame,
+                                        const CallExpr *Call,
+                                        uint32_t BuiltinID) {
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  PrimType ElemT = *S.getContext().classify(VT->getElementType());
+  unsigned SourceLen = VT->getNumElements();
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+  PrimType DstElemT = *S.getContext().classify(
+      Call->getType()->castAs<VectorType>()->getElementType());
+  unsigned DstElem = 0;
+
+  bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_phaddw128 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_phaddw256 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_phaddd128 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_phaddd256 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256);
+
+  bool IsSaturating = (BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
+                       BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256 ||
+                       BuiltinID == clang::X86::BI__builtin_ia32_phsubsw128 ||
+                       BuiltinID == clang::X86::BI__builtin_ia32_phsubsw256);
+
+  for (unsigned I = 0; I != SourceLen; I += 2) {
+    APSInt Elem1;
+    APSInt Elem2;
+    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+      Elem1 = LHS.elem<T>(I).toAPSInt();
+      Elem2 = LHS.elem<T>(I+1).toAPSInt();
+    });
+    APSInt Result;
+    if (IsAdd) {
+        if (IsSaturating) {
+          Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+        }else{
+          Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+        }
+    }else{
+      if (IsSaturating) {
+        Result =
+            APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+      } else {
+        Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+      }
+    }
+    INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+                            { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
+    ++DstElem;
+  }
+  for (unsigned I = 0; I != SourceLen; I += 2) {
+    APSInt Elem1;
+    APSInt Elem2;
+    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+      Elem1 = RHS.elem<T>(I).toAPSInt();
+      Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+    });
+    APSInt Result;
+    if (IsAdd) {
+      if (IsSaturating) {
+        Result =
+            APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+      } else {
+        Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+      }
+    } else {
+      if (IsSaturating) {
+        Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+      } else {
+        Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+      }
+    }
+    INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+                            { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
+    ++DstElem;
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
+static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
+                                          const InterpFrame *Frame,
+                                          const CallExpr *Call,
+                                          uint32_t BuiltinID) {
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+  
+  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+  llvm::RoundingMode RM = getRoundingMode(FPO);
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  unsigned SourceLen = VT->getNumElements();
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
+  unsigned DstElem = 0;
+  bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_haddpd ||
+                BuiltinID == clang::X86::BI__builtin_ia32_haddpd256 ||
+                BuiltinID == clang::X86::BI__builtin_ia32_haddps ||
+                BuiltinID == clang::X86::BI__builtin_ia32_haddps256);
+  using T = Floating;
+  for (unsigned I = 0; I != SourceLen; I += 2) {
+    APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
+    APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
+
+    if (IsAdd) {
+      Elem1.add(Elem2, RM);
+    } else {
+      Elem1.subtract(Elem2, RM);
+    }
+    Dst.elem<T>(DstElem++) = Elem1;
+  }
+  for (unsigned I = 0; I != SourceLen; I += 2) {
+    APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
+    APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
+    if (IsAdd) {
+      Elem1.add(Elem2, RM);
+    } else {
+      Elem1.subtract(Elem2, RM);
+    }
+    Dst.elem<T>(DstElem++) = Elem1;
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp__builtin_elementwise_fma(InterpState &S, CodePtr OpPC,
                                             const CallExpr *Call) {
   assert(Call->getNumArgs() == 3);
@@ -3356,49 +3493,73 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case Builtin::BI__builtin_elementwise_min:
     return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID);
 
-  case clang::X86::BI__builtin_ia32_pmuldq128:
-  case clang::X86::BI__builtin_ia32_pmuldq256:
-  case clang::X86::BI__builtin_ia32_pmuldq512:
-  case clang::X86::BI__builtin_ia32_pmuludq128:
-  case clang::X86::BI__builtin_ia32_pmuludq256:
-  case clang::X86::BI__builtin_ia32_pmuludq512:
-    return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
-
-  case Builtin::BI__builtin_elementwise_fma:
-    return interp__builtin_elementwise_fma(S, OpPC, Call);
-
-  case X86::BI__builtin_ia32_selectb_128:
-  case X86::BI__builtin_ia32_selectb_256:
-  case X86::BI__builtin_ia32_selectb_512:
-  case X86::BI__builtin_ia32_selectw_128:
-  case X86::BI__builtin_ia32_selectw_256:
-  case X86::BI__builtin_ia32_selectw_512:
-  case X86::BI__builtin_ia32_selectd_128:
-  case X86::BI__builtin_ia32_selectd_256:
-  case X86::BI__builtin_ia32_selectd_512:
-  case X86::BI__builtin_ia32_selectq_128:
-  case X86::BI__builtin_ia32_selectq_256:
-  case X86::BI__builtin_ia32_selectq_512:
-  case X86::BI__builtin_ia32_selectph_128:
-  case X86::BI__builtin_ia32_selectph_256:
-  case X86::BI__builtin_ia32_selectph_512:
-  case X86::BI__builtin_ia32_selectpbf_128:
-  case X86::BI__builtin_ia32_selectpbf_256:
-  case X86::BI__builtin_ia32_selectpbf_512:
-  case X86::BI__builtin_ia32_selectps_128:
-  case X86::BI__builtin_ia32_selectps_256:
-  case X86::BI__builtin_ia32_selectps_512:
-  case X86::BI__builtin_ia32_selectpd_128:
-  case X86::BI__builtin_ia32_selectpd_256:
-  case X86::BI__builtin_ia32_selectpd_512:
-    return interp__builtin_select(S, OpPC, Call);
+  case clang::X86::BI__builtin_ia32_phaddw128:
+  case clang::X86::BI__builtin_ia32_phaddw256:
+   case clang::X86::BI__builtin_ia32_phaddd128: 
+   case clang::X86::BI__builtin_ia32_phaddd256: 
+   case clang::X86::BI__builtin_ia32_phaddsw128: 
+   case clang::X86::BI__builtin_ia32_phaddsw256:
+    case clang::X86::BI__builtin_ia32_phsubw128:
+    case clang::X86::BI__builtin_ia32_phsubw256:
+    case clang::X86::BI__builtin_ia32_phsubd128:
+    case clang::X86::BI__builtin_ia32_phsubd256:
+    case clang::X86::BI__builtin_ia32_phsubsw128:
+    case clang::X86::BI__builtin_ia32_phsubsw256:
+    
+      return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+    case clang::X86::BI__builtin_ia32_haddpd:
+    case clang::X86::BI__builtin_ia32_haddpd256:
+    case clang::X86::BI__builtin_ia32_haddps:
+    case clang::X86::BI__builtin_ia32_haddps256:
+    case clang::X86::BI__builtin_ia32_hsubpd:
+    case clang::X86::BI__builtin_ia32_hsubpd256:
+    case clang::X86::BI__builtin_ia32_hsubps:
+    case clang::X86::BI__builtin_ia32_hsubps256:
+        return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
 
-  default:
-    S.FFDiag(S.Current->getLocation(OpPC),
-             diag::note_invalid_subexpr_in_const_expr)
-        << S.Current->getRange(OpPC);
+    case clang::X86::BI__builtin_ia32_pmuldq128:
+    case clang::X86::BI__builtin_ia32_pmuldq256:
+    case clang::X86::BI__builtin_ia32_pmuldq512:
+    case clang::X86::BI__builtin_ia32_pmuludq128:
+    case clang::X86::BI__builtin_ia32_pmuludq256:
+    case clang::X86::BI__builtin_ia32_pmuludq512:
+      return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
+
+    case Builtin::BI__builtin_elementwise_fma:
+      return interp__builtin_elementwise_fma(S, OpPC, Call);
+
+    case X86::BI__builtin_ia32_selectb_128:
+    case X86::BI__builtin_ia32_selectb_256:
+    case X86::BI__builtin_ia32_selectb_512:
+    case X86::BI__builtin_ia32_selectw_128:
+    case X86::BI__builtin_ia32_selectw_256:
+    case X86::BI__builtin_ia32_selectw_512:
+    case X86::BI__builtin_ia32_selectd_128:
+    case X86::BI__builtin_ia32_selectd_256:
+    case X86::BI__builtin_ia32_selectd_512:
+    case X86::BI__builtin_ia32_selectq_128:
+    case X86::BI__builtin_ia32_selectq_256:
+    case X86::BI__builtin_ia32_selectq_512:
+    case X86::BI__builtin_ia32_selectph_128:
+    case X86::BI__builtin_ia32_selectph_256:
+    case X86::BI__builtin_ia32_selectph_512:
+    case X86::BI__builtin_ia32_selectpbf_128:
+    case X86::BI__builtin_ia32_selectpbf_256:
+    case X86::BI__builtin_ia32_selectpbf_512:
+    case X86::BI__builtin_ia32_selectps_128:
+    case X86::BI__builtin_ia32_selectps_256:
+    case X86::BI__builtin_ia32_selectps_512:
+    case X86::BI__builtin_ia32_selectpd_128:
+    case X86::BI__builtin_ia32_selectpd_256:
+    case X86::BI__builtin_ia32_selectpd_512:
+      return interp__builtin_select(S, OpPC, Call);
 
-    return false;
+    default:
+      S.FFDiag(S.Current->getLocation(OpPC),
+               diag::note_invalid_subexpr_in_const_expr)
+          << S.Current->getRange(OpPC);
+
+      return false;
   }
 
   llvm_unreachable("Unhandled builtin ID");
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 66362d44976c4..774a3adf1a7ca 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -55,6 +55,7 @@
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFixedPoint.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -12105,6 +12106,145 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     }
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+
+  case clang::X86::BI__builtin_ia32_phaddw128:
+  case clang::X86::BI__builtin_ia32_phaddw256:
+  case clang::X86::BI__builtin_ia32_phaddd128:
+  case clang::X86::BI__builtin_ia32_phaddd256:
+  case clang::X86::BI__builtin_ia32_phaddsw128:
+  case clang::X86::BI__builtin_ia32_phaddsw256:
+
+  case clang::X86::BI__builtin_ia32_phsubw128:
+  case clang::X86::BI__builtin_ia32_phsubw256:
+  case clang::X86::BI__builtin_ia32_phsubd128:
+  case clang::X86::BI__builtin_ia32_phsubd256:
+  case clang::X86::BI__builtin_ia32_phsubsw128:
+  case clang::X86::BI__builtin_ia32_phsubsw256:{
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+    bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+
+      unsigned SourceLen = SourceLHS.getVectorLength();
+      SmallVector<APValue, 4> ResultElements;
+      ResultElements.reserve(SourceLen);
+      for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+        APSInt LHSA = SourceLHS.getVectorElt(EltNum).getInt();
+        APSInt LHSB = SourceLHS.getVectorElt(EltNum + 1).getInt();
+
+        switch (E->getBuiltinCallee()) {
+        case clang::X86::BI__builtin_ia32_phaddw128:
+        case clang::X86::BI__builtin_ia32_phaddw256:
+        case clang::X86::BI__builtin_ia32_phaddd128:
+        case clang::X86::BI__builtin_ia32_phaddd256:
+        ResultElements.push_back(
+            APValue(APSInt(LHSA+LHSB, DestUnsigned)));
+        break;
+        case clang::X86::BI__builtin_ia32_phaddsw128:
+        case clang::X86::BI__builtin_ia32_phaddsw256:
+          ResultElements.push_back(APValue(APSInt(
+              LHSA.isSigned() ? LHSA.sadd_sat(LHSB) : LHSA.uadd_sat(LHSB),
+              DestUnsigned)));
+          break;
+        case clang::X86::BI__builtin_ia32_phsubw128:
+        case clang::X86::BI__builtin_ia32_phsubw256:
+        case clang::X86::BI__builtin_ia32_phsubd128:
+        case clang::X86::BI__builtin_ia32_phsubd256:
+          ResultElements.push_back(APValue(APSInt(LHSA - LHSB, DestUnsigned)));
+          break;
+        case clang::X86::BI__builtin_ia32_phsubsw128:
+        case clang::X86::BI__builtin_ia32_phsubsw256:
+          ResultElements.push_back(APValue(APSInt(
+              LHSA.isSigned() ? LHSA.ssub_sat(LHSB) : LHSA.usub_sat(LHSB),
+              DestUnsigned)));
+          break;
+      }
+    }
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APSInt RHSA = SourceRHS.getVectorElt(EltNum).getInt();
+      APSInt RHSB = SourceRHS.getVectorElt(EltNum + 1).getInt();
+
+      switch (E->getBuiltinCallee()) {
+      case clang::X86::BI__builtin_ia32_phaddw128:
+      case clang::X86::BI__builtin_ia32_phaddw256:
+      case clang::X86::BI__builtin_ia32_phaddd128:
+      case clang::X86::BI__builtin_ia32_phaddd256:
+        ResultElements.push_back(APValue(APSInt(RHSA + RHSB, DestUnsigned)));
+        break;
+      case clang::X86::BI__builtin_ia32_phaddsw128:
+      case clang::X86::BI__builtin_ia32_phaddsw256:
+        ResultElements.push_back(APValue(
+            APSInt(RHSA.isSigned() ? RHSA.sadd_sat(RHSB) : RHSA.uadd_sat(RHSB),
+                   DestUnsigned)));
+        break;
+      case clang::X86::BI__builtin_ia32_phsubw128:
+      case clang::X86::BI__builtin_ia32_phsubw256:
+      case clang::X86::BI__builtin_ia32_phsubd128:
+      case clang::X86::BI__builtin_ia32_phsubd256:
+        ResultElements.push_back(APValue(APSInt(RHSA - RHSB, DestUnsigned)));
+        break;
+      case clang::X86::BI__builtin_ia32_phsubsw128:
+      case clang::X86::BI__builtin_ia32_phsubsw256:
+        ResultElements.push_back(APValue(
+            APSInt(RHSA.isSigned() ? RHSA.ssub_sat(RHSB) : RHSA.usub_sat(RHSB),
+                   DestUnsigned)));
+        break;
+      }
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+  case clang::X86::BI__builtin_ia32_haddpd:
+  case clang::X86::BI__builtin_ia32_haddpd256:
+  case clang::X86::BI__builtin_ia32_haddps:
+  case clang::X86::BI__builtin_ia32_haddps256: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    unsigned SourceLen = SourceLHS.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(SourceLen);
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
+      APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
+      LHSA.add(LHSB, APFloat::rmNearestTiesToEven);
+      ResultElements.push_back(APValue(LHSA));
+    }
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
+      APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
+      RHSA.add(RHSB, APFloat::rmNearestTiesToEven);
+      ResultElements.push_back(APValue(RHSA));
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+  case clang::X86::BI__builtin_ia32_hsubpd:
+  case clang::X86::BI__builtin_ia32_hsubpd256:
+  case clang::X86::BI__builtin_ia32_hsubps:
+  case clang::X86::BI__builtin_ia32_hsubps256: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    unsigned SourceLen = SourceLHS.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(SourceLen);
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
+      APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
+      LHSA.subtract(LHSB, APFloat::rmNearestTiesToEven);
+      ResultElements.push_back(APValue(LHSA));
+    }
+    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+      APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
+      APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
+      RHSA.subtract(RHSB, APFloat::rmNearestTiesToEven);
+      ResultElements.push_back(APValue(RHSA));
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   }
 }
 
@@ -12197,67 +12337,71 @@ bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
 
 namespace {
   class ArrayExprEvaluator
-  : public ExprEvaluatorBase<ArrayExprEvaluator> {
-    const LValue &This;
-    APValue &Result;
-  public:
-
-    ArrayExprEvaluator(EvalInfo &Info, const LValue &This, APValue &Result)
-      : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
-
-    bool Success(const APValue &V, const Expr *E) {
-      assert(V.isArray() && "expected array");
-      Result = V;
-      return true;
-    }
-
-    bool ZeroInitialization(const Expr *E) {
-      const ConstantArrayType *CAT =
-          Info.Ctx.getAsConstantArrayType(E->getType());
-      if (!CAT) {
-        if (E->getType()->isIncompleteArrayType()) {
-          // We can be asked to zero-initialize a flexible array member; this
-          // is represented as an ImplicitValueInitExpr of incomplete array
-          // type. In this case, the array has zero elements.
-          Result = APValue(APValue::UninitArray(), 0, 0);
-          return true;
-        }
-        // FIXME: We could handle VLAs here.
-        return Error(E);
-      }
-
-      Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
-      if (!Result.hasArrayFiller())
-        return true;
-
-      // Zero-initialize all elements.
-      LValue Subobject = This;
-      Subobject.addArray(Info, E, CAT);
-      ImplicitValueInitExpr VIE(CAT->getElementType());
-      return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, &VIE);
-    }
-
-    bool VisitCallExpr(const CallExpr *E) {
-      return handleCallExpr(E, Result, &This);
-    }
-    bool VisitInitListExpr(const InitListExpr *E,
-                           QualType AllocType = QualType());
-    bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
-    bool VisitCXXConstructExpr(const CXXConstructExpr *E);
-    bool VisitCXXConstructExpr(const CXXConstructExpr *E,
-                               const LValue &Subobject,
-                               APValue *Value, QualType Type);
-    bool VisitStringLiteral(const StringLiteral *E,
-                            QualType AllocType = QualType()) {
-      expandStringLiteral(Info, E, Result, AllocType);
-      return true;
-    }
-    bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
-    bool VisitCXXParenListOrInitListExpr(const Expr *ExprToVisit,
-                                         ArrayRef<Expr *> Args,
-                                         const Expr *ArrayFiller,
-                                         QualType AllocType = QualType());
-  };
+  :
+        public
+          ExprEvaluatorBase<ArrayExprEvaluator> {
+            const LValue &This;
+            APValue & Result;
+
+          public:
+            ArrayExprEvaluator(EvalInfo & Info, const LValue &This,
+                               APValue &Result)
+                : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
+
+            bool Success(const APValue &V, const Expr *E) {
+              assert(V.isArray() && "expected array");
+              Result = V;
+              return true;
+            }
+
+            bool ZeroInitialization(const Expr *E) {
+              const ConstantArrayType *CAT =
+                  Info.Ctx.getAsConstantArrayType(E->getType());
+              if (!CAT) {
+                if (E->getType()->isIncompleteArrayType()) {
+                  // We can be asked to zero-initialize a flexible array member;
+                  // this is represented as an ImplicitValueInitExpr of
+                  // incomplete array type. In this case, the array has zero
+                  // elements.
+                  Result = APValue(APValue::UninitArray(), 0, 0);
+                  return true;
+                }
+                // FIXME: We could handle VLAs here.
+                return Error(E);
+              }
+
+              Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
+              if (!Result.hasArrayFiller())
+                return true;
+
+              // Zero-initialize all elements.
+              LValue Subobject = This;
+              Subobject.addArray(Info, E, CAT);
+              ImplicitValueInitExpr VIE(CAT->getElementType());
+              return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject,
+                                     &VIE);
+            }
+
+            bool VisitCallExpr(const CallExpr *E) {
+              return handleCallExpr(E, Result, &This);
+            }
+            bool VisitInitListExpr(const InitListExpr *E,
+                                   QualType AllocType = QualType());
+            bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
+            bool VisitCXXConstructExpr(const CXXConstructExpr *E);
+            bool VisitCXXConstructExpr(const CXXConstructExpr *E,
+                                       const LValue &Subobject, APValue *Value,
+                                       QualType Type);
+            bool VisitStringLiteral(const StringLiteral *E,
+                                    QualType AllocType = QualType()) {
+              expandStringLiteral(Info, E, Result, AllocType);
+              return true;
+            }
+            bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
+            bool VisitCXXParenListOrInitListExpr(
+                const Expr *ExprToVisit, ArrayRef<Expr *> Args,
+                const Expr *ArrayFiller, QualType AllocType = QualType());
+          };
 } // end anonymous namespace
 
 static bool EvaluateArray(const Expr *E, const LValue &This,

>From df6242e4b74e8170cd28a2f9663aa974a4b0b12b Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Sat, 13 Sep 2025 20:58:15 +0800
Subject: [PATCH 04/12] adjust unit test #146940

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 162 ++++++++++++-----------
 clang/test/CodeGen/X86/avx-builtins.c    |  39 ++----
 clang/test/CodeGen/X86/avx2-builtins.c   |  87 ++++--------
 clang/test/CodeGen/X86/mmx-builtins.c    |  54 +-------
 clang/test/CodeGen/X86/ssse3-builtins.c  |  54 +-------
 5 files changed, 128 insertions(+), 268 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index f6027c78935c3..9d5d70698b8d3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2739,9 +2739,9 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
 }
 
 static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
-                                        const InterpFrame *Frame,
-                                        const CallExpr *Call,
-                                        uint32_t BuiltinID) {
+                                          const InterpFrame *Frame,
+                                          const CallExpr *Call,
+                                          uint32_t BuiltinID) {
   assert(Call->getArg(0)->getType()->isVectorType() &&
          Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
@@ -2752,7 +2752,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
   PrimType ElemT = *S.getContext().classify(VT->getElementType());
   unsigned SourceLen = VT->getNumElements();
   assert(SourceLen % 2 == 0 &&
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
   PrimType DstElemT = *S.getContext().classify(
       Call->getType()->castAs<VectorType>()->getElementType());
   unsigned DstElem = 0;
@@ -2774,16 +2775,17 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
     APSInt Elem2;
     INT_TYPE_SWITCH_NO_BOOL(ElemT, {
       Elem1 = LHS.elem<T>(I).toAPSInt();
-      Elem2 = LHS.elem<T>(I+1).toAPSInt();
+      Elem2 = LHS.elem<T>(I + 1).toAPSInt();
     });
     APSInt Result;
     if (IsAdd) {
-        if (IsSaturating) {
-          Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
-        }else{
-          Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
-        }
-    }else{
+      if (IsSaturating) {
+        Result =
+            APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+      } else {
+        Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
+      }
+    } else {
       if (IsSaturating) {
         Result =
             APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
@@ -2812,7 +2814,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
       }
     } else {
       if (IsSaturating) {
-        Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
+        Result =
+            APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
       } else {
         Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
       }
@@ -2826,15 +2829,15 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
 }
 
 static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
-                                          const InterpFrame *Frame,
-                                          const CallExpr *Call,
-                                          uint32_t BuiltinID) {
+                                           const InterpFrame *Frame,
+                                           const CallExpr *Call,
+                                           uint32_t BuiltinID) {
   assert(Call->getArg(0)->getType()->isVectorType() &&
          Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
-  
+
   FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
@@ -2851,7 +2854,6 @@ static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
   for (unsigned I = 0; I != SourceLen; I += 2) {
     APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-
     if (IsAdd) {
       Elem1.add(Elem2, RM);
     } else {
@@ -3495,71 +3497,71 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
 
   case clang::X86::BI__builtin_ia32_phaddw128:
   case clang::X86::BI__builtin_ia32_phaddw256:
-   case clang::X86::BI__builtin_ia32_phaddd128: 
-   case clang::X86::BI__builtin_ia32_phaddd256: 
-   case clang::X86::BI__builtin_ia32_phaddsw128: 
-   case clang::X86::BI__builtin_ia32_phaddsw256:
-    case clang::X86::BI__builtin_ia32_phsubw128:
-    case clang::X86::BI__builtin_ia32_phsubw256:
-    case clang::X86::BI__builtin_ia32_phsubd128:
-    case clang::X86::BI__builtin_ia32_phsubd256:
-    case clang::X86::BI__builtin_ia32_phsubsw128:
-    case clang::X86::BI__builtin_ia32_phsubsw256:
-    
-      return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
-    case clang::X86::BI__builtin_ia32_haddpd:
-    case clang::X86::BI__builtin_ia32_haddpd256:
-    case clang::X86::BI__builtin_ia32_haddps:
-    case clang::X86::BI__builtin_ia32_haddps256:
-    case clang::X86::BI__builtin_ia32_hsubpd:
-    case clang::X86::BI__builtin_ia32_hsubpd256:
-    case clang::X86::BI__builtin_ia32_hsubps:
-    case clang::X86::BI__builtin_ia32_hsubps256:
-        return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
-
-    case clang::X86::BI__builtin_ia32_pmuldq128:
-    case clang::X86::BI__builtin_ia32_pmuldq256:
-    case clang::X86::BI__builtin_ia32_pmuldq512:
-    case clang::X86::BI__builtin_ia32_pmuludq128:
-    case clang::X86::BI__builtin_ia32_pmuludq256:
-    case clang::X86::BI__builtin_ia32_pmuludq512:
-      return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
-
-    case Builtin::BI__builtin_elementwise_fma:
-      return interp__builtin_elementwise_fma(S, OpPC, Call);
-
-    case X86::BI__builtin_ia32_selectb_128:
-    case X86::BI__builtin_ia32_selectb_256:
-    case X86::BI__builtin_ia32_selectb_512:
-    case X86::BI__builtin_ia32_selectw_128:
-    case X86::BI__builtin_ia32_selectw_256:
-    case X86::BI__builtin_ia32_selectw_512:
-    case X86::BI__builtin_ia32_selectd_128:
-    case X86::BI__builtin_ia32_selectd_256:
-    case X86::BI__builtin_ia32_selectd_512:
-    case X86::BI__builtin_ia32_selectq_128:
-    case X86::BI__builtin_ia32_selectq_256:
-    case X86::BI__builtin_ia32_selectq_512:
-    case X86::BI__builtin_ia32_selectph_128:
-    case X86::BI__builtin_ia32_selectph_256:
-    case X86::BI__builtin_ia32_selectph_512:
-    case X86::BI__builtin_ia32_selectpbf_128:
-    case X86::BI__builtin_ia32_selectpbf_256:
-    case X86::BI__builtin_ia32_selectpbf_512:
-    case X86::BI__builtin_ia32_selectps_128:
-    case X86::BI__builtin_ia32_selectps_256:
-    case X86::BI__builtin_ia32_selectps_512:
-    case X86::BI__builtin_ia32_selectpd_128:
-    case X86::BI__builtin_ia32_selectpd_256:
-    case X86::BI__builtin_ia32_selectpd_512:
-      return interp__builtin_select(S, OpPC, Call);
+  case clang::X86::BI__builtin_ia32_phaddd128:
+  case clang::X86::BI__builtin_ia32_phaddd256:
+  case clang::X86::BI__builtin_ia32_phaddsw128:
+  case clang::X86::BI__builtin_ia32_phaddsw256:
+  case clang::X86::BI__builtin_ia32_phsubw128:
+  case clang::X86::BI__builtin_ia32_phsubw256:
+  case clang::X86::BI__builtin_ia32_phsubd128:
+  case clang::X86::BI__builtin_ia32_phsubd256:
+  case clang::X86::BI__builtin_ia32_phsubsw128:
+  case clang::X86::BI__builtin_ia32_phsubsw256:
+    return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+
+  case clang::X86::BI__builtin_ia32_haddpd:
+  case clang::X86::BI__builtin_ia32_haddpd256:
+  case clang::X86::BI__builtin_ia32_haddps:
+  case clang::X86::BI__builtin_ia32_haddps256:
+  case clang::X86::BI__builtin_ia32_hsubpd:
+  case clang::X86::BI__builtin_ia32_hsubpd256:
+  case clang::X86::BI__builtin_ia32_hsubps:
+  case clang::X86::BI__builtin_ia32_hsubps256:
+    return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+
+  case clang::X86::BI__builtin_ia32_pmuldq128:
+  case clang::X86::BI__builtin_ia32_pmuldq256:
+  case clang::X86::BI__builtin_ia32_pmuldq512:
+  case clang::X86::BI__builtin_ia32_pmuludq128:
+  case clang::X86::BI__builtin_ia32_pmuludq256:
+  case clang::X86::BI__builtin_ia32_pmuludq512:
+    return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID);
+
+  case Builtin::BI__builtin_elementwise_fma:
+    return interp__builtin_elementwise_fma(S, OpPC, Call);
+
+  case X86::BI__builtin_ia32_selectb_128:
+  case X86::BI__builtin_ia32_selectb_256:
+  case X86::BI__builtin_ia32_selectb_512:
+  case X86::BI__builtin_ia32_selectw_128:
+  case X86::BI__builtin_ia32_selectw_256:
+  case X86::BI__builtin_ia32_selectw_512:
+  case X86::BI__builtin_ia32_selectd_128:
+  case X86::BI__builtin_ia32_selectd_256:
+  case X86::BI__builtin_ia32_selectd_512:
+  case X86::BI__builtin_ia32_selectq_128:
+  case X86::BI__builtin_ia32_selectq_256:
+  case X86::BI__builtin_ia32_selectq_512:
+  case X86::BI__builtin_ia32_selectph_128:
+  case X86::BI__builtin_ia32_selectph_256:
+  case X86::BI__builtin_ia32_selectph_512:
+  case X86::BI__builtin_ia32_selectpbf_128:
+  case X86::BI__builtin_ia32_selectpbf_256:
+  case X86::BI__builtin_ia32_selectpbf_512:
+  case X86::BI__builtin_ia32_selectps_128:
+  case X86::BI__builtin_ia32_selectps_256:
+  case X86::BI__builtin_ia32_selectps_512:
+  case X86::BI__builtin_ia32_selectpd_128:
+  case X86::BI__builtin_ia32_selectpd_256:
+  case X86::BI__builtin_ia32_selectpd_512:
+    return interp__builtin_select(S, OpPC, Call);
 
-    default:
-      S.FFDiag(S.Current->getLocation(OpPC),
-               diag::note_invalid_subexpr_in_const_expr)
-          << S.Current->getRange(OpPC);
+  default:
+    S.FFDiag(S.Current->getLocation(OpPC),
+             diag::note_invalid_subexpr_in_const_expr)
+        << S.Current->getRange(OpPC);
 
-      return false;
+    return false;
   }
 
   llvm_unreachable("Unhandled builtin ID");
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 9857b84c94112..4e21cfea41553 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1083,53 +1083,34 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hadd_pd(A, B);
 }
-constexpr bool test_mm256_hadd_epi32_constexpr() {
-    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
-    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
-    constexpr __m256d result = _mm256_hadd_pd(a, b);
-    return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), 3.0, 11.0, 7.0, 15.0));
 
 __m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hadd_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hadd_ps(A, B);
 }
-constexpr bool test_mm256_hadd_ps_constexpr() {
-    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
-    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
-    constexpr __m256 result = _mm256_hadd_ps(a, b);
-    return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f,
-                             9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f);
-}
-TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr())
+TEST_CONSTEXPR(match_m256(_mm256_hadd_ps(
+    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
+    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}),
+    3.0f, 7.0f, 19.0f, 23.0f, 11.0f, 15.0f, 27.0f, 31.0f));
 
 __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_hsub_pd
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
-constexpr bool test_mm256_hsub_pd_constexpr() {
-    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
-    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
-    constexpr __m256d result = _mm256_hsub_pd(a, b);
-    return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0);
-}
-TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr())
+TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), -1.0,-1.0,-1.0,-1.0));
 
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hsub_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hsub_ps(A, B);
 }
-constexpr bool test_mm256_hsub_ps_constexpr() {
-    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
-    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f);
-    constexpr __m256 result = _mm256_hsub_ps(a, b);
-    return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f,
-                             9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f);
-}
+TEST_CONSTEXPR(match_m256(_mm256_hsub_ps(
+    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
+    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}),
+    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f));
 
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
   // CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index c34594cf78a8e..a9095de4fe373 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -461,99 +461,60 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
   // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hadd_epi16(a, b);
 }
-
-constexpr bool test_mm256_hadd_epi16_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
-        8,9,10,11,12,13,14,15,16);
-    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
-        24,25,26,27,28,29,30,31,32);
-    
-    constexpr __m256i result = _mm256_hadd_epi16(a, b);
-    return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hadd_epi16(
+    (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
+    (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}),
+    3,7,11,15,35,39,43,47,19,23,27,31,51,55,59,63));
 
 __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hadd_epi32(a, b);
 }
-
-constexpr bool test_mm256_hadd_epi32_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
-    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
-    
-    constexpr __m256i result = _mm256_hadd_epi32(a, b);
-    return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75);
-}
-TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
+    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
+    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+    30, 70, 20, 60, 110, 150, 100, 140));
 
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadds_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hadds_epi16(a, b);
 }
-constexpr bool test_mm256_hadds_epi16_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
-        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-    constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 
-        1, 1, 1, 1, 1, 1, 1);
-    constexpr __m256i result = _mm256_hadds_epi16(a, b);
-
-    return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
-    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hadds_epi16(
+    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
+    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767}),
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767));
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi16
   // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsub_epi16(a, b);
 }
-
-constexpr bool test_mm256_hsub_epi16_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
-        8,9,10,11,12,13,14,15,16);
-    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
-        24,25,26,27,28,29,30,31,32);
-    
-    constexpr __m256i result = _mm256_hsub_epi16(a, b);
-    return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32);
-}
-TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr())
+TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16(
+    (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, 
+    (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), 
+    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
 
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hsub_epi32(a, b);
 }
-
-constexpr bool test_mm256_hsub_epi32_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
-    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
-    
-    constexpr __m256i result = _mm256_hsub_epi32(a, b);
-    return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75);
-}
-TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr())
+TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
+    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
+    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+    -10, -10, -10, -10, -10, -10, -10, -10));
 
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsubs_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsubs_epi16(a, b);
 }
-
-constexpr bool test_mm256_hsubs_epi16_constexpr() {
-    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
-        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-    constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-    constexpr __m256i result3 = _mm256_hsubs_epi16(a, b);
-
-    return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767,
-    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr())
-
+TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16(
+    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
+    (__m256i)(__v16hi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}),
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767));
 
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
   // CHECK-LABEL: test_mm_i32gather_epi32
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 70f521e380dd4..944af98ffcadc 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -309,84 +309,42 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
-constexpr bool test_mm_hadd_pi16_constexpr() {
-    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
-    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
-    
-    constexpr __m64 result = _mm_hadd_pi16(a, b);
-    return match_v4si(result,1+2,3+4,5+6,7+8);
-}
-TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hadd_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),3,7,11,15));
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
-constexpr bool test_mm_hadd_pi32_constexpr() {
-    constexpr __m64 a = _mm_setr_pi32(1, 2);
-    constexpr __m64 b = _mm_setr_pi32(3, 4);
-    
-    constexpr __m64 result = _mm_hadd_pi32(a, b);
-    return match_v2si(result,1+2,3+4);
-}
-TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr())
+TEST_CONSTEXPR(match_v2si(_mm_hadd_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),3,7));
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
-constexpr bool test_mm_hadds_pi16_constexpr() {
-    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
-    constexpr __m64 b = _mm_setr_pi16(1,1,1,1);
-    
-    constexpr __m64 result = _mm_hadds_pi16(a, b);
-    return match_v4si(result,32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){32767, 32767, 32767, 32767}),32767, 32767, 32767, 32767));
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
-constexpr bool test_mm_hsub_pi16_constexpr() {
-    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
-    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
-    
-    constexpr __m64 result = _mm_hsub_pi16(a, b);
-    return match_v4si(result,1-2,3-4,5-6,7-8);
-}
-TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),-1,-1,-1,-1));
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
-constexpr bool test_mm_hsub_pi32_constexpr() {
-    constexpr __m64 a = _mm_setr_pi32(1, 2);
-    constexpr __m64 b = _mm_setr_pi32(3, 4);
-    
-    constexpr __m64 result = _mm_hsub_pi32(a, b);
-    return match_v2si(result,1-2,3-4);
-}
-TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr())
+TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),-1,-1));
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
-constexpr bool test_mm_hsubs_pi16_constexpr() {
-    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
-    constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1);
-    
-    constexpr __m64 result = _mm_hsubs_pi16(a, b);
-    return match_v4si(result,32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr())
+TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){-4,-5,-6,-7}),32767, 32767, 32767, 32767));
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index bd0ef43278217..61c7ee31af96f 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -60,42 +60,21 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadd_epi16(a, b);
 }
-constexpr bool test_mm_hadd_epi16_constexpr() {
-    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
-    constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24);
-    
-    constexpr __m128i result = _mm_hadd_epi16(a, b);
-    return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24);
-}
-TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hadd_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){17,18,19,20,21,22,23,24}), 3,7,11,15,35,39,43,47));
 
 __m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hadd_epi32(a, b);
 }
-constexpr bool test_mm_hadd_epi32_constexpr() {
-    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
-    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
-    
-    constexpr __m128i result = _mm_hadd_epi32(a, b);
-    return match_v4si(result,1+2,3+4,5+6,7+8);
-}
-TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr())
+TEST_CONSTEXPR(match_v4si(_mm_hadd_epi32((__m128i)(__v4si){1,2,3,4}, (__m128i)(__v4si){5,6,7,8}), 3,7,11,15));
 
 __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadds_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
-constexpr bool test_mm_hadds_epi16_constexpr() {
-    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-    constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
-    constexpr __m128i result = _mm_hadds_epi16(a, b);
-
-    return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}, (__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}), 32767,32767,32767,32767,32767,32767,32767,32767));
 
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
@@ -103,42 +82,21 @@ __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
-constexpr bool test_mm_hsub_epi16_constexpr() {
-    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
-    constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16);
-    
-    constexpr __m128i result = _mm_hsub_epi16(a, b);
-    return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16);
-}
-TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){9,10,11,12,13,14,15,16}), -1,-1,-1,-1,-1,-1,-1,-1));
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
-constexpr bool test_mm_hsub_epi32_constexpr() {
-    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
-    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
-    
-    constexpr __m128i result = _mm_hsub_epi32(a, b);
-    return match_v4si(result,1-2,3-4,5-6,7-8);
-}
-TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr())
+TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,2,1}, (__m128i)(__v4si){8,7,6,5}), 1,1,1,1))   
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
-constexpr bool test_mm_hsubs_epi16_constexpr() {
-    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-    constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
-    constexpr __m128i result3 = _mm_hsubs_epi16(a, b);
-
-    return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
-}
-TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr())
+TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},(__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}), 32767,32767,32767,32767,32767,32767,32767,32767));
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16

>From f91aa214f83bc2d459d43daa1c1563cf0c86b76a Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Fri, 26 Sep 2025 13:42:41 +0800
Subject: [PATCH 05/12] adjust test case and function

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 145 +++++++++--------------
 clang/lib/AST/ExprConstant.cpp           | 126 ++++++++++----------
 clang/test/CodeGen/X86/avx-builtins.c    |   8 +-
 clang/test/CodeGen/X86/avx2-builtins.c   |  30 ++---
 clang/test/CodeGen/X86/mmx-builtins.c    |   8 +-
 clang/test/CodeGen/X86/ssse3-builtins.c  |   8 +-
 6 files changed, 143 insertions(+), 182 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index a0ea1404db182..04c62bcc238bb 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -8,7 +8,6 @@
 #include "../ExprConstShared.h"
 #include "Boolean.h"
 #include "EvalEmitter.h"
-#include "Floating.h"
 #include "Interp.h"
 #include "InterpBuiltinBitCast.h"
 #include "PrimType.h"
@@ -20,7 +19,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SipHash.h"
-#include <cassert>
 
 namespace clang {
 namespace interp {
@@ -2744,100 +2742,56 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC,
-                                          const InterpFrame *Frame,
-                                          const CallExpr *Call,
-                                          uint32_t BuiltinID) {
+static bool interp_builtin_horizontal_int_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
+  assert(Call->getNumArgs() == 2);
+ 
   assert(Call->getArg(0)->getType()->isVectorType() &&
          Call->getArg(1)->getType()->isVectorType());
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  assert(VT->getElementType()->isIntegralOrEnumerationType());
+  PrimType ElemT = *S.getContext().classify(VT->getElementType());
+  bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
-  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  PrimType ElemT = *S.getContext().classify(VT->getElementType());
   unsigned SourceLen = VT->getNumElements();
   assert(SourceLen % 2 == 0 &&
          Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
              SourceLen);
-  PrimType DstElemT = *S.getContext().classify(
-      Call->getType()->castAs<VectorType>()->getElementType());
   unsigned DstElem = 0;
 
-  bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_phaddw128 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_phaddw256 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_phaddd128 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_phaddd256 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256);
-
-  bool IsSaturating = (BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 ||
-                       BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256 ||
-                       BuiltinID == clang::X86::BI__builtin_ia32_phsubsw128 ||
-                       BuiltinID == clang::X86::BI__builtin_ia32_phsubsw256);
-
   for (unsigned I = 0; I != SourceLen; I += 2) {
-    APSInt Elem1;
-    APSInt Elem2;
     INT_TYPE_SWITCH_NO_BOOL(ElemT, {
-      Elem1 = LHS.elem<T>(I).toAPSInt();
-      Elem2 = LHS.elem<T>(I + 1).toAPSInt();
+      APSInt Elem1 = LHS.elem<T>(I).toAPSInt();
+      APSInt Elem2 = LHS.elem<T>(I + 1).toAPSInt();
+      Dst.elem<T>(DstElem) =
+          static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
     });
-    APSInt Result;
-    if (IsAdd) {
-      if (IsSaturating) {
-        Result =
-            APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
-      } else {
-        Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
-      }
-    } else {
-      if (IsSaturating) {
-        Result =
-            APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
-      } else {
-        Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
-      }
-    }
-    INT_TYPE_SWITCH_NO_BOOL(DstElemT,
-                            { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
     ++DstElem;
   }
   for (unsigned I = 0; I != SourceLen; I += 2) {
-    APSInt Elem1;
-    APSInt Elem2;
     INT_TYPE_SWITCH_NO_BOOL(ElemT, {
-      Elem1 = RHS.elem<T>(I).toAPSInt();
-      Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+      APSInt Elem1 = RHS.elem<T>(I).toAPSInt();
+      APSInt Elem2 = RHS.elem<T>(I + 1).toAPSInt();
+      Dst.elem<T>(DstElem) =
+          static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
     });
-    APSInt Result;
-    if (IsAdd) {
-      if (IsSaturating) {
-        Result =
-            APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
-      } else {
-        Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
-      }
-    } else {
-      if (IsSaturating) {
-        Result =
-            APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned());
-      } else {
-        Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned());
-      }
-    }
-    INT_TYPE_SWITCH_NO_BOOL(DstElemT,
-                            { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
     ++DstElem;
   }
   Dst.initializeAllElements();
   return true;
 }
 
-static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
-                                           const InterpFrame *Frame,
-                                           const CallExpr *Call,
-                                           uint32_t BuiltinID) {
+static bool interp_builtin_horizontal_fp_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
+                               llvm::RoundingMode)>
+        Fn) {
+  assert(Call->getNumArgs() == 2);
   assert(Call->getArg(0)->getType()->isVectorType() &&
          Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
@@ -2852,30 +2806,18 @@ static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC,
          Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
              SourceLen);
   unsigned DstElem = 0;
-  bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_haddpd ||
-                BuiltinID == clang::X86::BI__builtin_ia32_haddpd256 ||
-                BuiltinID == clang::X86::BI__builtin_ia32_haddps ||
-                BuiltinID == clang::X86::BI__builtin_ia32_haddps256);
-  using T = Floating;
   for (unsigned I = 0; I != SourceLen; I += 2) {
+    using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-    if (IsAdd) {
-      Elem1.add(Elem2, RM);
-    } else {
-      Elem1.subtract(Elem2, RM);
-    }
-    Dst.elem<T>(DstElem++) = Elem1;
+    Dst.elem<T>(DstElem++) =
+        static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
   }
   for (unsigned I = 0; I != SourceLen; I += 2) {
+    using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
-    if (IsAdd) {
-      Elem1.add(Elem2, RM);
-    } else {
-      Elem1.subtract(Elem2, RM);
-    }
-    Dst.elem<T>(DstElem++) = Elem1;
+    Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
   }
   Dst.initializeAllElements();
   return true;
@@ -3596,25 +3538,48 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case clang::X86::BI__builtin_ia32_phaddw256:
   case clang::X86::BI__builtin_ia32_phaddd128:
   case clang::X86::BI__builtin_ia32_phaddd256:
+    return interp_builtin_horizontal_int_binop(
+        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {return LHS + RHS;});
   case clang::X86::BI__builtin_ia32_phaddsw128:
   case clang::X86::BI__builtin_ia32_phaddsw256:
+    return interp_builtin_horizontal_int_binop(
+        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
+          return LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS);
+        });
   case clang::X86::BI__builtin_ia32_phsubw128:
   case clang::X86::BI__builtin_ia32_phsubw256:
   case clang::X86::BI__builtin_ia32_phsubd128:
   case clang::X86::BI__builtin_ia32_phsubd256:
+    return interp_builtin_horizontal_int_binop(
+        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
   case clang::X86::BI__builtin_ia32_phsubsw128:
   case clang::X86::BI__builtin_ia32_phsubsw256:
-    return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID);
-
+    return interp_builtin_horizontal_int_binop(
+        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
+          return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
+        });
   case clang::X86::BI__builtin_ia32_haddpd:
   case clang::X86::BI__builtin_ia32_haddpd256:
   case clang::X86::BI__builtin_ia32_haddps:
   case clang::X86::BI__builtin_ia32_haddps256:
+    return interp_builtin_horizontal_fp_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.add(RHS, RM);
+          return F;
+        });
   case clang::X86::BI__builtin_ia32_hsubpd:
   case clang::X86::BI__builtin_ia32_hsubpd256:
   case clang::X86::BI__builtin_ia32_hsubps:
   case clang::X86::BI__builtin_ia32_hsubps256:
-    return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID);
+    return interp_builtin_horizontal_fp_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.subtract(RHS, RM);
+          return F;
+        });
 
   case clang::X86::BI__builtin_ia32_pmuldq128:
   case clang::X86::BI__builtin_ia32_pmuldq256:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 026894381e778..01e80a22fbb8d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12332,71 +12332,67 @@ bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
 
 namespace {
   class ArrayExprEvaluator
-  :
-        public
-          ExprEvaluatorBase<ArrayExprEvaluator> {
-            const LValue &This;
-            APValue & Result;
-
-          public:
-            ArrayExprEvaluator(EvalInfo & Info, const LValue &This,
-                               APValue &Result)
-                : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
-
-            bool Success(const APValue &V, const Expr *E) {
-              assert(V.isArray() && "expected array");
-              Result = V;
-              return true;
-            }
-
-            bool ZeroInitialization(const Expr *E) {
-              const ConstantArrayType *CAT =
-                  Info.Ctx.getAsConstantArrayType(E->getType());
-              if (!CAT) {
-                if (E->getType()->isIncompleteArrayType()) {
-                  // We can be asked to zero-initialize a flexible array member;
-                  // this is represented as an ImplicitValueInitExpr of
-                  // incomplete array type. In this case, the array has zero
-                  // elements.
-                  Result = APValue(APValue::UninitArray(), 0, 0);
-                  return true;
-                }
-                // FIXME: We could handle VLAs here.
-                return Error(E);
-              }
-
-              Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
-              if (!Result.hasArrayFiller())
-                return true;
-
-              // Zero-initialize all elements.
-              LValue Subobject = This;
-              Subobject.addArray(Info, E, CAT);
-              ImplicitValueInitExpr VIE(CAT->getElementType());
-              return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject,
-                                     &VIE);
-            }
-
-            bool VisitCallExpr(const CallExpr *E) {
-              return handleCallExpr(E, Result, &This);
-            }
-            bool VisitInitListExpr(const InitListExpr *E,
-                                   QualType AllocType = QualType());
-            bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
-            bool VisitCXXConstructExpr(const CXXConstructExpr *E);
-            bool VisitCXXConstructExpr(const CXXConstructExpr *E,
-                                       const LValue &Subobject, APValue *Value,
-                                       QualType Type);
-            bool VisitStringLiteral(const StringLiteral *E,
-                                    QualType AllocType = QualType()) {
-              expandStringLiteral(Info, E, Result, AllocType);
-              return true;
-            }
-            bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
-            bool VisitCXXParenListOrInitListExpr(
-                const Expr *ExprToVisit, ArrayRef<Expr *> Args,
-                const Expr *ArrayFiller, QualType AllocType = QualType());
-          };
+  : public ExprEvaluatorBase<ArrayExprEvaluator> {
+    const LValue &This;
+    APValue &Result;
+  public:
+
+    ArrayExprEvaluator(EvalInfo &Info, const LValue &This, APValue &Result)
+      : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {}
+
+    bool Success(const APValue &V, const Expr *E) {
+      assert(V.isArray() && "expected array");
+      Result = V;
+      return true;
+    }
+
+    bool ZeroInitialization(const Expr *E) {
+      const ConstantArrayType *CAT =
+          Info.Ctx.getAsConstantArrayType(E->getType());
+      if (!CAT) {
+        if (E->getType()->isIncompleteArrayType()) {
+          // We can be asked to zero-initialize a flexible array member; this
+          // is represented as an ImplicitValueInitExpr of incomplete array
+          // type. In this case, the array has zero elements.
+          Result = APValue(APValue::UninitArray(), 0, 0);
+          return true;
+        }
+        // FIXME: We could handle VLAs here.
+        return Error(E);
+      }
+
+      Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize());
+      if (!Result.hasArrayFiller())
+        return true;
+
+      // Zero-initialize all elements.
+      LValue Subobject = This;
+      Subobject.addArray(Info, E, CAT);
+      ImplicitValueInitExpr VIE(CAT->getElementType());
+      return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, &VIE);
+    }
+
+    bool VisitCallExpr(const CallExpr *E) {
+      return handleCallExpr(E, Result, &This);
+    }
+    bool VisitInitListExpr(const InitListExpr *E,
+                           QualType AllocType = QualType());
+    bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E);
+    bool VisitCXXConstructExpr(const CXXConstructExpr *E);
+    bool VisitCXXConstructExpr(const CXXConstructExpr *E,
+                               const LValue &Subobject,
+                               APValue *Value, QualType Type);
+    bool VisitStringLiteral(const StringLiteral *E,
+                            QualType AllocType = QualType()) {
+      expandStringLiteral(Info, E, Result, AllocType);
+      return true;
+    }
+    bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E);
+    bool VisitCXXParenListOrInitListExpr(const Expr *ExprToVisit,
+                                         ArrayRef<Expr *> Args,
+                                         const Expr *ArrayFiller,
+                                         QualType AllocType = QualType());
+  };
 } // end anonymous namespace
 
 static bool EvaluateArray(const Expr *E, const LValue &This,
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 3c9e184b8ba3f..0ff5a83505607 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1102,7 +1102,7 @@ __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
-TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), -1.0,-1.0,-1.0,-1.0));
+TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 4.0, 3.0}, (__m256d){10.0, 6.0, 16.0, 8.0}), -1.0,1.0,4.0,8.0));
 
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hsub_ps
@@ -1110,9 +1110,9 @@ __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   return _mm256_hsub_ps(A, B);
 }
 TEST_CONSTEXPR(_mm256_hsub_ps(
-    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f},
-    -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f))
+    (__m256){1.0f, 2.0f, 4.0f, 3.0f, 5.0f, 7.0f, 7.0f, 5.0f},
+    (__m256){9.0f, 6.0f, 11.0f, 8.0f, 13.0f, 17.0f, 15.0f, 11.0f},
+    -1.0f, 1.0f, -2.0f, 2.0f, 3.0f, 3.0f, -4.0f, 4.0f))
 
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
   // CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index a993e2546b165..8837a8c0e52fa 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -476,8 +476,8 @@ __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
 }
 TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
     (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
-    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75})
-    30,70,110,150,20,60,100,140))
+    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
+    30,70,110,150,20,60,100,140));
 
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadds_epi16
@@ -485,9 +485,9 @@ __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   return _mm256_hadds_epi16(a, b);
 }
 TEST_CONSTEXPR(match_v16hi( _mm256_hadds_epi16(
-    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
-    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
-    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767)))
+    (__m256i)(__v16hi){32767, 32767, 1,2,3,4,5,6,7,8,9,10,11,12,13,14},
+    (__m256i)(__v16hi){19,20,21,22,23,24,25,26,27,28,29,30,31,32, 32767, 5}),
+    32767, 3,7,11,15,19,23,27, 39,43,47,51,55,59,63, 32767));
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi16
@@ -495,9 +495,9 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   return _mm256_hsub_epi16(a, b);
 }
 TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16(
-    (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, 
-    (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), 
-    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
+    (__m256i)(__v16hi){2,1,1,2,5,3,3,5,7,4,4,7,9,5,5,9}, 
+    (__m256i)(__v16hi){10,5,5,10,12,6,6,12,21,14,14,21,24,16,16,24}), 
+    1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8));
 
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi32
@@ -505,19 +505,19 @@ __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   return _mm256_hsub_epi32(a, b);
 }
 TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
-    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
-    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75})
-    -10,-10,-10,-10,-10,-10,-10,-10))
+    (__m256i)(__v8si){10, 20, 30,50,60,90,100,140},
+    (__m256i)(__v8si){200,150,260,200,420,350,800,740}),
+    -10,-20,-30,-40,50,60,70,60));
 
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsubs_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsubs_epi16(a, b);
 }
-TEST_CONSTEXPR(match_v16hi( _mm256_hsubs_epi16(
-    (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},
-    (__m256i)(__v16hi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767)))
+TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16(
+    (__m256i)(__v16hi){32726, -100, 3, 2, 6, 4, 8, 5,15,10 ,21, 14, 27, 18, 100, 90},
+    (__m256i)(__v16hi){40, 20, 100, 70, 200,150, 100,40, 1000,900,300,150, 500,300, 1, 1}),
+    32767, 1, 2, 3, 5, 7, 9, 10, 20, 30, 50, 60, 100, 150, 200, 0));
 
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
   // CHECK-LABEL: test_mm_i32gather_epi32
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index b4c86b29e5260..7c1709f81f665 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -325,28 +325,28 @@ __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
-TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){32767, 32767, 32767, 32767}),32767, 32767, 32767, 32767));
+TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 1,3},(__m64)(__v4hi){-1,3, 40, 60}),32767, 4, 2,100));
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
-TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),-1,-1,-1,-1));
+TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,4,3},(__m64)(__v4hi){10,5,0,-10}),-1,1,5,10));
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
-TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),-1,-1));
+TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){4,3}),-1,1));
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
-TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){-4,-5,-6,-7}),32767, 32767, 32767, 32767));
+TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 5, -32767},(__m64)(__v4hi){4,5,10,5}),0,32767,-1,5));
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 61c7ee31af96f..d5b64df3f57a9 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -74,7 +74,7 @@ __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
-TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}, (__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}), 32767,32767,32767,32767,32767,32767,32767,32767));
+TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,-1,2,-3,3,1,4}, (__m128i)(__v8hi){2,6,1,9,-4,16,7,8}), 32767, 1,0,5,8,10,12,15));
 
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
@@ -82,21 +82,21 @@ __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
-TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){9,10,11,12,13,14,15,16}), -1,-1,-1,-1,-1,-1,-1,-1));
+TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){20,15,16,12,9,6,4,2}, (__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 5,4,3,2,1,0,-1,-2));
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
-TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,2,1}, (__m128i)(__v4si){8,7,6,5}), 1,1,1,1))   
+TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,1,1}, (__m128i)(__v4si){7,5,10,5}), 1,0,2,5))   
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
-TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},(__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}), 32767,32767,32767,32767,32767,32767,32767,32767));
+TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, -15,16,12,9,6,4,2},(__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 32767,4,3,2,1,0,-1,-2));
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16

>From 4f5fb878426471f92c3b488495ae73bd87897690 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Fri, 26 Sep 2025 13:48:02 +0800
Subject: [PATCH 06/12] undo the unintentional formatting of the code

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 04c62bcc238bb..5a097e5a4f7a0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2746,7 +2746,7 @@ static bool interp_builtin_horizontal_int_binop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
   assert(Call->getNumArgs() == 2);
- 
+
   assert(Call->getArg(0)->getType()->isVectorType() &&
          Call->getArg(1)->getType()->isVectorType());
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
@@ -2810,8 +2810,7 @@ static bool interp_builtin_horizontal_fp_binop(
     using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-    Dst.elem<T>(DstElem++) =
-        static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
+    Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
   }
   for (unsigned I = 0; I != SourceLen; I += 2) {
     using T = PrimConv<PT_Float>::T;
@@ -3539,7 +3538,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case clang::X86::BI__builtin_ia32_phaddd128:
   case clang::X86::BI__builtin_ia32_phaddd256:
     return interp_builtin_horizontal_int_binop(
-        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {return LHS + RHS;});
+        S, OpPC, Call,
+        [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; });
   case clang::X86::BI__builtin_ia32_phaddsw128:
   case clang::X86::BI__builtin_ia32_phaddsw256:
     return interp_builtin_horizontal_int_binop(
@@ -3551,7 +3551,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case clang::X86::BI__builtin_ia32_phsubd128:
   case clang::X86::BI__builtin_ia32_phsubd256:
     return interp_builtin_horizontal_int_binop(
-        S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
+        S, OpPC, Call,
+        [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
   case clang::X86::BI__builtin_ia32_phsubsw128:
   case clang::X86::BI__builtin_ia32_phsubsw256:
     return interp_builtin_horizontal_int_binop(

>From b2cac3ef3cb6832bc5c012473ddedd74a46ef5fd Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Fri, 26 Sep 2025 16:10:33 +0800
Subject: [PATCH 07/12] adjust code

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1437a6ea135d3..6e9b2dcf75712 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2830,13 +2830,13 @@ static bool interp_builtin_horizontal_fp_binop(
     using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-    Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
+    Dst.elem<T>(DstElem++) = static_cast<T>(Fn(Elem1, Elem2, RM));
   }
   for (unsigned I = 0; I != SourceLen; I += 2) {
     using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
     APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
-    Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
+    Dst.elem<T>(DstElem++) = static_cast<T>(Fn(Elem1, Elem2, RM));
   }
   Dst.initializeAllElements();
   return true;

>From 197123a5566b3e342cac10f1f594068bdd27a319 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Mon, 29 Sep 2025 01:47:12 +0800
Subject: [PATCH 08/12] adjust code for mm256

---
 clang/include/clang/Basic/BuiltinsX86.td |  41 +++++---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 113 ++++++++++++++++++++++-
 clang/lib/AST/ExprConstant.cpp           |  80 +++++++++++++---
 clang/lib/Headers/avxintrin.h            |   4 +-
 clang/lib/Headers/pmmintrin.h            |   2 +-
 clang/lib/Headers/tmmintrin.h            |  23 +++--
 clang/test/CodeGen/X86/avx-builtins.c    |  19 ++--
 clang/test/CodeGen/X86/avx2-builtins.c   |   2 +-
 clang/test/CodeGen/X86/mmx-builtins.c    |   2 +-
 clang/test/CodeGen/X86/sse3-builtins.c   |   4 +
 clang/test/CodeGen/X86/ssse3-builtins.c  |   2 +-
 11 files changed, 237 insertions(+), 55 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 77e599587edc3..a09569900ba90 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -111,19 +111,20 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
   }
 
   let Features = "sse3" in {
-    foreach Op = ["addsub", "hadd", "hsub"] in {
+    foreach Op = ["addsub"] in {
       def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
       def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
     }
   }
 
-  let Features = "ssse3" in {
-    foreach Op = ["phadd", "phsub"] in {
-      def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-      def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-      def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  let Features = "sse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+    foreach Op = ["hadd", "hsub"] in {
+      def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
+      def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
     }
+  }
 
+  let Features = "ssse3" in {
     def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
     def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
     def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
@@ -135,7 +136,7 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
 
 // AVX
 let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in {
-  foreach Op = ["addsub", "hadd", "hsub", "max", "min"] in {
+  foreach Op = ["addsub", "max", "min"] in {
     def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
     def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
   }
@@ -310,6 +311,14 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
 }
 
+let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+    foreach Op = ["phadd", "phsub"] in {
+      def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+      def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+      def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+    }
+}
+
 let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
   def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
@@ -500,6 +509,11 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
   def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
   def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
   def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+
+  foreach Op = ["hadd", "hsub"] in {
+    def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
+    def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
+  }
 }
 
 let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
@@ -572,12 +586,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
 let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
   def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
-  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
   def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
   def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
   def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
@@ -647,6 +655,13 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
   def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
   def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+
+  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 6e9b2dcf75712..e538988a24c85 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2842,6 +2842,78 @@ static bool interp_builtin_horizontal_fp_binop(
   return true;
 }
 
+// Evaluates the AVX VHADDPS/VHSUBPS-style horizontal builtins on <8 x float>.
+// Adjacent elements of each source are combined with Fn and, matching the
+// per-128-bit-lane hardware layout, the results interleave as
+//   dst = {L01, L23, R01, R23, L45, L67, R45, R67}.
+static bool interp_builtin_horizontal_fps256_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
+                               llvm::RoundingMode)>
+        Fn) {
+  assert(Call->getNumArgs() == 2);
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  // The builtin signature fixes both operands at <8 x float>; the hard-coded
+  // indices below rely on that.
+  assert(Call->getArg(0)->getType()->castAs<VectorType>()->getNumElements() ==
+             8 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             8);
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+  llvm::RoundingMode RM = getRoundingMode(FPO);
+  using T = PrimConv<PT_Float>::T;
+  // LHS pairs populate destination elements 0, 1, 4, 5.
+  for (unsigned I = 0; I != 4; ++I) {
+    unsigned SrcIdx = 2 * I;
+    unsigned DstIdx = (I < 2) ? I : (I + 2);
+    APFloat Elem1 = LHS.elem<T>(SrcIdx).getAPFloat();
+    APFloat Elem2 = LHS.elem<T>(SrcIdx + 1).getAPFloat();
+    Dst.elem<T>(DstIdx) = static_cast<T>(Fn(Elem1, Elem2, RM));
+  }
+  // RHS pairs populate destination elements 2, 3, 6, 7.
+  for (unsigned I = 0; I != 4; ++I) {
+    unsigned SrcIdx = 2 * I;
+    unsigned DstIdx = (I < 2) ? (I + 2) : (I + 4);
+    APFloat Elem1 = RHS.elem<T>(SrcIdx).getAPFloat();
+    APFloat Elem2 = RHS.elem<T>(SrcIdx + 1).getAPFloat();
+    Dst.elem<T>(DstIdx) = static_cast<T>(Fn(Elem1, Elem2, RM));
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
+// Evaluates the AVX VHADDPD/VHSUBPD-style horizontal builtins on <4 x double>.
+// Adjacent elements of each source are combined with Fn and, matching the
+// per-128-bit-lane hardware layout, the results interleave as
+//   dst = {L01, R01, L23, R23}.
+static bool interp_builtin_horizontal_fpd256_binop(
+    InterpState &S, CodePtr OpPC, const CallExpr *Call,
+    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
+                               llvm::RoundingMode)>
+        Fn) {
+  assert(Call->getNumArgs() == 2);
+  assert(Call->getArg(0)->getType()->isVectorType() &&
+         Call->getArg(1)->getType()->isVectorType());
+  // The builtin signature fixes both operands at <4 x double>; the hard-coded
+  // indices below rely on that.
+  assert(Call->getArg(0)->getType()->castAs<VectorType>()->getNumElements() ==
+             4 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             4);
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+  llvm::RoundingMode RM = getRoundingMode(FPO);
+  using T = PrimConv<PT_Float>::T;
+  // LHS pairs populate destination elements 0 and 2.
+  for (unsigned I = 0; I != 2; ++I) {
+    APFloat Elem1 = LHS.elem<T>(2 * I).getAPFloat();
+    APFloat Elem2 = LHS.elem<T>(2 * I + 1).getAPFloat();
+    Dst.elem<T>(2 * I) = static_cast<T>(Fn(Elem1, Elem2, RM));
+  }
+  // RHS pairs populate destination elements 1 and 3.
+  for (unsigned I = 0; I != 2; ++I) {
+    APFloat Elem1 = RHS.elem<T>(2 * I).getAPFloat();
+    APFloat Elem2 = RHS.elem<T>(2 * I + 1).getAPFloat();
+    Dst.elem<T>(2 * I + 1) = static_cast<T>(Fn(Elem1, Elem2, RM));
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp__builtin_elementwise_triop_fp(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
@@ -3680,9 +3752,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
         });
   case clang::X86::BI__builtin_ia32_haddpd:
-  case clang::X86::BI__builtin_ia32_haddpd256:
   case clang::X86::BI__builtin_ia32_haddps:
-  case clang::X86::BI__builtin_ia32_haddps256:
     return interp_builtin_horizontal_fp_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3690,10 +3760,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.add(RHS, RM);
           return F;
         });
+  case clang::X86::BI__builtin_ia32_haddpd256:{
+    return interp_builtin_horizontal_fpd256_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.add(RHS, RM);
+          return F;
+        });
+  }
+  case clang::X86::BI__builtin_ia32_haddps256:{
+    return interp_builtin_horizontal_fps256_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.add(RHS, RM);
+          return F;
+        });
+  }
   case clang::X86::BI__builtin_ia32_hsubpd:
-  case clang::X86::BI__builtin_ia32_hsubpd256:
   case clang::X86::BI__builtin_ia32_hsubps:
-  case clang::X86::BI__builtin_ia32_hsubps256:
     return interp_builtin_horizontal_fp_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3701,7 +3787,24 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.subtract(RHS, RM);
           return F;
         });
-
+  case clang::X86::BI__builtin_ia32_hsubpd256:{
+    return interp_builtin_horizontal_fpd256_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.subtract(RHS, RM);
+          return F;
+        });
+  }
+  case clang::X86::BI__builtin_ia32_hsubps256:{
+    return interp_builtin_horizontal_fps256_binop(
+        S, OpPC, Call,
+        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
+          APFloat F = LHS;
+          F.subtract(RHS, RM);
+          return F;
+        });
+  }
   case clang::X86::BI__builtin_ia32_pmuldq128:
   case clang::X86::BI__builtin_ia32_pmuldq256:
   case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 863a0c455a624..ee9b3acfab59b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -55,7 +55,6 @@
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFixedPoint.h"
-#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -12247,9 +12246,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
   case clang::X86::BI__builtin_ia32_haddpd:
-  case clang::X86::BI__builtin_ia32_haddpd256:
-  case clang::X86::BI__builtin_ia32_haddps:
-  case clang::X86::BI__builtin_ia32_haddps256: {
+  case clang::X86::BI__builtin_ia32_haddps: {
     APValue SourceLHS, SourceRHS;
     if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
         !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
@@ -12257,24 +12254,23 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     unsigned SourceLen = SourceLHS.getVectorLength();
     SmallVector<APValue, 4> ResultElements;
     ResultElements.reserve(SourceLen);
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
     for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
       APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
       APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
-      LHSA.add(LHSB, APFloat::rmNearestTiesToEven);
+      LHSA.add(LHSB, RM);
       ResultElements.push_back(APValue(LHSA));
     }
     for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
       APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
       APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
-      RHSA.add(RHSB, APFloat::rmNearestTiesToEven);
+      RHSA.add(RHSB, RM);
       ResultElements.push_back(APValue(RHSA));
     }
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
   case clang::X86::BI__builtin_ia32_hsubpd:
-  case clang::X86::BI__builtin_ia32_hsubpd256:
-  case clang::X86::BI__builtin_ia32_hsubps:
-  case clang::X86::BI__builtin_ia32_hsubps256: {
+  case clang::X86::BI__builtin_ia32_hsubps: {
     APValue SourceLHS, SourceRHS;
     if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
         !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
@@ -12282,21 +12278,81 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     unsigned SourceLen = SourceLHS.getVectorLength();
     SmallVector<APValue, 4> ResultElements;
     ResultElements.reserve(SourceLen);
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
     for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
       APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
       APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
-      LHSA.subtract(LHSB, APFloat::rmNearestTiesToEven);
+      LHSA.subtract(LHSB, RM);
       ResultElements.push_back(APValue(LHSA));
     }
     for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
       APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
       APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
-      RHSA.subtract(RHSB, APFloat::rmNearestTiesToEven);
+      RHSA.subtract(RHSB, RM);
       ResultElements.push_back(APValue(RHSA));
     }
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
-
+  case clang::X86::BI__builtin_ia32_haddpd256:
+  case clang::X86::BI__builtin_ia32_hsubpd256: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    SmallVector<APValue, 4> ResultElements(4);
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
+    for (unsigned i = 0; i < 2; ++i) {
+        APFloat A = SourceLHS.getVectorElt(2*i).getFloat();
+        APFloat B = SourceLHS.getVectorElt(2*i+1).getFloat();
+        if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
+            A.add(B, RM);
+        else 
+            A.subtract(B, RM);
+        ResultElements[2*i] = APValue(A);
+    }
+    for (unsigned i = 0; i < 2; ++i) {
+        APFloat A = SourceRHS.getVectorElt(2*i).getFloat();
+        APFloat B = SourceRHS.getVectorElt(2*i+1).getFloat();
+        if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
+            A.add(B, RM);
+        else 
+            A.subtract(B, RM);
+        ResultElements[2*i+1] = APValue(A);
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+  case clang::X86::BI__builtin_ia32_haddps256:
+  case clang::X86::BI__builtin_ia32_hsubps256: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    SmallVector<APValue, 4> ResultElements(8); 
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
+    for (unsigned i = 0; i < 4; ++i) {
+      unsigned SrcIdx = 2 * i;       
+      unsigned DestIdx = (i < 2) ? i : (i + 2); 
+      APFloat A = SourceLHS.getVectorElt(SrcIdx).getFloat();
+      APFloat B = SourceLHS.getVectorElt(SrcIdx + 1).getFloat();
+      if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
+        A.add(B, RM);
+      else 
+        A.subtract(B, RM);
+      ResultElements[DestIdx] = APValue(A);
+    }
+    for (unsigned i = 0; i < 4; ++i) {
+      unsigned SrcIdx = 2 * i;      
+      unsigned DestIdx = (i < 2) ? (i + 2) : (i + 4);
+      APFloat A = SourceRHS.getVectorElt(SrcIdx).getFloat();
+      APFloat B = SourceRHS.getVectorElt(SrcIdx + 1).getFloat();
+      if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
+        A.add(B, RM);
+      else 
+        A.subtract(B, RM);
+      ResultElements[DestIdx] = APValue(A);
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   case Builtin::BI__builtin_elementwise_fshl:
   case Builtin::BI__builtin_elementwise_fshr: {
     APValue SourceHi, SourceLo, SourceShift;
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index ecae7f2445a20..1d8bb6b6a1104 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -716,8 +716,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) {
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
                                                                   __m256 __b) {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 7d7aeece0ab13..82821eeb25bdc 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -105,7 +105,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a,
 ///    bits of the destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal
 ///    differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsub_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index efd7a0d7351f2..387e568243375 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -492,9 +492,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a,
 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_epi16(__m128i __a, __m128i __b) {
  return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -521,10 +522,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) {
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_pi16(__m64 __a, __m64 __b) {
  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
                                               (__v16qi)__anyext128(__b)));
}
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -541,9 +543,10 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) {
 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mulhrs_epi16(__m128i __a, __m128i __b) {
  return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 3fe1baf430190..ed39e3930a66c 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1093,34 +1093,35 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hadd_pd(A, B);
 }
-TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), 3.0, 7.0, 11.0, 15.0));
+TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){+1.0, +2.0, +3.0, +4.0}, (__m256d){+5.0, +6.0, +7.0, +8.0}), +3.0, +11.0, +7.0, +15.0));
 
 __m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hadd_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hadd_ps(A, B);
 }
-TEST_CONSTEXPR(_mm256_hadd_ps(
-    (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
-    (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f},
-    3.0f, 7.0f, 11.0f, 15.0f, 19.0f, 23.0f, 27.0f, 31.0f))
+
+TEST_CONSTEXPR(match_m256(_mm256_hadd_ps(
+    (__m256){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f},
+    (__m256){+9.0f, +10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f}),
+    +3.0f, +7.0f, +19.0f, +23.0f, +11.0f, +15.0f, +27.0f, +31.0f));
 
 __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_hsub_pd
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
-TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 4.0, 3.0}, (__m256d){10.0, 6.0, 16.0, 8.0}), -1.0,1.0,4.0,8.0));
+TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){+1.0, +2.0, +4.0, +3.0}, (__m256d){+10.0, +6.0, +16.0, +8.0}), -1.0,+4.0,+1.0,+8.0));
 
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hsub_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hsub_ps(A, B);
 }
-TEST_CONSTEXPR(_mm256_hsub_ps(
+TEST_CONSTEXPR(match_m256(_mm256_hsub_ps(
     (__m256){1.0f, 2.0f, 4.0f, 3.0f, 5.0f, 7.0f, 7.0f, 5.0f},
-    (__m256){9.0f, 6.0f, 11.0f, 8.0f, 13.0f, 17.0f, 15.0f, 11.0f},
-    -1.0f, 1.0f, -2.0f, 2.0f, -3.0f, 3.0f, -4.0f, 4.0f))
+    (__m256){6.0f, 9.0f, 11.0f, 8.0f, 13.0f, 17.0f, 15.0f, 11.0f}),
+    -1.0f, 1.0f, -3.0f, 3.0f, -2.0f, 2.0f, -4.0f, 4.0f));
 
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
   // CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index a7f5b5e6269a0..3f5da2d105ce3 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -527,7 +527,7 @@ __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
 }
 TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
     (__m256i)(__v8si){10, 20, 30,50,60,90,100,140},
-    (__m256i)(__v8si){200,150,260,200,420,350,800,740}),
+    (__m256i)(__v8si){200,150,260,200,420,350,800,720}),
     -10,-20,-30,-40,50,60,70,80));
 
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index f7960977c0f51..a82658627acd3 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -332,7 +332,7 @@ __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
-TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,4,3},(__m64)(__v4hi){10,5,0,-10}),-1,1,5,-10));
+TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,4,3},(__m64)(__v4hi){10,5,0,-10}),-1,1,5,10));
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
diff --git a/clang/test/CodeGen/X86/sse3-builtins.c b/clang/test/CodeGen/X86/sse3-builtins.c
index c53afc56e7246..a82dd4080670b 100644
--- a/clang/test/CodeGen/X86/sse3-builtins.c
+++ b/clang/test/CodeGen/X86/sse3-builtins.c
@@ -31,24 +31,28 @@ __m128d test_mm_hadd_pd(__m128d A, __m128d B) {
   // CHECK: call {{.*}}<2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hadd_pd(A, B);
 }
+TEST_CONSTEXPR(match_m128d(_mm_hadd_pd((__m128d){+1.0, +2.0}, (__m128d){+3.0, +4.0}), +3.0, +7.0));
 
 __m128 test_mm_hadd_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hadd_ps
   // CHECK: call {{.*}}<4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hadd_ps(A, B);
 }
+TEST_CONSTEXPR(match_m128(_mm_hadd_ps((__m128){+1.0f, +2.0f, +3.0f, +4.0f}, (__m128){+5.0f,+6.0f,+7.0f,+8.0f}), +3.0f, +7.0f, +11.0f, +15.0f));
 
 __m128d test_mm_hsub_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_hsub_pd
   // CHECK: call {{.*}}<2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hsub_pd(A, B);
 }
+TEST_CONSTEXPR(match_m128d(_mm_hsub_pd((__m128d){+1.0, +2.0}, (__m128d){+4.0, +3.0}), -1.0, +1.0));
 
 __m128 test_mm_hsub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hsub_ps
   // CHECK: call {{.*}}<4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hsub_ps(A, B);
 }
+TEST_CONSTEXPR(match_m128(_mm_hsub_ps((__m128){+1.0f, +2.0f, +4.0f, +3.0f}, (__m128){+5.0f,+7.0f,+10.0f,+8.0f}), -1.0f, +1.0f, -2.0f, +2.0f));
 
 __m128i test_mm_lddqu_si128(__m128i const* P) {
   // CHECK-LABEL: test_mm_lddqu_si128
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index d5b64df3f57a9..29ff01b91420f 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -89,7 +89,7 @@ __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
-TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,1,1}, (__m128i)(__v4si){7,5,10,5}), 1,0,-1,5))   
+TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,1,1}, (__m128i)(__v4si){7,5,10,5}), 1,0,2,5));   
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16

>From b7331576730bf7f886799d6e2286c23c914732b2 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Mon, 29 Sep 2025 01:52:30 +0800
Subject: [PATCH 09/12] format code

---
 clang/include/clang/Basic/BuiltinsX86.td | 59 ++++++++++++++++--------
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 34 +++++++-------
 clang/lib/AST/ExprConstant.cpp           | 40 ++++++++--------
 clang/lib/Headers/avxintrin.h            |  4 +-
 clang/lib/Headers/pmmintrin.h            |  5 +-
 clang/lib/Headers/tmmintrin.h            | 23 ++++-----
 6 files changed, 93 insertions(+), 72 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index a09569900ba90..dc851ca8e511c 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -117,10 +117,15 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
     }
   }
 
-  let Features = "sse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  let Features = "sse3",
+      Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
     foreach Op = ["hadd", "hsub"] in {
-      def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
-      def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
+      def Op#ps
+          : X86Builtin<
+                "_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
+      def Op#pd
+          : X86Builtin<
+                "_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
     }
   }
 
@@ -311,12 +316,16 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
   def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
 }
 
-let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
-    foreach Op = ["phadd", "phsub"] in {
-      def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-      def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-      def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-    }
+let Features = "ssse3",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  foreach Op = ["phadd", "phsub"] in {
+    def Op#w128
+        : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+    def Op#sw128
+        : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+    def Op#d128
+        : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  }
 }
 
 let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -511,8 +520,11 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
   def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
 
   foreach Op = ["hadd", "hsub"] in {
-    def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
-    def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
+    def Op#pd256
+        : X86Builtin<
+              "_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
+    def Op#ps256
+        : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
   }
 }
 
@@ -585,7 +597,8 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
 
 let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
-  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
+  def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, "
+                              "_Vector<32, char>, _Constant int)">;
   def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
   def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
   def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
@@ -656,12 +669,22 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
   def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
 
-  def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-  def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
-  def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
-  def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddw256
+      : X86Builtin<
+            "_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phaddd256
+      : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phaddsw256
+      : X86Builtin<
+            "_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubw256
+      : X86Builtin<
+            "_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
+  def phsubd256
+      : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
+  def phsubsw256
+      : X86Builtin<
+            "_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index e538988a24c85..d02e756c911d6 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2858,12 +2858,13 @@ static bool interp_builtin_horizontal_fps256_binop(
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
   unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 && 
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
   unsigned DstElem = 0;
   for (unsigned I = 0; I < 4; ++I) {
     using T = PrimConv<PT_Float>::T;
-    unsigned SrcIdx = 2 * I;       
+    unsigned SrcIdx = 2 * I;
     unsigned DestIdx = (I < 2) ? I : (I + 2);
     APFloat Elem1 = LHS.elem<T>(SrcIdx).getAPFloat();
     APFloat Elem2 = LHS.elem<T>(SrcIdx + 1).getAPFloat();
@@ -2871,7 +2872,7 @@ static bool interp_builtin_horizontal_fps256_binop(
   }
   for (unsigned I = 0; I < 4; ++I) {
     using T = PrimConv<PT_Float>::T;
-     unsigned SrcIdx = 2 * I;       
+    unsigned SrcIdx = 2 * I;
     unsigned DestIdx = (I < 2) ? (I + 2) : (I + 4);
     APFloat Elem1 = RHS.elem<T>(SrcIdx).getAPFloat();
     APFloat Elem2 = RHS.elem<T>(SrcIdx + 1).getAPFloat();
@@ -2897,19 +2898,20 @@ static bool interp_builtin_horizontal_fpd256_binop(
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
   unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 && 
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen);
+  assert(SourceLen % 2 == 0 &&
+         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
+             SourceLen);
   for (unsigned I = 0; I < 2; ++I) {
     using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = LHS.elem<T>(2*I).getAPFloat();
-    APFloat Elem2 = LHS.elem<T>(2*I + 1).getAPFloat();
-    Dst.elem<T>(2*I) = static_cast<T>(Fn(Elem1, Elem2, RM));
+    APFloat Elem1 = LHS.elem<T>(2 * I).getAPFloat();
+    APFloat Elem2 = LHS.elem<T>(2 * I + 1).getAPFloat();
+    Dst.elem<T>(2 * I) = static_cast<T>(Fn(Elem1, Elem2, RM));
   }
   for (unsigned I = 0; I < 2; ++I) {
     using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = RHS.elem<T>(2*I).getAPFloat();
-    APFloat Elem2 = RHS.elem<T>(2*I + 1).getAPFloat();
-    Dst.elem<T>(2*I+1) = static_cast<T>(Fn(Elem1, Elem2, RM));
+    APFloat Elem1 = RHS.elem<T>(2 * I).getAPFloat();
+    APFloat Elem2 = RHS.elem<T>(2 * I + 1).getAPFloat();
+    Dst.elem<T>(2 * I + 1) = static_cast<T>(Fn(Elem1, Elem2, RM));
   }
   Dst.initializeAllElements();
   return true;
@@ -3760,7 +3762,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.add(RHS, RM);
           return F;
         });
-  case clang::X86::BI__builtin_ia32_haddpd256:{
+  case clang::X86::BI__builtin_ia32_haddpd256: {
     return interp_builtin_horizontal_fpd256_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3769,7 +3771,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return F;
         });
   }
-  case clang::X86::BI__builtin_ia32_haddps256:{
+  case clang::X86::BI__builtin_ia32_haddps256: {
     return interp_builtin_horizontal_fps256_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3787,7 +3789,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.subtract(RHS, RM);
           return F;
         });
-  case clang::X86::BI__builtin_ia32_hsubpd256:{
+  case clang::X86::BI__builtin_ia32_hsubpd256: {
     return interp_builtin_horizontal_fpd256_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3796,7 +3798,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return F;
         });
   }
-  case clang::X86::BI__builtin_ia32_hsubps256:{
+  case clang::X86::BI__builtin_ia32_hsubps256: {
     return interp_builtin_horizontal_fps256_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index ee9b3acfab59b..112117249238b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12302,22 +12302,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     SmallVector<APValue, 4> ResultElements(4);
     llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
     for (unsigned i = 0; i < 2; ++i) {
-        APFloat A = SourceLHS.getVectorElt(2*i).getFloat();
-        APFloat B = SourceLHS.getVectorElt(2*i+1).getFloat();
-        if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
-            A.add(B, RM);
-        else 
-            A.subtract(B, RM);
-        ResultElements[2*i] = APValue(A);
+      APFloat A = SourceLHS.getVectorElt(2 * i).getFloat();
+      APFloat B = SourceLHS.getVectorElt(2 * i + 1).getFloat();
+      if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
+        A.add(B, RM);
+      else
+        A.subtract(B, RM);
+      ResultElements[2 * i] = APValue(A);
     }
     for (unsigned i = 0; i < 2; ++i) {
-        APFloat A = SourceRHS.getVectorElt(2*i).getFloat();
-        APFloat B = SourceRHS.getVectorElt(2*i+1).getFloat();
-        if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
-            A.add(B, RM);
-        else 
-            A.subtract(B, RM);
-        ResultElements[2*i+1] = APValue(A);
+      APFloat A = SourceRHS.getVectorElt(2 * i).getFloat();
+      APFloat B = SourceRHS.getVectorElt(2 * i + 1).getFloat();
+      if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
+        A.add(B, RM);
+      else
+        A.subtract(B, RM);
+      ResultElements[2 * i + 1] = APValue(A);
     }
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
@@ -12327,27 +12327,27 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
         !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
       return false;
-    SmallVector<APValue, 4> ResultElements(8); 
+    SmallVector<APValue, 4> ResultElements(8);
     llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
     for (unsigned i = 0; i < 4; ++i) {
-      unsigned SrcIdx = 2 * i;       
-      unsigned DestIdx = (i < 2) ? i : (i + 2); 
+      unsigned SrcIdx = 2 * i;
+      unsigned DestIdx = (i < 2) ? i : (i + 2);
       APFloat A = SourceLHS.getVectorElt(SrcIdx).getFloat();
       APFloat B = SourceLHS.getVectorElt(SrcIdx + 1).getFloat();
       if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
         A.add(B, RM);
-      else 
+      else
         A.subtract(B, RM);
       ResultElements[DestIdx] = APValue(A);
     }
     for (unsigned i = 0; i < 4; ++i) {
-      unsigned SrcIdx = 2 * i;      
+      unsigned SrcIdx = 2 * i;
       unsigned DestIdx = (i < 2) ? (i + 2) : (i + 4);
       APFloat A = SourceRHS.getVectorElt(SrcIdx).getFloat();
       APFloat B = SourceRHS.getVectorElt(SrcIdx + 1).getFloat();
       if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
         A.add(B, RM);
-      else 
+      else
         A.subtract(B, RM);
       ResultElements[DestIdx] = APValue(A);
     }
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 1d8bb6b6a1104..ecae7f2445a20 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -716,8 +716,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) {
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
- _mm256_hadd_ps(__m256 __a, __m256 __b) {
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
+                                                                   __m256 __b) {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 82821eeb25bdc..42bd343e326de 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -105,9 +105,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a,
 ///    bits of the destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal
 ///    differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_ps(__m128 __a,
+                                                                  __m128 __b) {
   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 }
 
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 387e568243375..f5803fdded2d5 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -492,10 +492,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a,
 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maddubs_epi16(__m128i __a,
+                                                               __m128i __b) {
+  return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -522,11 +521,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
-                                                 (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a,
+                                                            __m64 __b) {
+  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                               (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -543,10 +541,9 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b)
 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a,
+                                                              __m128i __b) {
+  return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit

>From 9a7c1383fc33d0613521beec559e7e4bf3196012 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 2 Oct 2025 19:49:54 +0800
Subject: [PATCH 10/12] Handle all 256-bit dual-lane instructions

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp |  73 +++++---------
 clang/lib/AST/ExprConstant.cpp           | 122 +++++++++++++----------
 clang/test/CodeGen/X86/avx2-builtins.c   |  12 +--
 3 files changed, 101 insertions(+), 106 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index ded0472645b50..4b5f1654fa973 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2738,42 +2738,38 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
 static bool interp_builtin_horizontal_int_binop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
-  assert(Call->getNumArgs() == 2);
-
-  assert(Call->getArg(0)->getType()->isVectorType() &&
-         Call->getArg(1)->getType()->isVectorType());
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  assert(VT->getElementType()->isIntegralOrEnumerationType());
   PrimType ElemT = *S.getContext().classify(VT->getElementType());
   bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
 
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
+  unsigned NumElts = VT->getNumElements();
+  unsigned EltBits = S.getASTContext().getIntWidth(VT->getElementType());
+  unsigned EltsPerLane = 128 / EltBits;
+  unsigned Lanes = NumElts * EltBits / 128;
+  unsigned DestIndex = 0;
+
+  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+    unsigned LaneStart = Lane * EltsPerLane;
+    for (unsigned I = 0; I < EltsPerLane; I += 2) {
+      INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+        APSInt Elem1 = LHS.elem<T>(LaneStart + I).toAPSInt();
+        APSInt Elem2 = LHS.elem<T>(LaneStart + I + 1).toAPSInt();
+        APSInt ResL = APSInt(Fn(Elem1, Elem2), DestUnsigned);
+        Dst.elem<T>(DestIndex++) = static_cast<T>(ResL);
+      });
+    }
 
-  unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 &&
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
-             SourceLen);
-  unsigned DstElem = 0;
-
-  for (unsigned I = 0; I != SourceLen; I += 2) {
-    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
-      APSInt Elem1 = LHS.elem<T>(I).toAPSInt();
-      APSInt Elem2 = LHS.elem<T>(I + 1).toAPSInt();
-      Dst.elem<T>(DstElem) =
-          static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
-    });
-    ++DstElem;
-  }
-  for (unsigned I = 0; I != SourceLen; I += 2) {
-    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
-      APSInt Elem1 = RHS.elem<T>(I).toAPSInt();
-      APSInt Elem2 = RHS.elem<T>(I + 1).toAPSInt();
-      Dst.elem<T>(DstElem) =
-          static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
-    });
-    ++DstElem;
+for (unsigned I = 0; I < EltsPerLane; I += 2) {
+      INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+        APSInt Elem1 = RHS.elem<T>(LaneStart + I).toAPSInt();
+        APSInt Elem2 = RHS.elem<T>(LaneStart + I + 1).toAPSInt();
+        APSInt ResR = APSInt(Fn(Elem1, Elem2), DestUnsigned);
+        Dst.elem<T>(DestIndex++) = static_cast<T>(ResR);
+      });
+    }
   }
   Dst.initializeAllElements();
   return true;
@@ -2784,9 +2780,6 @@ static bool interp_builtin_horizontal_fp_binop(
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
                                llvm::RoundingMode)>
         Fn) {
-  assert(Call->getNumArgs() == 2);
-  assert(Call->getArg(0)->getType()->isVectorType() &&
-         Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
@@ -2795,9 +2788,6 @@ static bool interp_builtin_horizontal_fp_binop(
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
   unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 &&
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
-             SourceLen);
   unsigned DstElem = 0;
   for (unsigned I = 0; I != SourceLen; I += 2) {
     using T = PrimConv<PT_Float>::T;
@@ -2820,9 +2810,6 @@ static bool interp_builtin_horizontal_fps256_binop(
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
                                llvm::RoundingMode)>
         Fn) {
-  assert(Call->getNumArgs() == 2);
-  assert(Call->getArg(0)->getType()->isVectorType() &&
-         Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
@@ -2830,11 +2817,6 @@ static bool interp_builtin_horizontal_fps256_binop(
   FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 &&
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
-             SourceLen);
-  unsigned DstElem = 0;
   for (unsigned I = 0; I < 4; ++I) {
     using T = PrimConv<PT_Float>::T;
     unsigned SrcIdx = 2 * I;
@@ -2860,9 +2842,6 @@ static bool interp_builtin_horizontal_fpd256_binop(
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
                                llvm::RoundingMode)>
         Fn) {
-  assert(Call->getNumArgs() == 2);
-  assert(Call->getArg(0)->getType()->isVectorType() &&
-         Call->getArg(1)->getType()->isVectorType());
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
@@ -2870,10 +2849,6 @@ static bool interp_builtin_horizontal_fpd256_binop(
   FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  unsigned SourceLen = VT->getNumElements();
-  assert(SourceLen % 2 == 0 &&
-         Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
-             SourceLen);
   for (unsigned I = 0; I < 2; ++I) {
     using T = PrimConv<PT_Float>::T;
     APFloat Elem1 = LHS.elem<T>(2 * I).getAPFloat();
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 112117249238b..cda9fcc7b891b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12169,7 +12169,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case clang::X86::BI__builtin_ia32_phsubd128:
   case clang::X86::BI__builtin_ia32_phsubd256:
   case clang::X86::BI__builtin_ia32_phsubsw128:
-  case clang::X86::BI__builtin_ia32_phsubsw256:{
+  case clang::X86::BI__builtin_ia32_phsubsw256: {
     APValue SourceLHS, SourceRHS;
     if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
         !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
@@ -12177,70 +12177,90 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
     bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
 
-      unsigned SourceLen = SourceLHS.getVectorLength();
-      SmallVector<APValue, 4> ResultElements;
-      ResultElements.reserve(SourceLen);
-      for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
-        APSInt LHSA = SourceLHS.getVectorElt(EltNum).getInt();
-        APSInt LHSB = SourceLHS.getVectorElt(EltNum + 1).getInt();
+    unsigned NumElts = SourceLHS.getVectorLength();
+    unsigned EltBits = Info.Ctx.getIntWidth(DestEltTy);
+    unsigned EltsPerLane = 128 / EltBits;
+    unsigned Lanes = NumElts * EltBits / 128;
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(NumElts);
 
+    for (unsigned LaneStart = 0; LaneStart != NumElts;
+         LaneStart += EltsPerLane) {
+      for (unsigned I = 0; I != EltsPerLane; I += 2) {
+        APSInt LHSA = SourceLHS.getVectorElt(LaneStart + I).getInt();
+        APSInt LHSB = SourceLHS.getVectorElt(LaneStart + I + 1).getInt();
         switch (E->getBuiltinCallee()) {
         case clang::X86::BI__builtin_ia32_phaddw128:
         case clang::X86::BI__builtin_ia32_phaddw256:
         case clang::X86::BI__builtin_ia32_phaddd128:
-        case clang::X86::BI__builtin_ia32_phaddd256:
-        ResultElements.push_back(
-            APValue(APSInt(LHSA+LHSB, DestUnsigned)));
-        break;
+        case clang::X86::BI__builtin_ia32_phaddd256: {
+          APSInt Res(LHSA + LHSB, DestUnsigned);
+          ResultElements.push_back(APValue(Res));
+          break;
+        }
         case clang::X86::BI__builtin_ia32_phaddsw128:
-        case clang::X86::BI__builtin_ia32_phaddsw256:
-          ResultElements.push_back(APValue(APSInt(
-              LHSA.isSigned() ? LHSA.sadd_sat(LHSB) : LHSA.uadd_sat(LHSB),
-              DestUnsigned)));
+        case clang::X86::BI__builtin_ia32_phaddsw256: {
+          APSInt Res(LHSA.isSigned() ? LHSA.sadd_sat(LHSB)
+                                     : LHSA.uadd_sat(LHSB),
+                     DestUnsigned);
+          ResultElements.push_back(APValue(Res));
           break;
+        }
         case clang::X86::BI__builtin_ia32_phsubw128:
         case clang::X86::BI__builtin_ia32_phsubw256:
         case clang::X86::BI__builtin_ia32_phsubd128:
-        case clang::X86::BI__builtin_ia32_phsubd256:
-          ResultElements.push_back(APValue(APSInt(LHSA - LHSB, DestUnsigned)));
+        case clang::X86::BI__builtin_ia32_phsubd256: {
+          APSInt Res(LHSA - LHSB, DestUnsigned);
+          ResultElements.push_back(APValue(Res));
           break;
+        }
         case clang::X86::BI__builtin_ia32_phsubsw128:
-        case clang::X86::BI__builtin_ia32_phsubsw256:
-          ResultElements.push_back(APValue(APSInt(
-              LHSA.isSigned() ? LHSA.ssub_sat(LHSB) : LHSA.usub_sat(LHSB),
-              DestUnsigned)));
+        case clang::X86::BI__builtin_ia32_phsubsw256: {
+          APSInt Res(LHSA.isSigned() ? LHSA.ssub_sat(LHSB)
+                                     : LHSA.usub_sat(LHSB),
+                     DestUnsigned);
+          ResultElements.push_back(APValue(Res));
           break;
+        }
+        }
       }
-    }
-    for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
-      APSInt RHSA = SourceRHS.getVectorElt(EltNum).getInt();
-      APSInt RHSB = SourceRHS.getVectorElt(EltNum + 1).getInt();
-
-      switch (E->getBuiltinCallee()) {
-      case clang::X86::BI__builtin_ia32_phaddw128:
-      case clang::X86::BI__builtin_ia32_phaddw256:
-      case clang::X86::BI__builtin_ia32_phaddd128:
-      case clang::X86::BI__builtin_ia32_phaddd256:
-        ResultElements.push_back(APValue(APSInt(RHSA + RHSB, DestUnsigned)));
-        break;
-      case clang::X86::BI__builtin_ia32_phaddsw128:
-      case clang::X86::BI__builtin_ia32_phaddsw256:
-        ResultElements.push_back(APValue(
-            APSInt(RHSA.isSigned() ? RHSA.sadd_sat(RHSB) : RHSA.uadd_sat(RHSB),
-                   DestUnsigned)));
-        break;
-      case clang::X86::BI__builtin_ia32_phsubw128:
-      case clang::X86::BI__builtin_ia32_phsubw256:
-      case clang::X86::BI__builtin_ia32_phsubd128:
-      case clang::X86::BI__builtin_ia32_phsubd256:
-        ResultElements.push_back(APValue(APSInt(RHSA - RHSB, DestUnsigned)));
-        break;
-      case clang::X86::BI__builtin_ia32_phsubsw128:
-      case clang::X86::BI__builtin_ia32_phsubsw256:
-        ResultElements.push_back(APValue(
-            APSInt(RHSA.isSigned() ? RHSA.ssub_sat(RHSB) : RHSA.usub_sat(RHSB),
-                   DestUnsigned)));
-        break;
+      for (unsigned I = 0; I != EltsPerLane; I += 2) {
+        APSInt RHSA = SourceRHS.getVectorElt(LaneStart + I).getInt();
+        APSInt RHSB = SourceRHS.getVectorElt(LaneStart + I + 1).getInt();
+        switch (E->getBuiltinCallee()) {
+        case clang::X86::BI__builtin_ia32_phaddw128:
+        case clang::X86::BI__builtin_ia32_phaddw256:
+        case clang::X86::BI__builtin_ia32_phaddd128:
+        case clang::X86::BI__builtin_ia32_phaddd256: {
+          APSInt Res(RHSA + RHSB, DestUnsigned);
+          ResultElements.push_back(APValue(Res));
+          break;
+        }
+        case clang::X86::BI__builtin_ia32_phaddsw128:
+        case clang::X86::BI__builtin_ia32_phaddsw256: {
+          APSInt Res(RHSA.isSigned() ? RHSA.sadd_sat(RHSB)
+                                     : RHSA.uadd_sat(RHSB),
+                     DestUnsigned);
+          ResultElements.push_back(APValue(Res));
+          break;
+        }
+        case clang::X86::BI__builtin_ia32_phsubw128:
+        case clang::X86::BI__builtin_ia32_phsubw256:
+        case clang::X86::BI__builtin_ia32_phsubd128:
+        case clang::X86::BI__builtin_ia32_phsubd256: {
+          APSInt Res(RHSA - RHSB, DestUnsigned);
+          ResultElements.push_back(APValue(Res));
+          break;
+        }
+        case clang::X86::BI__builtin_ia32_phsubsw128:
+        case clang::X86::BI__builtin_ia32_phsubsw256: {
+          APSInt Res(RHSA.isSigned() ? RHSA.ssub_sat(RHSB)
+                                     : RHSA.usub_sat(RHSB),
+                     DestUnsigned);
+          ResultElements.push_back(APValue(Res));
+          break;
+        }
+        }
       }
     }
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 3f5da2d105ce3..e5e9f3a6d0d4c 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -488,7 +488,7 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v16hi(_mm256_hadd_epi16(
     (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, 
     (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), 
-    3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63));
+    3,7,11,15,35,39,43,47,19,23,27,31,51,55,59,63));
 
 __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadd_epi32
@@ -498,7 +498,7 @@ __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
     (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
     (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
-    30,70,110,150,20,60,100,140));
+    30,70,20,60,110,150,100,140));
 
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadds_epi16
@@ -508,7 +508,7 @@ __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v16hi( _mm256_hadds_epi16(
     (__m256i)(__v16hi){32767, 32767, 1,2,3,4,5,6,7,8,9,10,11,12,13,14},
     (__m256i)(__v16hi){19,20,21,22,23,24,25,26,27,28,29,30,31,32, 32767, 5}),
-    32767, 3,7,11,15,19,23,27, 39,43,47,51,55,59,63, 32767));
+    32767, 3,7,11, 39,43,47,51,15,19,23,27, 55,59,63, 32767));
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi16
@@ -518,7 +518,7 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16(
     (__m256i)(__v16hi){2,1,1,2,5,3,3,5,7,4,4,7,9,5,5,9}, 
     (__m256i)(__v16hi){10,5,5,10,12,6,6,12,21,14,14,21,24,16,16,24}), 
-    1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8));
+    1,-1,2,-2,5,-5,6,-6,3,-3,4,-4, 7,-7,8,-8));
 
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi32
@@ -528,7 +528,7 @@ __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32(
     (__m256i)(__v8si){10, 20, 30,50,60,90,100,140},
     (__m256i)(__v8si){200,150,260,200,420,350,800,720}),
-    -10,-20,-30,-40,50,60,70,80));
+    -10,-20,50,60, -30,-40, 70,80));
 
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsubs_epi16
@@ -538,7 +538,7 @@ __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
 TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16(
     (__m256i)(__v16hi){32726, -100, 3, 2, 6, 4, 8, 5,15,10 ,21, 14, 27, 18, 100, 90},
     (__m256i)(__v16hi){40, 20, 100, 70, 200,150, 100,40, 1000,900,300,150, 500,300, 1, 1}),
-    32767, 1, 2, 3, 5, 7, 9, 10, 20, 30, 50, 60, 100, 150, 200, 0));
+    32767, 1, 2, 3,  20, 30, 50, 60, 5, 7, 9, 10, 100, 150, 200, 0));
 
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
   // CHECK-LABEL: test_mm_i32gather_epi32

>From a65f4fceadcf65c6e50603453296a9ec2302e539 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Thu, 2 Oct 2025 19:52:52 +0800
Subject: [PATCH 11/12] Handle all 256-bit dual-lane instructions

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 4b5f1654fa973..56c1bcc119813 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2762,7 +2762,7 @@ static bool interp_builtin_horizontal_int_binop(
       });
     }
 
-for (unsigned I = 0; I < EltsPerLane; I += 2) {
+    for (unsigned I = 0; I < EltsPerLane; I += 2) {
       INT_TYPE_SWITCH_NO_BOOL(ElemT, {
         APSInt Elem1 = RHS.elem<T>(LaneStart + I).toAPSInt();
         APSInt Elem2 = RHS.elem<T>(LaneStart + I + 1).toAPSInt();

>From 98773173adb7d6e38e8f20da34262039fe16a925 Mon Sep 17 00:00:00 2001
From: whyuuwang <whyuuwang at tencent.com>
Date: Tue, 7 Oct 2025 14:03:33 +0800
Subject: [PATCH 12/12] Adjust for 128-bit and 256-bit operands

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 128 ++++-------------------
 1 file changed, 22 insertions(+), 106 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 56c1bcc119813..4a9ead1162bbc 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2783,87 +2783,34 @@ static bool interp_builtin_horizontal_fp_binop(
   const Pointer &RHS = S.Stk.pop<Pointer>();
   const Pointer &LHS = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
-
   FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
   llvm::RoundingMode RM = getRoundingMode(FPO);
   const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  unsigned SourceLen = VT->getNumElements();
-  unsigned DstElem = 0;
-  for (unsigned I = 0; I != SourceLen; I += 2) {
-    using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
-    APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
-    Dst.elem<T>(DstElem++) = static_cast<T>(Fn(Elem1, Elem2, RM));
-  }
-  for (unsigned I = 0; I != SourceLen; I += 2) {
-    using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
-    APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
-    Dst.elem<T>(DstElem++) = static_cast<T>(Fn(Elem1, Elem2, RM));
-  }
-  Dst.initializeAllElements();
-  return true;
-}
 
-static bool interp_builtin_horizontal_fps256_binop(
-    InterpState &S, CodePtr OpPC, const CallExpr *Call,
-    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
-                               llvm::RoundingMode)>
-        Fn) {
-  const Pointer &RHS = S.Stk.pop<Pointer>();
-  const Pointer &LHS = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
+  unsigned NumElts = VT->getNumElements();
+  unsigned EltBits = S.getASTContext().getTypeSize(VT->getElementType());
+  unsigned NumLanes = NumElts * EltBits / 128;
+  unsigned NumElemsPerLane = NumElts / NumLanes;
+  unsigned HalfElemsPerLane = NumElemsPerLane / 2;
 
-  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
-  llvm::RoundingMode RM = getRoundingMode(FPO);
-  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  for (unsigned I = 0; I < 4; ++I) {
+  for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) {
     using T = PrimConv<PT_Float>::T;
-    unsigned SrcIdx = 2 * I;
-    unsigned DestIdx = (I < 2) ? I : (I + 2);
-    APFloat Elem1 = LHS.elem<T>(SrcIdx).getAPFloat();
-    APFloat Elem2 = LHS.elem<T>(SrcIdx + 1).getAPFloat();
-    Dst.elem<T>(DestIdx) = static_cast<T>(Fn(Elem1, Elem2, RM));
-  }
-  for (unsigned I = 0; I < 4; ++I) {
-    using T = PrimConv<PT_Float>::T;
-    unsigned SrcIdx = 2 * I;
-    unsigned DestIdx = (I < 2) ? (I + 2) : (I + 4);
-    APFloat Elem1 = RHS.elem<T>(SrcIdx).getAPFloat();
-    APFloat Elem2 = RHS.elem<T>(SrcIdx + 1).getAPFloat();
-    Dst.elem<T>(DestIdx) = static_cast<T>(Fn(Elem1, Elem2, RM));
+    for (unsigned E = 0; E != HalfElemsPerLane; ++E) {
+      APFloat Elem1 = LHS.elem<T>(L + (2 * E) + 0).getAPFloat();
+      APFloat Elem2 = LHS.elem<T>(L + (2 * E) + 1).getAPFloat();
+      Dst.elem<T>(L + E) = static_cast<T>(Fn(Elem1, Elem2, RM));
+    }
+    for (unsigned E = 0; E != HalfElemsPerLane; ++E) {
+      APFloat Elem1 = RHS.elem<T>(L + (2 * E) + 0).getAPFloat();
+      APFloat Elem2 = RHS.elem<T>(L + (2 * E) + 1).getAPFloat();
+      Dst.elem<T>(L + E + HalfElemsPerLane) =
+          static_cast<T>(Fn(Elem1, Elem2, RM));
+    }
   }
   Dst.initializeAllElements();
   return true;
 }
 
-static bool interp_builtin_horizontal_fpd256_binop(
-    InterpState &S, CodePtr OpPC, const CallExpr *Call,
-    llvm::function_ref<APFloat(const APFloat &, const APFloat &,
-                               llvm::RoundingMode)>
-        Fn) {
-  const Pointer &RHS = S.Stk.pop<Pointer>();
-  const Pointer &LHS = S.Stk.pop<Pointer>();
-  const Pointer &Dst = S.Stk.peek<Pointer>();
-
-  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
-  llvm::RoundingMode RM = getRoundingMode(FPO);
-  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
-  for (unsigned I = 0; I < 2; ++I) {
-    using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = LHS.elem<T>(2 * I).getAPFloat();
-    APFloat Elem2 = LHS.elem<T>(2 * I + 1).getAPFloat();
-    Dst.elem<T>(2 * I) = static_cast<T>(Fn(Elem1, Elem2, RM));
-  }
-  for (unsigned I = 0; I < 2; ++I) {
-    using T = PrimConv<PT_Float>::T;
-    APFloat Elem1 = RHS.elem<T>(2 * I).getAPFloat();
-    APFloat Elem2 = RHS.elem<T>(2 * I + 1).getAPFloat();
-    Dst.elem<T>(2 * I + 1) = static_cast<T>(Fn(Elem1, Elem2, RM));
-  }
-  Dst.initializeAllElements();
-  return true;
-}
 static bool interp__builtin_elementwise_triop_fp(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
@@ -3703,6 +3650,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         });
   case clang::X86::BI__builtin_ia32_haddpd:
   case clang::X86::BI__builtin_ia32_haddps:
+  case clang::X86::BI__builtin_ia32_haddpd256:
+  case clang::X86::BI__builtin_ia32_haddps256:
     return interp_builtin_horizontal_fp_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3710,26 +3659,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.add(RHS, RM);
           return F;
         });
-  case clang::X86::BI__builtin_ia32_haddpd256: {
-    return interp_builtin_horizontal_fpd256_binop(
-        S, OpPC, Call,
-        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
-          APFloat F = LHS;
-          F.add(RHS, RM);
-          return F;
-        });
-  }
-  case clang::X86::BI__builtin_ia32_haddps256: {
-    return interp_builtin_horizontal_fps256_binop(
-        S, OpPC, Call,
-        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
-          APFloat F = LHS;
-          F.add(RHS, RM);
-          return F;
-        });
-  }
   case clang::X86::BI__builtin_ia32_hsubpd:
   case clang::X86::BI__builtin_ia32_hsubps:
+  case clang::X86::BI__builtin_ia32_hsubpd256:
+  case clang::X86::BI__builtin_ia32_hsubps256:
     return interp_builtin_horizontal_fp_binop(
         S, OpPC, Call,
         [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
@@ -3737,24 +3670,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           F.subtract(RHS, RM);
           return F;
         });
-  case clang::X86::BI__builtin_ia32_hsubpd256: {
-    return interp_builtin_horizontal_fpd256_binop(
-        S, OpPC, Call,
-        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
-          APFloat F = LHS;
-          F.subtract(RHS, RM);
-          return F;
-        });
-  }
-  case clang::X86::BI__builtin_ia32_hsubps256: {
-    return interp_builtin_horizontal_fps256_binop(
-        S, OpPC, Call,
-        [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
-          APFloat F = LHS;
-          F.subtract(RHS, RM);
-          return F;
-        });
-  }
+
   case clang::X86::BI__builtin_ia32_pmuldq128:
   case clang::X86::BI__builtin_ia32_pmuldq256:
   case clang::X86::BI__builtin_ia32_pmuldq512:



More information about the cfe-commits mailing list