[clang] [X86][bytecode] Allow SSE/AVX BLENDVPS/PD intrinsics to be used in constexpr (PR #157126)
Simon Pilgrim via cfe-commits
cfe-commits at lists.llvm.org
Fri Sep 5 08:36:37 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/157126
BLENDV intrinsics use the signbit of the condition mask to select between the LHS (false) and RHS (true) operands.
Fixes #157066
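
As a quick illustration of the semantics this enables (a minimal sketch, not part of the patch: it assumes C++ with Clang's vector extensions and a target built with -msse4.1, and the element values are made up for illustration), each result lane takes the second operand when the sign bit of the corresponding mask lane is set, and the first operand otherwise, with the whole call now folding at compile time:

  #include <smmintrin.h>

  constexpr __m128d F = {1.0, 2.0};    // chosen where the mask sign bit is clear
  constexpr __m128d T = {-9.0, -8.0};  // chosen where the mask sign bit is set
  constexpr __m128d M = {-0.0, 1.0};   // sign bits: lane 0 set, lane 1 clear
  constexpr __m128d R = _mm_blendv_pd(F, T, M);
  static_assert(R[0] == -9.0 && R[1] == 2.0, "sign bit selects T over F per lane");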
From f7db91d029dc81f99225deef3144b498e7195fbe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 5 Sep 2025 16:35:45 +0100
Subject: [PATCH] [X86][bytecode] Allow SSE/AVX BLENDVPS/PD intrinsics to be
used in constexpr
BLENDV intrinsics use the signbit of the condition mask to select between the LHS (false) and RHS (true) operands.
Fixes #157066
---
clang/include/clang/Basic/BuiltinsX86.td | 12 ++++++++----
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++++++++
clang/lib/AST/ExprConstant.cpp | 9 +++++++--
clang/lib/Headers/avxintrin.h | 10 ++++------
clang/lib/Headers/smmintrin.h | 10 ++++------
clang/test/CodeGen/X86/avx-builtins.c | 2 ++
clang/test/CodeGen/X86/sse41-builtins.c | 2 ++
7 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 35b800f557ff5..b4ff550d27279 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -315,8 +315,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
- def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
- def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
@@ -335,7 +333,10 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
}
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+ def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
}
@@ -470,8 +471,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
- def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
- def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
@@ -495,6 +494,11 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
}
+let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+ def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+}
+
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 4d3d6397e5ed9..acc9f9e2cf1e4 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3421,6 +3421,17 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return F;
});
+ case clang::X86::BI__builtin_ia32_blendvpd:
+ case clang::X86::BI__builtin_ia32_blendvpd256:
+ case clang::X86::BI__builtin_ia32_blendvps:
+ case clang::X86::BI__builtin_ia32_blendvps256:
+ return interp__builtin_elementwise_triop_fp(
+ S, OpPC, Call,
+ [](const APFloat &F, const APFloat &T, const APFloat &C,
+ llvm::RoundingMode) {
+ return C.bitcastToAPInt().isNegative() ? T : F;
+ });
+
case clang::X86::BI__builtin_ia32_pblendvb128:
case clang::X86::BI__builtin_ia32_pblendvb256:
return interp__builtin_elementwise_triop(
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 100e944f9b48c..90dff96869fc7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11995,6 +11995,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_blendvpd:
+ case X86::BI__builtin_ia32_blendvpd256:
+ case X86::BI__builtin_ia32_blendvps:
+ case X86::BI__builtin_ia32_blendvps256:
case X86::BI__builtin_ia32_pblendvb128:
case X86::BI__builtin_ia32_pblendvb256: {
// SSE blendv by mask signbit: "Result = C[] < 0 ? T[] : F[]".
@@ -12011,8 +12015,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
const APValue &F = SourceF.getVectorElt(EltNum);
const APValue &T = SourceT.getVectorElt(EltNum);
- APInt C = SourceC.getVectorElt(EltNum).getInt();
- ResultElements.push_back(C.isNegative() ? T : F);
+ const APValue &C = SourceC.getVectorElt(EltNum);
+ APInt M = C.isInt() ? (APInt)C.getInt() : C.getFloat().bitcastToAPInt();
+ ResultElements.push_back(M.isNegative() ? T : F);
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index bfd62dfd9a6b2..e9d8be320bec9 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -1402,9 +1402,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 64-bit element in operand \a __b is copied to the same position in the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
return (__m256d)__builtin_ia32_blendvpd256(
(__v4df)__a, (__v4df)__b, (__v4df)__c);
}
@@ -1430,9 +1429,8 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
/// corresponding 32-bit element in operand \a __b is copied to the same
/// position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
return (__m256)__builtin_ia32_blendvps256(
(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index b10fa2fe375a1..6319fdbbeb8f0 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -439,9 +439,8 @@
/// position in the result. When a mask bit is 1, the corresponding 64-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
- __m128d __V2,
- __m128d __M) {
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M) {
return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
(__v2df)__M);
}
@@ -466,9 +465,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
/// position in the result. When a mask bit is 1, the corresponding 32-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
- __m128 __V2,
- __m128 __M) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M) {
return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
(__v4sf)__M);
}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 622ac5d50aaf0..f255dbe1b2adc 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -99,12 +99,14 @@ __m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_blendv_pd(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m256d(_mm256_blendv_pd((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0},(__m256d)(__v4df){-100.0, -101.0, -102.0, -103.0},(__m256d)(__v4df){0.0, -1.0, 1.0, -1.0}), 1.0, -101.0, 3.0, -103.0));
__m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) {
// CHECK-LABEL: test_mm256_blendv_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_blendv_ps(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m256(_mm256_blendv_ps((__m256)(__v8sf){0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f},(__m256)(__v8sf){-100.0f, -101.0f, -102.0f, -103.0f, -104.0f, -105.0f, -106.0f, -107.0f},(__m256)(__v8sf){-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, -6.0f, 7.0f, -0.0f}), -100.0f, 1.0f, -102.0f, 3.0f, -104.0f, -105.0f, 6.0f, -107.0f));
__m256d test_mm256_broadcast_pd(__m128d* A) {
// CHECK-LABEL: test_mm256_broadcast_pd
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 3abfee0409f16..dca161c8038a2 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -52,12 +52,14 @@ __m128d test_mm_blendv_pd(__m128d V1, __m128d V2, __m128d V3) {
// CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_blendv_pd(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m128d(_mm_blendv_pd((__m128d)(__v2df){2.0, -4.0},(__m128d)(__v2df){-111.0, +222.0},(__m128d)(__v2df){2.0, -2.0}), 2.0, 222.0));
__m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
// CHECK-LABEL: test_mm_blendv_ps
// CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_blendv_ps(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f},(__m128)(__v4sf){-100.0f, -101.0f, -102.0f, -103.0f},(__m128)(__v4sf){-1.0f, 2.0f, -3.0f, 0.0f}), -100.0f, 1.0f, -102.0f, 3.0f));
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd