[clang] [X86][bytecode] Allow SSE/AVX BLENDVPS/PD intrinsics to be used in constexpr (PR #157126)
Simon Pilgrim via cfe-commits
cfe-commits at lists.llvm.org
Fri Sep 5 08:36:37 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/157126
BLENDV intrinsics use the signbit of the condition mask to select between the LHS (false) and RHS (true) operands.
Fixes #157066
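
As a quick illustration of the semantics this enables (a minimal sketch, not part of the patch: it assumes C++ with Clang's vector extensions and a target built with -msse4.1, and the element values are made up for illustration), each result lane takes the second operand when the sign bit of the corresponding mask lane is set, and the first operand otherwise, with the whole call now folding at compile time:

  #include <smmintrin.h>

  constexpr __m128d F = {1.0, 2.0};    // chosen where the mask sign bit is clear
  constexpr __m128d T = {-9.0, -8.0};  // chosen where the mask sign bit is set
  constexpr __m128d M = {-0.0, 1.0};   // sign bits: lane 0 set, lane 1 clear
  constexpr __m128d R = _mm_blendv_pd(F, T, M);
  static_assert(R[0] == -9.0 && R[1] == 2.0, "sign bit selects T over F per lane");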
From f7db91d029dc81f99225deef3144b498e7195fbe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 5 Sep 2025 16:35:45 +0100
Subject: [PATCH] [X86][bytecode] Allow SSE/AVX BLENDVPS/PD intrinsics to be
used in constexpr
BLENDV intrinsics use the signbit of the condition mask to select between the LHS (false) and RHS (true) operands.
Fixes #157066
---
clang/include/clang/Basic/BuiltinsX86.td | 12 ++++++++----
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++++++++
clang/lib/AST/ExprConstant.cpp | 9 +++++++--
clang/lib/Headers/avxintrin.h | 10 ++++------
clang/lib/Headers/smmintrin.h | 10 ++++------
clang/test/CodeGen/X86/avx-builtins.c | 2 ++
clang/test/CodeGen/X86/sse41-builtins.c | 2 ++
7 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 35b800f557ff5..b4ff550d27279 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -315,8 +315,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
- def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
- def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
@@ -335,7 +333,10 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
}
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
+ def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
+
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
}
@@ -470,8 +471,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
- def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
- def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
@@ -495,6 +494,11 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
}
+let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
+ def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
+}
+
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 4d3d6397e5ed9..acc9f9e2cf1e4 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3421,6 +3421,17 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return F;
});
+ case clang::X86::BI__builtin_ia32_blendvpd:
+ case clang::X86::BI__builtin_ia32_blendvpd256:
+ case clang::X86::BI__builtin_ia32_blendvps:
+ case clang::X86::BI__builtin_ia32_blendvps256:
+ return interp__builtin_elementwise_triop_fp(
+ S, OpPC, Call,
+ [](const APFloat &F, const APFloat &T, const APFloat &C,
+ llvm::RoundingMode) {
+ return C.bitcastToAPInt().isNegative() ? T : F;
+ });
+
case clang::X86::BI__builtin_ia32_pblendvb128:
case clang::X86::BI__builtin_ia32_pblendvb256:
return interp__builtin_elementwise_triop(
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 100e944f9b48c..90dff96869fc7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11995,6 +11995,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_blendvpd:
+ case X86::BI__builtin_ia32_blendvpd256:
+ case X86::BI__builtin_ia32_blendvps:
+ case X86::BI__builtin_ia32_blendvps256:
case X86::BI__builtin_ia32_pblendvb128:
case X86::BI__builtin_ia32_pblendvb256: {
// SSE blendv by mask signbit: "Result = C[] < 0 ? T[] : F[]".
@@ -12011,8 +12015,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
const APValue &F = SourceF.getVectorElt(EltNum);
const APValue &T = SourceT.getVectorElt(EltNum);
- APInt C = SourceC.getVectorElt(EltNum).getInt();
- ResultElements.push_back(C.isNegative() ? T : F);
+ const APValue &C = SourceC.getVectorElt(EltNum);
+ APInt M = C.isInt() ? (APInt)C.getInt() : C.getFloat().bitcastToAPInt();
+ ResultElements.push_back(M.isNegative() ? T : F);
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index bfd62dfd9a6b2..e9d8be320bec9 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -1402,9 +1402,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// 64-bit element in operand \a __b is copied to the same position in the
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) {
return (__m256d)__builtin_ia32_blendvpd256(
(__v4df)__a, (__v4df)__b, (__v4df)__c);
}
@@ -1430,9 +1429,8 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
/// corresponding 32-bit element in operand \a __b is copied to the same
/// position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) {
return (__m256)__builtin_ia32_blendvps256(
(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
}
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index b10fa2fe375a1..6319fdbbeb8f0 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -439,9 +439,8 @@
/// position in the result. When a mask bit is 1, the corresponding 64-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
- __m128d __V2,
- __m128d __M) {
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M) {
return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
(__v2df)__M);
}
@@ -466,9 +465,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
/// position in the result. When a mask bit is 1, the corresponding 32-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
- __m128 __V2,
- __m128 __M) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M) {
return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
(__v4sf)__M);
}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 622ac5d50aaf0..f255dbe1b2adc 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -99,12 +99,14 @@ __m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
// CHECK: call {{.*}}<4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
return _mm256_blendv_pd(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m256d(_mm256_blendv_pd((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0},(__m256d)(__v4df){-100.0, -101.0, -102.0, -103.0},(__m256d)(__v4df){0.0, -1.0, 1.0, -1.0}), 1.0, -101.0, 3.0, -103.0));
__m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) {
// CHECK-LABEL: test_mm256_blendv_ps
// CHECK: call {{.*}}<8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
return _mm256_blendv_ps(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m256(_mm256_blendv_ps((__m256)(__v8sf){0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f},(__m256)(__v8sf){-100.0f, -101.0f, -102.0f, -103.0f, -104.0f, -105.0f, -106.0f, -107.0f},(__m256)(__v8sf){-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, -6.0f, 7.0f, -0.0f}), -100.0f, 1.0f, -102.0f, 3.0f, -104.0f, -105.0f, 6.0f, -107.0f));
__m256d test_mm256_broadcast_pd(__m128d* A) {
// CHECK-LABEL: test_mm256_broadcast_pd
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 3abfee0409f16..dca161c8038a2 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -52,12 +52,14 @@ __m128d test_mm_blendv_pd(__m128d V1, __m128d V2, __m128d V3) {
// CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
return _mm_blendv_pd(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m128d(_mm_blendv_pd((__m128d)(__v2df){2.0, -4.0},(__m128d)(__v2df){-111.0, +222.0},(__m128d)(__v2df){2.0, -2.0}), 2.0, 222.0));
__m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
// CHECK-LABEL: test_mm_blendv_ps
// CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
return _mm_blendv_ps(V1, V2, V3);
}
+TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f},(__m128)(__v4sf){-100.0f, -101.0f, -102.0f, -103.0f},(__m128)(__v4sf){-1.0f, 2.0f, -3.0f, 0.0f}), -100.0f, 1.0f, -102.0f, 3.0f));
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd