[clang] [Headers][X86] Enable constexpr handling for MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics (PR #161563)
Bhasawut Singhaphan via cfe-commits
cfe-commits at lists.llvm.org
Tue Oct 7 03:59:59 PDT 2025
https://github.com/markbhasawut updated https://github.com/llvm/llvm-project/pull/161563
>From 6095d8f01cfce820ab456686c4555ff9492ce23d Mon Sep 17 00:00:00 2001
From: Bhasawut Singhaphan <bhasawut at gmail.com>
Date: Mon, 22 Sep 2025 17:09:08 +0700
Subject: [PATCH 1/4] [Headers][X86] Enable constexpr handling for
MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics
---
clang/include/clang/Basic/BuiltinsX86.td | 19 +++++++++++--------
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 16 ++++++++++++++++
clang/lib/AST/ExprConstant.cpp | 14 ++++++++++++++
3 files changed, 41 insertions(+), 8 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 41652259cf6a3..217589d7add1d 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -123,13 +123,16 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
- def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
+
+ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
+ }
}
// AVX
@@ -278,13 +281,14 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
- def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
}
let Features = "sse2",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
+
def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
@@ -581,8 +585,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
- def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
- def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
@@ -619,6 +621,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+ def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
+ def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
+
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
@@ -1378,10 +1383,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
}
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
- def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
-}
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
@@ -1999,6 +2000,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
}
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
+ def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1eea813b8c556..b1928ef121325 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2583,6 +2583,12 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp__builtin_ia32_pmadd(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call,
+ unsigned BuiltinID) {
+ return true; // TODO: Implement the builtin.
+}
+
static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned BuiltinID) {
@@ -3494,6 +3500,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return interp__builtin_elementwise_int_binop(S, OpPC, Call,
llvm::APIntOps::avgCeilU);
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ return true; // TODO: Use interp__builtin_i32_pmadd.
+
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ return true; // TODO: Use interp__builtin_i32_pmadd.
+
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 618e1636e9e53..aa0794c46d092 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11778,6 +11778,20 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pavgw512:
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ return true; // TODO: Handle __builtin_ia32_pmaddub
+
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ return true; // TODO: Handle __builtin_ia32_pmadd
+ });
+
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
>From 9c1a7e79bdb94c1707b73967ce92215e645c1929 Mon Sep 17 00:00:00 2001
From: Bhasawut Singhaphan <bhasawut at gmail.com>
Date: Wed, 24 Sep 2025 19:38:57 +0000
Subject: [PATCH 2/4] Modified InterpBuiltin.cpp and ExprConstant.cpp
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 53 ++++++++++++++++++++----
clang/lib/AST/ExprConstant.cpp | 49 ++++++++++++++++++----
2 files changed, 87 insertions(+), 15 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index b1928ef121325..4ccc6c20504bb 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2583,10 +2583,35 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_ia32_pmadd(InterpState &S, CodePtr OpPC,
- const CallExpr *Call,
- unsigned BuiltinID) {
- return true; // TODO: Implement the builtin.
+static bool interp__builtin_ia32_pmadd(
+ InterpState &S, CodePtr OpPC, const CallExpr *Call,
+ llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &,
+ const APSInt &)>
+ Fn) {
+ assert(Call->getArg(0)->getType()->isVectorType() &&
+ Call->getArg(1)->getType()->isVectorType());
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ PrimType ElemT = *S.getContext().classify(VT->getElementType());
+ unsigned NumElems = VT->getNumElements();
+ bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+ for (unsigned I = 0; I != NumElems; I += 2) {
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ APSInt LoLHS = LHS.elem<T>(I).toAPSInt();
+ APSInt HiLHS = LHS.elem<T>(I + 1).toAPSInt();
+ APSInt LoRHS = RHS.elem<T>(I).toAPSInt();
+ APSInt HiRHS = RHS.elem<T>(I + 1).toAPSInt();
+ Dst.elem<T>(I) =
+ static_cast<T>(APSInt(Fn(LoLHS, HiLHS, LoRHS, HiRHS), DestUnsigned));
+ });
+ }
+
+ Dst.initializeAllElements();
+ return true;
}
static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
@@ -3503,12 +3528,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
- return true; // TODO: Use interp__builtin_i32_pmadd.
-
+ return interp__builtin_ia32_pmadd(
+ S, OpPC, Call,
+ [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
+ const APSInt &HiRHS) {
+ unsigned BitWidth = 2 * LHS.getBitWidth();
+ return (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
+ .sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)));
+ });
+
case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddwd512:
- return true; // TODO: Use interp__builtin_i32_pmadd.
+ return interp__builtin_ia32_pmadd(
+ S, OpPC, Call,
+ [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
+ const APSInt &HiRHS) {
+ unsigned BitWidth = 2 * LHS.getBitWidth();
+ return (LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
+ (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
+ });
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index aa0794c46d092..4e5d37a6ba96e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11779,18 +11779,51 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);
case clang::X86::BI__builtin_ia32_pmaddubsw128:
- case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
- case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
- case clang::X86::BI__builtin_ia32_pmaddwd512:
- return true; // TODO: Handle __builtin_ia32_pmaddub
-
case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
- case clang::X86::BI__builtin_ia32_pmaddwd512:
- return true; // TODO: Handle __builtin_ia32_pmadd
- });
+ case clang::X86::BI__builtin_ia32_pmaddwd512: {
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+
+ unsigned SourceLen = SourceLHS.getVectorLength();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen / 2);
+
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ const APSInt &LoLHS = SourceLHS.getVectorElt(EltNum).getInt();
+ const APSInt &HiLHS = SourceLHS.getVectorElt(EltNum + 1).getInt();
+ const APSInt &LoRHS = SourceRHS.getVectorElt(EltNum).getInt();
+ const APSInt &HiRHS = SourceRHS.getVectorElt(EltNum + 1).getInt();
+ unsigned BitWidth = 2 * LoLHS.getBitWidth();
+
+ switch (E->getBuiltinCallee()) {
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ ResultElements.push_back(APValue(
+ APSInt(
+ (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
+ .sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)))),
+ DestUnsigned));
+ break;
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ ResultElements.push_back(
+ APValue(APSInt((LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
+ (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth))),
+ DestUnsigned));
+ break;
+ }
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
>From 2ab5ec4e2fcdbcf3f8660a55d9743fcb0f67567e Mon Sep 17 00:00:00 2001
From: Bhasawut Singhaphan <bhasawut at gmail.com>
Date: Wed, 1 Oct 2025 15:16:25 +0000
Subject: [PATCH 3/4] Update MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics to
be used in constexpr
---
clang/lib/Headers/avx2intrin.h | 4 ++--
clang/lib/Headers/avx512bwintrin.h | 12 ++++++------
clang/lib/Headers/avx512vlbwintrin.h | 16 ++++++++--------
clang/lib/Headers/emmintrin.h | 2 +-
clang/lib/Headers/mmintrin.h | 6 +++---
clang/lib/Headers/tmmintrin.h | 11 +++++++----
6 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 31759c5386d9f..c6bff41973ef8 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1035,7 +1035,7 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
@@ -1067,7 +1067,7 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_madd_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index c36bd814725fa..473fe94af65d8 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1064,12 +1064,12 @@ _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
@@ -1077,26 +1077,26 @@ _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi)_mm512_maddubs_epi16(__X, __Y),
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_madd_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
(__v16si)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 5e6daa8f7b260..81e4cbb9615c1 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1295,21 +1295,21 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
(__v16hi)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1317,35 +1317,35 @@ _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
(__v16hi)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_maddubs_epi16(__X, __Y),
(__v16hi)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
(__v8si)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 6597e7e7d4030..2b1fd7e3a8b07 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2290,7 +2290,7 @@ _mm_avg_epu16(__m128i __a, __m128i __b) {
/// A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_madd_epi16(__m128i __a,
__m128i __b) {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 5f617530b6f78..145980410e2ec 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -679,11 +679,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) {
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
- return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
- (__v8hi)__anyext128(__m2)));
+ return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
+ (__v8hi)__zext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index d40f0c56b2c5a..8d6eddbfcb731 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -23,6 +23,9 @@
#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x) \
+ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+ 1, 2, 3)
#define __anyext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, -1, -1)
@@ -504,7 +507,7 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
@@ -534,11 +537,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
- return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
- (__v16qi)__anyext128(__b)));
+ return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
+ (__v16qi)__zext128(__b)));
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
>From 6836847f138d8bc58dbc28bab6b52ff7c9edecf9 Mon Sep 17 00:00:00 2001
From: Bhasawut Singhaphan <bhasawut at gmail.com>
Date: Thu, 2 Oct 2025 01:38:20 +0700
Subject: [PATCH 4/4] Address clang-format issue
---
clang/lib/Headers/avx2intrin.h | 8 +++-----
clang/lib/Headers/emmintrin.h | 4 ++--
clang/lib/Headers/mmintrin.h | 7 +++----
clang/lib/Headers/tmmintrin.h | 12 +++++-------
4 files changed, 13 insertions(+), 18 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index c6bff41973ef8..4aaca2db8236a 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1036,9 +1036,8 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_maddubs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
}
/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
@@ -1068,8 +1067,7 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_madd_epi16(__m256i __a, __m256i __b)
-{
+_mm256_madd_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 2b1fd7e3a8b07..454e9a2504949 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2290,8 +2290,8 @@ _mm_avg_epu16(__m128i __a, __m128i __b) {
/// A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_madd_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_madd_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 145980410e2ec..aca78e6986ad9 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -680,10 +680,9 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) {
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
- (__v8hi)__zext128(__m2)));
+_mm_madd_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
+ (__v8hi)__zext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 8d6eddbfcb731..49c9d7c0eee92 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -508,9 +508,8 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+_mm_maddubs_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -538,10 +537,9 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
- (__v16qi)__zext128(__b)));
+_mm_maddubs_pi16(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
+ (__v16qi)__zext128(__b)));
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
More information about the cfe-commits
mailing list