[clang] [Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr (PR #156003)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 17 20:17:00 PDT 2025
https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/156003
>From 61f8bfd509889a0154b838ad357adb2563b620ea Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Tue, 16 Sep 2025 23:30:47 -0900
Subject: [PATCH 1/2] [Headers][X86] VectorExprEvaluator::VisitCallExpr - allow
SSE/AVX2/AVX512 pack intrinsics to be used in constexpr
---
clang/include/clang/Basic/BuiltinsX86.td | 26 +++--
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 115 +++++++++++++++++++++
clang/lib/AST/ExprConstant.cpp | 86 ++++++++++++++-
clang/lib/Headers/avx2intrin.h | 20 ++--
clang/lib/Headers/avx512bwintrin.h | 20 ++--
clang/lib/Headers/emmintrin.h | 12 +--
clang/lib/Headers/mmintrin.h | 27 +++--
clang/lib/Headers/smmintrin.h | 4 +-
clang/test/CodeGen/X86/avx2-builtins.c | 4 +
clang/test/CodeGen/X86/avx512bw-builtins.c | 20 ++--
clang/test/CodeGen/X86/mmx-builtins.c | 3 +
clang/test/CodeGen/X86/sse2-builtins.c | 3 +
clang/test/CodeGen/X86/sse41-builtins.c | 1 +
13 files changed, 274 insertions(+), 67 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index aac502091b57e..730fd860b3330 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -93,9 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
}
let Features = "sse2" in {
- def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
- def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
- def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+ def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+ def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+ def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
}
let Features = "sse3" in {
@@ -312,7 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
- def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -338,6 +337,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVector
def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+ def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -570,10 +570,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
- def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
- def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
@@ -644,6 +640,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+ def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+ def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1305,11 +1306,14 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
- def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+ def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+}
+
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
- def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+ def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
- def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
+ def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
}
let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index b7b6d65c38e97..34cf8cb3bf30c 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2604,6 +2604,68 @@ static bool interp__builtin_elementwise_int_binop(
return true;
}
+static bool interp__builtin_x86_pack(
+ InterpState &S, CodePtr, const CallExpr *E, unsigned SrcBits, bool IsUnsat,
+ llvm::function_ref<APSInt(const APSInt &X, unsigned, unsigned, bool)>
+ narrowElement) {
+ const auto *VT0 = E->getArg(0)->getType()->castAs<VectorType>();
+ const auto *VT1 = E->getArg(1)->getType()->castAs<VectorType>();
+ assert(VT0 && VT1 && "pack builtin VT0 and VT1 must be VectorType");
+ assert(VT0->getElementType() == VT1->getElementType() &&
+ VT0->getNumElements() == VT1->getNumElements() &&
+ "pack builtin VT0 and VT1 ElementType must be same");
+
+ const unsigned LHSVecLen = VT0->getNumElements();
+
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ auto readSrc = [&](const Pointer &P, unsigned I) -> APSInt {
+ if (SrcBits == 16) {
+ int16_t v = P.elem<int16_t>(I);
+ return APSInt(APInt(16, static_cast<uint16_t>(v)), /*isUnsigned=*/false);
+ } else {
+ int32_t v = P.elem<int32_t>(I);
+ return APSInt(APInt(32, static_cast<uint32_t>(v)), /*isUnsigned=*/false);
+ }
+ };
+
+ const unsigned DstBits = SrcBits / 2;
+ auto writeDst = [&](unsigned I, const APSInt &Y) {
+ if (DstBits == 8) {
+ if (IsUnsat)
+ Dst.elem<uint8_t>(I) = static_cast<uint8_t>(Y.getZExtValue());
+ else
+ Dst.elem<int8_t>(I) = static_cast<int8_t>(Y.getSExtValue());
+ } else {
+ if (IsUnsat)
+ Dst.elem<uint16_t>(I) = static_cast<uint16_t>(Y.getZExtValue());
+ else
+ Dst.elem<int16_t>(I) = static_cast<int16_t>(Y.getSExtValue());
+ }
+ };
+
+ const unsigned VectorBits = LHSVecLen * SrcBits;
+ const unsigned srcPerLane = VectorBits >= 128 ? (128 / SrcBits) : LHSVecLen;
+ const unsigned lanes = VectorBits >= 128 ? (VectorBits / 128) : 1;
+
+ for (unsigned lane = 0; lane < lanes; ++lane) {
+ const unsigned baseSrc = lane * srcPerLane;
+ const unsigned baseDst = lane * (2 * srcPerLane);
+ for (unsigned i = 0; i < srcPerLane; ++i)
+ writeDst(baseDst + i, narrowElement(readSrc(LHS, baseSrc + i), SrcBits,
+ DstBits, IsUnsat));
+ for (unsigned i = 0; i < srcPerLane; ++i)
+ writeDst(
+ baseDst + srcPerLane + i,
+ narrowElement(readSrc(RHS, baseSrc + i), SrcBits, DstBits, IsUnsat));
+ }
+
+ Dst.initializeAllElements();
+ return true;
+}
+
static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned BuiltinID) {
@@ -2920,6 +2982,35 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return Invalid(S, OpPC);
const InterpFrame *Frame = S.Current;
+
+ auto NarrowElement = [](const APSInt &Element, unsigned SrcBits,
+ unsigned DstBits, bool IsUnsat) -> APSInt {
+ assert(Element.getBitWidth() == SrcBits &&
+ "pack builtin LHS/RHS Element Width must equal to SrcBits");
+
+ APInt Tmp = Element;
+ const APInt Lo = IsUnsat ? APInt(SrcBits, 0)
+ : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+ const APInt Hi = IsUnsat ? APInt::getMaxValue(DstBits).zext(SrcBits)
+ : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+ APInt Narrow;
+ if (IsUnsat) {
+ if (Tmp.isNegative())
+ Tmp = Lo;
+ else if (Tmp.ugt(Hi))
+ Tmp = Hi;
+ Narrow = Tmp.zextOrTrunc(DstBits);
+ } else {
+ if (Tmp.sgt(Hi))
+ Tmp = Hi;
+ else if (Tmp.slt(Lo))
+ Tmp = Lo;
+ Narrow = Tmp.sextOrTrunc(DstBits);
+ }
+ return APSInt(Narrow, /*isUnsigned=*/IsUnsat);
+ };
+
switch (BuiltinID) {
case Builtin::BI__builtin_is_constant_evaluated:
return interp__builtin_is_constant_evaluated(S, OpPC, Frame, Call);
@@ -3435,6 +3526,30 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
}
return LHS.lshr(RHS.getZExtValue());
});
+ case clang::X86::BI__builtin_ia32_packsswb128:
+ case clang::X86::BI__builtin_ia32_packsswb256:
+ case clang::X86::BI__builtin_ia32_packsswb512:
+ return interp__builtin_x86_pack(S, OpPC, Call,
+ /*SrcBits=*/16, /*IsUnsat=*/false,
+ NarrowElement);
+ case clang::X86::BI__builtin_ia32_packuswb128:
+ case clang::X86::BI__builtin_ia32_packuswb256:
+ case clang::X86::BI__builtin_ia32_packuswb512:
+ return interp__builtin_x86_pack(S, OpPC, Call,
+ /*SrcBits=*/16, /*IsUnsat=*/true,
+ NarrowElement);
+ case clang::X86::BI__builtin_ia32_packssdw128:
+ case clang::X86::BI__builtin_ia32_packssdw256:
+ case clang::X86::BI__builtin_ia32_packssdw512:
+ return interp__builtin_x86_pack(S, OpPC, Call,
+ /*SrcBits=*/32, /*IsUnsat=*/false,
+ NarrowElement);
+ case clang::X86::BI__builtin_ia32_packusdw128:
+ case clang::X86::BI__builtin_ia32_packusdw256:
+ case clang::X86::BI__builtin_ia32_packusdw512:
+ return interp__builtin_x86_pack(S, OpPC, Call,
+ /*SrcBits=*/32, /*IsUnsat=*/true,
+ NarrowElement);
case clang::X86::BI__builtin_ia32_vprotbi:
case clang::X86::BI__builtin_ia32_vprotdi:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b2cb9e2b3c347..86f74b52e7a0a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11575,6 +11575,43 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
return false;
}
+static bool evalPackBuiltin(
+ const CallExpr *E, EvalInfo &Info, APValue &Result, unsigned SrcBits,
+ bool IsUnsat,
+ llvm::function_ref<APSInt(const APSInt &, unsigned, unsigned, bool)>
+ narrowElement) {
+ APValue LHS, RHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), LHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), RHS))
+ return false;
+
+ unsigned LHSVecLen = LHS.getVectorLength();
+ unsigned RHSVecLen = RHS.getVectorLength();
+
+ assert(LHSVecLen != 0 && LHSVecLen == RHSVecLen &&
+ "pack builtin LHSVecLen must equal to RHSVecLen");
+
+ const unsigned VectorBits = LHSVecLen * SrcBits;
+ const unsigned srcPerLane = VectorBits >= 128 ? 128 / SrcBits : LHSVecLen;
+ const unsigned lanes = VectorBits >= 128 ? VectorBits / 128 : 1;
+
+ SmallVector<APValue, 64> Out;
+ Out.reserve(LHSVecLen + RHSVecLen);
+
+ for (unsigned lane = 0; lane != lanes; ++lane) {
+ unsigned base = lane * srcPerLane;
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.emplace_back(APValue(narrowElement(
+ LHS.getVectorElt(base + i).getInt(), SrcBits, SrcBits / 2, IsUnsat)));
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.emplace_back(APValue(narrowElement(
+ RHS.getVectorElt(base + i).getInt(), SrcBits, SrcBits / 2, IsUnsat)));
+ }
+
+ Result = APValue(Out.data(), Out.size());
+ return true;
+}
+
bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!IsConstantEvaluatedBuiltinCall(E))
return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11611,6 +11648,34 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), SourceLen), E);
};
+ auto NarrowElement = [](const APSInt &Element, unsigned SrcBits,
+ unsigned DstBits, bool IsUnsat) -> APSInt {
+ assert(Element.getBitWidth() == SrcBits &&
+ "pack builtin LHS/RHS Element Width must equal to SrcBits");
+
+ APInt Tmp = Element;
+ const APInt Lo = IsUnsat ? APInt(SrcBits, 0)
+ : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+ const APInt Hi = IsUnsat ? APInt::getMaxValue(DstBits).zext(SrcBits)
+ : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+ APInt Narrow;
+ if (IsUnsat) {
+ if (Tmp.isNegative())
+ Tmp = Lo;
+ else if (Tmp.ugt(Hi))
+ Tmp = Hi;
+ Narrow = Tmp.zextOrTrunc(DstBits);
+ } else {
+ if (Tmp.sgt(Hi))
+ Tmp = Hi;
+ else if (Tmp.slt(Lo))
+ Tmp = Lo;
+ Narrow = Tmp.sextOrTrunc(DstBits);
+ }
+ return APSInt(Narrow, /*isUnsigned=*/IsUnsat);
+ };
+
switch (E->getBuiltinCallee()) {
default:
return false;
@@ -11768,7 +11833,26 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
return LHS.lshr(RHS.getZExtValue());
});
-
+ case X86::BI__builtin_ia32_packsswb128:
+ case X86::BI__builtin_ia32_packsswb256:
+ case X86::BI__builtin_ia32_packsswb512:
+ return evalPackBuiltin(E, Info, Result, /*SrcBits=*/16, /*IsUnsat=*/false,
+ NarrowElement);
+ case X86::BI__builtin_ia32_packuswb128:
+ case X86::BI__builtin_ia32_packuswb256:
+ case X86::BI__builtin_ia32_packuswb512:
+ return evalPackBuiltin(E, Info, Result, /*SrcBits=*/16, /*IsUnsat=*/true,
+ NarrowElement);
+ case X86::BI__builtin_ia32_packssdw128:
+ case X86::BI__builtin_ia32_packssdw256:
+ case X86::BI__builtin_ia32_packssdw512:
+ return evalPackBuiltin(E, Info, Result, /*SrcBits=*/32, /*IsUnsat=*/false,
+ NarrowElement);
+ case X86::BI__builtin_ia32_packusdw128:
+ case X86::BI__builtin_ia32_packusdw256:
+ case X86::BI__builtin_ia32_packusdw512:
+ return evalPackBuiltin(E, Info, Result, /*SrcBits=*/32, /*IsUnsat=*/true,
+ NarrowElement);
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index e35c159fec7fd..a62c31e107a60 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -165,9 +165,8 @@ _mm256_abs_epi32(__m256i __a) {
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -197,9 +196,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}
@@ -228,9 +226,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -260,9 +257,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 599cfbe479676..d0785add5f07d 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -516,9 +516,8 @@ _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) {
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
@@ -538,9 +537,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
@@ -560,9 +558,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
(__v64qi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
@@ -582,9 +579,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 12260ec6ea14c..b5277acb33ff9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4159,8 +4159,8 @@ void _mm_mfence(void);
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
@@ -4182,8 +4182,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
@@ -4205,8 +4205,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 18e2c2154362a..5f617530b6f78 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -156,11 +156,10 @@ _mm_cvtm64_si64(__m64 __m)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packsswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packsswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -182,11 +181,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packssdw128(
- (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packssdw128(
+ (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
}
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -208,11 +206,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packuswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packuswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 6319fdbbeb8f0..c1c9c3d47f805 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1466,8 +1466,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
- __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 17ab47c72ad4b..a365c7bd38e76 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1029,24 +1029,28 @@ __m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
// CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_packs_epi16(a, b);
}
+TEST_CONSTEXPR(match_v32qi(_mm256_packs_epi16((__m256i)(__v16hi){130, -200, 127, -128, 300, -1000, 42, -42, 500, -500, 1, -1, 128, -129, 256, -256}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 1, -1, 127, -128, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90));
__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epi32
// CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_packs_epi32(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_packs_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768));
__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epu16
// CHECK: call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_packus_epi16(a, b);
}
+TEST_CONSTEXPR(match_v32qi(_mm256_packus_epi16((__m256i)(__v16hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epu32
// CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_packus_epi32(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_packus_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0));
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_permute2x128_si256
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 3be708aea8a4d..8d290a5f3ec92 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
@@ -956,6 +956,7 @@ __m512i test_mm512_packs_epi32(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packssdw.512
return _mm512_packs_epi32(__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_packs_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 30000, 32768, -32769, 65535, -65536}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 22222, -22222, 40000, -40000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768, 0, 1, -1, 30000, 32767, -32768, 32767, -32768, 32767, -32768, 32767, -32768, 22222, -22222, 32767, -32768));
__m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_packs_epi32
// CHECK: @llvm.x86.avx512.packssdw.512
@@ -973,6 +974,7 @@ __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packsswb.512
return _mm512_packs_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v64qi(_mm512_packs_epi16((__m512i)(__v32hi){130, -200, 127, -128, 300, -1000, 42, -42, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 130, -200, 0, -1, 126, -127, 128, -129, 500, -500, 7, -7, 255, -255, 127, -128}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 5, -5, 100, -100, 127, -128, 512, -512, 1, 2, -2, 300, -300, 127, -128, 42, 0, 1, -1, 127, -128, 90, -90, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90, 5, -5, 100, -100, 127, -128, 127, -128, 127, -128, 0, -1, 126, -127, 127, -128, 1, 2, -2, 127, -128, 127, -128, 42, 127, -128, 7, -7, 127, -128, 127, -128, 0, 1, -1, 127, -128, 90, -90, -128));
__m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_packs_epi16
// CHECK: @llvm.x86.avx512.packsswb.512
@@ -990,6 +992,7 @@ __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packusdw.512
return _mm512_packus_epi32(__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0));
__m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_packus_epi32
// CHECK: @llvm.x86.avx512.packusdw.512
@@ -1007,6 +1010,7 @@ __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packuswb.512
return _mm512_packus_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
__m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_packus_epi16
// CHECK: @llvm.x86.avx512.packuswb.512
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 43d9ec5e6cc8b..e8faf8f937f9d 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -448,18 +448,21 @@ __m64 test_mm_packs_pi16(__m64 a, __m64 b) {
// CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
return _mm_packs_pi16(a, b);
}
+TEST_CONSTEXPR(match_v8qi(_mm_packs_pi16((__m64)(__v4hi){130, -200, 127, -128}, (__m64)(__v4hi){0, 1, -1, 255}), 127, -128, 127, -128, 0, 1, -1, 127));
__m64 test_mm_packs_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pi32
// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
return _mm_packs_pi32(a, b);
}
+TEST_CONSTEXPR(match_v4hi(_mm_packs_pi32((__m64)(__v2si){40000, -50000}, (__m64)(__v2si){0, 70000}), 32767, -32768, 0, 32767));
__m64 test_mm_packs_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pu16
// CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
return _mm_packs_pu16(a, b);
}
+TEST_CONSTEXPR(match_v8qi(_mm_packs_pu16((__m64)(__v4hi){-1, 0, 128, 300}, (__m64)(__v4hi){255, -200, 42, -42}), 0, 0, -128, -1, -1, 0, 42, 0));
__m64 test_mm_sad_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sad_pu8
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 0ba32bb230cdd..035f2df5559d6 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1020,18 +1020,21 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
// CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_packs_epi16(A, B);
}
+TEST_CONSTEXPR(match_v16qi(_mm_packs_epi16((__m128i)(__v8hi){130, -200, 127, -128, 300, -1000, 42, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128));
__m128i test_mm_packs_epi32(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_packs_epi32
// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_packs_epi32(A, B);
}
+TEST_CONSTEXPR(match_v8hi(_mm_packs_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767));
__m128i test_mm_packus_epi16(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_packus_epi16
// CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_packus_epi16(A, B);
}
+TEST_CONSTEXPR(match_v16qi(_mm_packus_epi16((__m128i)(__v8hi){-1, 0, 1, 127, 300, -1000, 255, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 0, 0, 1, 127, -1, 0, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0));
void test_mm_pause(void) {
// CHECK-LABEL: test_mm_pause
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index c7265b188d572..3c3724643870e 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -399,6 +399,7 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
// CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_packus_epi32(x, y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), -25536, 0, 32767, 0, 0, 1, 0, -1));
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
>From f00dd0f376c62984be2083fb4510981f86fc4267 Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Wed, 17 Sep 2025 18:16:28 -0900
Subject: [PATCH 2/2] fix: review
---
clang/include/clang/Basic/BuiltinsX86.td | 2 +-
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 56 +++++++++-------------
clang/lib/AST/ExprConstant.cpp | 26 +++++-----
clang/test/CodeGen/X86/avx512bw-builtins.c | 18 +++----
4 files changed, 44 insertions(+), 58 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 730fd860b3330..cf2fe707b4cc5 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1309,7 +1309,7 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
-let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 34cf8cb3bf30c..92ba6c0488399 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "../ExprConstShared.h"
#include "Boolean.h"
+#include "ByteCode/FixedPoint.h"
#include "EvalEmitter.h"
#include "Interp.h"
#include "InterpBuiltinBitCast.h"
@@ -2605,7 +2606,7 @@ static bool interp__builtin_elementwise_int_binop(
}
static bool interp__builtin_x86_pack(
- InterpState &S, CodePtr, const CallExpr *E, unsigned SrcBits, bool IsUnsat,
+ InterpState &S, CodePtr, const CallExpr *E, bool IsUnsat,
llvm::function_ref<APSInt(const APSInt &X, unsigned, unsigned, bool)>
narrowElement) {
const auto *VT0 = E->getArg(0)->getType()->castAs<VectorType>();
@@ -2615,12 +2616,19 @@ static bool interp__builtin_x86_pack(
VT0->getNumElements() == VT1->getNumElements() &&
"pack builtin VT0 and VT1 ElementType must be same");
- const unsigned LHSVecLen = VT0->getNumElements();
-
const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();
+ ASTContext &Ctx = S.getASTContext();
+ const unsigned SrcBits = Ctx.getIntWidth(VT0->getElementType());
+ const unsigned DstBits = SrcBits / 2;
+ const unsigned LHSVecLen = VT0->getNumElements();
+ const unsigned VectorBits = LHSVecLen * SrcBits;
+ const unsigned srcPerLane = VectorBits >= 128 ? (128 / SrcBits) : LHSVecLen;
+ const unsigned lanes = VectorBits >= 128 ? (VectorBits / 128) : 1;
+ PrimType DstT = *S.getContext().classify(getElemType(Dst));
+
auto readSrc = [&](const Pointer &P, unsigned I) -> APSInt {
if (SrcBits == 16) {
int16_t v = P.elem<int16_t>(I);
@@ -2631,24 +2639,14 @@ static bool interp__builtin_x86_pack(
}
};
- const unsigned DstBits = SrcBits / 2;
- auto writeDst = [&](unsigned I, const APSInt &Y) {
- if (DstBits == 8) {
- if (IsUnsat)
- Dst.elem<uint8_t>(I) = static_cast<uint8_t>(Y.getZExtValue());
- else
- Dst.elem<int8_t>(I) = static_cast<int8_t>(Y.getSExtValue());
- } else {
- if (IsUnsat)
- Dst.elem<uint16_t>(I) = static_cast<uint16_t>(Y.getZExtValue());
- else
- Dst.elem<int16_t>(I) = static_cast<int16_t>(Y.getSExtValue());
- }
- };
+ auto writeDst = [&](unsigned I, const APSInt &Result) {
+ APSInt Tmp = Result;
+ Tmp.setIsUnsigned(IsUnsat);
+ Tmp = Tmp.extOrTrunc(DstBits);
- const unsigned VectorBits = LHSVecLen * SrcBits;
- const unsigned srcPerLane = VectorBits >= 128 ? (128 / SrcBits) : LHSVecLen;
- const unsigned lanes = VectorBits >= 128 ? (VectorBits / 128) : 1;
+ const Pointer &ElemPtr = Dst.atIndex(I);
+ assignInteger(S, ElemPtr, DstT, Tmp);
+ };
for (unsigned lane = 0; lane < lanes; ++lane) {
const unsigned baseSrc = lane * srcPerLane;
@@ -3529,26 +3527,18 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_packsswb128:
case clang::X86::BI__builtin_ia32_packsswb256:
case clang::X86::BI__builtin_ia32_packsswb512:
- return interp__builtin_x86_pack(S, OpPC, Call,
- /*SrcBits=*/16, /*Unsat=*/false,
- NarrowElement);
- case clang::X86::BI__builtin_ia32_packuswb128:
- case clang::X86::BI__builtin_ia32_packuswb256:
- case clang::X86::BI__builtin_ia32_packuswb512:
- return interp__builtin_x86_pack(S, OpPC, Call,
- /*SrcBits=*/16, /*Unsat=*/true,
- NarrowElement);
case clang::X86::BI__builtin_ia32_packssdw128:
case clang::X86::BI__builtin_ia32_packssdw256:
case clang::X86::BI__builtin_ia32_packssdw512:
- return interp__builtin_x86_pack(S, OpPC, Call,
- /*SrcBits=*/32, /*Unsat=*/false,
+ return interp__builtin_x86_pack(S, OpPC, Call, /*Unsat=*/false,
NarrowElement);
case clang::X86::BI__builtin_ia32_packusdw128:
case clang::X86::BI__builtin_ia32_packusdw256:
case clang::X86::BI__builtin_ia32_packusdw512:
- return interp__builtin_x86_pack(S, OpPC, Call,
- /*SrcBits=*/32, /*Unsat=*/true,
+ case clang::X86::BI__builtin_ia32_packuswb128:
+ case clang::X86::BI__builtin_ia32_packuswb256:
+ case clang::X86::BI__builtin_ia32_packuswb512:
+ return interp__builtin_x86_pack(S, OpPC, Call, /*Unsat=*/true,
NarrowElement);
case clang::X86::BI__builtin_ia32_vprotbi:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 86f74b52e7a0a..c5d226888174f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11576,8 +11576,7 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
}
static bool evalPackBuiltin(
- const CallExpr *E, EvalInfo &Info, APValue &Result, unsigned SrcBits,
- bool IsUnsat,
+ const CallExpr *E, EvalInfo &Info, APValue &Result, bool IsUnsat,
llvm::function_ref<APSInt(const APSInt &, unsigned, unsigned, bool)>
narrowElement) {
APValue LHS, RHS;
@@ -11591,6 +11590,9 @@ static bool evalPackBuiltin(
assert(LHSVecLen != 0 && LHSVecLen == RHSVecLen &&
"pack builtin LHSVecLen must equal to RHSVecLen");
+ const VectorType *VT0 = E->getArg(0)->getType()->castAs<VectorType>();
+ const unsigned SrcBits = Info.Ctx.getIntWidth(VT0->getElementType());
+ const unsigned DstBits = SrcBits / 2;
const unsigned VectorBits = LHSVecLen * SrcBits;
const unsigned srcPerLane = VectorBits >= 128 ? 128 / SrcBits : LHSVecLen;
const unsigned lanes = VectorBits >= 128 ? VectorBits / 128 : 1;
@@ -11602,10 +11604,10 @@ static bool evalPackBuiltin(
unsigned base = lane * srcPerLane;
for (unsigned i = 0; i != srcPerLane; ++i)
Out.emplace_back(APValue(narrowElement(
- LHS.getVectorElt(base + i).getInt(), SrcBits, SrcBits / 2, IsUnsat)));
+ LHS.getVectorElt(base + i).getInt(), SrcBits, DstBits, IsUnsat)));
for (unsigned i = 0; i != srcPerLane; ++i)
Out.emplace_back(APValue(narrowElement(
- RHS.getVectorElt(base + i).getInt(), SrcBits, SrcBits / 2, IsUnsat)));
+ RHS.getVectorElt(base + i).getInt(), SrcBits, DstBits, IsUnsat)));
}
Result = APValue(Out.data(), Out.size());
@@ -11836,23 +11838,17 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case X86::BI__builtin_ia32_packsswb128:
case X86::BI__builtin_ia32_packsswb256:
case X86::BI__builtin_ia32_packsswb512:
- return evalPackBuiltin(E, Info, Result, /*SrcBits=*/16, /*IsUnsat=*/false,
- NarrowElement);
- case X86::BI__builtin_ia32_packuswb128:
- case X86::BI__builtin_ia32_packuswb256:
- case X86::BI__builtin_ia32_packuswb512:
- return evalPackBuiltin(E, Info, Result, /*SrcBits=*/16, /*IsUnsat=*/true,
- NarrowElement);
case X86::BI__builtin_ia32_packssdw128:
case X86::BI__builtin_ia32_packssdw256:
case X86::BI__builtin_ia32_packssdw512:
- return evalPackBuiltin(E, Info, Result, /*SrcBits=*/32, /*IsUnsat=*/false,
- NarrowElement);
+ return evalPackBuiltin(E, Info, Result, /*IsUnsat=*/false, NarrowElement);
case X86::BI__builtin_ia32_packusdw128:
case X86::BI__builtin_ia32_packusdw256:
case X86::BI__builtin_ia32_packusdw512:
- return evalPackBuiltin(E, Info, Result, /*SrcBits=*/32, /*IsUnsat=*/true,
- NarrowElement);
+ case X86::BI__builtin_ia32_packuswb128:
+ case X86::BI__builtin_ia32_packuswb256:
+ case X86::BI__builtin_ia32_packuswb512:
+ return evalPackBuiltin(E, Info, Result, /*IsUnsat=*/true, NarrowElement);
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 8d290a5f3ec92..c8bd2ab45fb35 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
-
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +evex512 -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
More information about the cfe-commits mailing list