[clang] [Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr (PR #156003)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Aug 29 03:36:16 PDT 2025
https://github.com/woruyu created https://github.com/llvm/llvm-project/pull/156003
### Summary
This PR resolves https://github.com/llvm/llvm-project/issues/154283
From 372f6ce5986c374375bbcf1b555d649025552c6e Mon Sep 17 00:00:00 2001
From: woruyu <1214539920 at qq.com>
Date: Fri, 29 Aug 2025 18:34:43 +0800
Subject: [PATCH] [Headers][X86] VectorExprEvaluator::VisitCallExpr - allow
SSE/AVX2/AVX512 pack intrinsics to be used in constexpr
---
clang/include/clang/Basic/BuiltinsX86.td | 28 +++---
clang/lib/AST/ExprConstant.cpp | 99 ++++++++++++++++++++++
clang/lib/Headers/avx2intrin.h | 20 ++---
clang/lib/Headers/avx512bwintrin.h | 20 ++---
clang/lib/Headers/emmintrin.h | 12 +--
clang/lib/Headers/mmintrin.h | 27 +++---
clang/lib/Headers/smmintrin.h | 4 +-
clang/test/CodeGen/X86/avx2-builtins.c | 4 +
clang/test/CodeGen/X86/avx512bw-builtins.c | 4 +
clang/test/CodeGen/X86/mmx-builtins.c | 3 +
clang/test/CodeGen/X86/sse2-builtins.c | 3 +
clang/test/CodeGen/X86/sse41-builtins.c | 1 +
12 files changed, 166 insertions(+), 59 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..737be0f673e96 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -95,9 +95,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2" in {
def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
- def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
- def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
- def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
@@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
+ def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
+ def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
+ def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
}
let Features = "sse3" in {
@@ -314,7 +314,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -333,6 +332,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
+ def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -568,10 +568,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
- def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
- def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
- def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
def pavgb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def pavgw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -636,6 +632,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
+
+ def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
+ def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
+ def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -1355,15 +1356,18 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi
let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
- def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
- def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
- def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def pavgb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
def pavgw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
+let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+ def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
+ def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+}
+
let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a71cb8b0143be..8f649eca776ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11599,6 +11599,89 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
return false;
}
+enum class PackKind {
+ SSWB,
+ USWB,
+ SSDW,
+ USDW
+}; // SSWB/USWB: 16-bit -> 8-bit, SSDW/USDW: 32-bit -> 16-bit (SS* = signed saturation, US* = unsigned saturation)
+
+static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
+ PackKind K) {
+ APValue L, R;
+ if (!EvaluateAsRValue(Info, E->getArg(0), L) ||
+ !EvaluateAsRValue(Info, E->getArg(1), R))
+ return false;
+
+ unsigned SrcBits = (K == PackKind::SSWB || K == PackKind::USWB) ? 16 : 32;
+ unsigned DstBits = SrcBits / 2;
+
+ unsigned NL = L.getVectorLength();
+ unsigned NR = R.getVectorLength();
+ if (NL == 0 || NR == 0 || NL != NR)
+ return false;
+
+ // Bounds for saturation (extended to SrcBits for compares).
+ APInt Lo = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt(SrcBits, 0)
+ : APInt::getSignedMinValue(DstBits).sext(SrcBits);
+ APInt Hi = (K == PackKind::USWB || K == PackKind::USDW)
+ ? APInt::getMaxValue(DstBits).zext(SrcBits)
+ : APInt::getSignedMaxValue(DstBits).sext(SrcBits);
+
+ // Result element signedness follows the builtin's return vector element type.
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestIsUnsigned = DestEltTy->isUnsignedIntegerType();
+
+ // Clamp one source element to the target range and narrow to DstBits.
+ auto clampOne = [&](const APSInt &X) -> APSInt {
+ APInt V = X;
+ if (V.getBitWidth() != SrcBits)
+ V = V.sextOrTrunc(SrcBits);
+
+ if (K == PackKind::USWB || K == PackKind::USDW) {
+ if (V.isNegative())
+ V = Lo;
+ else if (V.ugt(Hi))
+ V = Hi;
+ APInt Narrow = V.zextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/true);
+ } else {
+ if (V.sgt(Hi))
+ V = Hi;
+ else if (V.slt(Lo))
+ V = Lo;
+ APInt Narrow = V.sextOrTrunc(DstBits);
+ return APSInt(Narrow, /*isUnsigned=*/DestIsUnsigned);
+ }
+ };
+
+ SmallVector<APValue, 64> Out;
+ Out.reserve(NL + NR);
+
+ // Process per 128-bit lane (MMX 64-bit uses a single lane).
+ unsigned VectorBits = NL * SrcBits;
+ unsigned srcPerLane, lanes;
+ if (VectorBits >= 128) {
+ srcPerLane = 128 / SrcBits; // 8 (16→8) or 4 (32→16)
+ lanes = VectorBits / 128; // 1 (128b), 2 (256b), 4 (512b)
+ } else {
+ srcPerLane = NL; // MMX
+ lanes = 1;
+ }
+
+ for (unsigned lane = 0; lane != lanes; ++lane) {
+ unsigned base = lane * srcPerLane;
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(L.getVectorElt(base + i).getInt())));
+ for (unsigned i = 0; i != srcPerLane; ++i)
+ Out.push_back(APValue(clampOne(R.getVectorElt(base + i).getInt())));
+ }
+
+ Result = APValue(Out.data(), Out.size());
+ return true;
+}
+
bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!IsConstantEvaluatedBuiltinCall(E))
return ExprEvaluatorBaseTy::VisitCallExpr(E);
@@ -11752,6 +11835,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_packsswb128:
+ case X86::BI__builtin_ia32_packsswb256:
+ case X86::BI__builtin_ia32_packsswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSWB);
+ case X86::BI__builtin_ia32_packuswb128:
+ case X86::BI__builtin_ia32_packuswb256:
+ case X86::BI__builtin_ia32_packuswb512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USWB);
+ case X86::BI__builtin_ia32_packssdw128:
+ case X86::BI__builtin_ia32_packssdw256:
+ case X86::BI__builtin_ia32_packssdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::SSDW);
+ case X86::BI__builtin_ia32_packusdw128:
+ case X86::BI__builtin_ia32_packusdw256:
+ case X86::BI__builtin_ia32_packusdw512:
+ return evalPackBuiltin(E, Info, Result, PackKind::USDW);
case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..22d7157f0db58 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -177,9 +177,8 @@ _mm256_abs_epi32(__m256i __a)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -209,9 +208,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
}
@@ -240,9 +238,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
}
@@ -272,9 +269,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
/// result[255:192].
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 9263f7af3ee2f..b56f53107fa92 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -525,9 +525,8 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
@@ -547,9 +546,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packs_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
@@ -569,9 +567,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
(__v64qi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi32(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
@@ -591,9 +588,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_packus_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..f3c9eb7158c1f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4166,8 +4166,8 @@ void _mm_mfence(void);
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
@@ -4189,8 +4189,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packs_epi32(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
@@ -4212,8 +4212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 6fe9d67b8976d..aa6845f60ee99 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -162,11 +162,10 @@ _mm_cvtm64_si64(__m64 __m)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packsswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packsswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
@@ -188,11 +187,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packssdw128(
- (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pi32(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packssdw128(
+ (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
}
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
@@ -214,11 +212,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_packuswb128(
- (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_packs_pu16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_packuswb128(
+ (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..8acf4929d1125 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1475,8 +1475,8 @@ _mm_cvtepu32_epi64(__m128i __V) {
/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
- __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_packus_epi32(__m128i __V1, __m128i __V2) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 29cb3e8860be9..d2d8b53c24a96 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -964,24 +964,28 @@ __m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
// CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_packs_epi16(a, b);
}
+TEST_CONSTEXPR(match_v32qi(_mm256_packs_epi16((__m256i)(__v16hi){130, -200, 127, -128, 300, -1000, 42, -42, 500, -500, 1, -1, 128, -129, 256, -256}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 1, -1, 127, -128, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90));
__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epi32
// CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_packs_epi32(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_packs_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768));
__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epu16
// CHECK: call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_packus_epi16(a, b);
}
+TEST_CONSTEXPR(match_v32qi(_mm256_packus_epi16((__m256i)(__v16hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_packs_epu32
// CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_packus_epi32(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_packus_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0));
__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_permute2x128_si256
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 02cedc3c73fb7..f48be423f0b97 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -891,6 +891,7 @@ __m512i test_mm512_packs_epi32(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packssdw.512
return _mm512_packs_epi32(__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_packs_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 30000, 32768, -32769, 65535, -65536}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 22222, -22222, 40000, -40000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768, 0, 1, -1, 30000, 32767, -32768, 32767, -32768, 32767, -32768, 32767, -32768, 22222, -22222, 32767, -32768));
__m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_packs_epi32
// CHECK: @llvm.x86.avx512.packssdw.512
@@ -908,6 +909,7 @@ __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packsswb.512
return _mm512_packs_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v64qi(_mm512_packs_epi16((__m512i)(__v32hi){130, -200, 127, -128, 300, -1000, 42, -42, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 130, -200, 0, -1, 126, -127, 128, -129, 500, -500, 7, -7, 255, -255, 127, -128}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 5, -5, 100, -100, 127, -128, 512, -512, 1, 2, -2, 300, -300, 127, -128, 42, 0, 1, -1, 127, -128, 90, -90, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90, 5, -5, 100, -100, 127, -128, 127, -128, 127, -128, 0, -1, 126, -127, 127, -128, 1, 2, -2, 127, -128, 127, -128, 42, 127, -128, 7, -7, 127, -128, 127, -128, 0, 1, -1, 127, -128, 90, -90, -128));
__m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_packs_epi16
// CHECK: @llvm.x86.avx512.packsswb.512
@@ -925,6 +927,7 @@ __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packusdw.512
return _mm512_packus_epi32(__A,__B);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0));
__m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_packus_epi32
// CHECK: @llvm.x86.avx512.packusdw.512
@@ -942,6 +945,7 @@ __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.packuswb.512
return _mm512_packus_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
__m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_packus_epi16
// CHECK: @llvm.x86.avx512.packuswb.512
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 35f0d6c9b43e8..637bce01a5f72 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -430,18 +430,21 @@ __m64 test_mm_packs_pi16(__m64 a, __m64 b) {
// CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
return _mm_packs_pi16(a, b);
}
+TEST_CONSTEXPR(match_v8qi(_mm_packs_pi16((__m64)(__v4hi){130, -200, 127, -128}, (__m64)(__v4hi){0, 1, -1, 255}), 127, -128, 127, -128, 0, 1, -1, 127));
__m64 test_mm_packs_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pi32
// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
return _mm_packs_pi32(a, b);
}
+TEST_CONSTEXPR(match_v4hi(_mm_packs_pi32((__m64)(__v2si){40000, -50000}, (__m64)(__v2si){0, 70000}), 32767, -32768, 0, 32767));
__m64 test_mm_packs_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pu16
// CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
return _mm_packs_pu16(a, b);
}
+TEST_CONSTEXPR(match_v8qi(_mm_packs_pu16((__m64)(__v4hi){-1, 0, 128, 300}, (__m64)(__v4hi){255, -200, 42, -42}), 0, 0, -128, -1, -1, 0, 42, 0));
__m64 test_mm_sad_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sad_pu8
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 38d5e877a5036..e98ba52efbdf8 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -984,18 +984,21 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
// CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_packs_epi16(A, B);
}
+TEST_CONSTEXPR(match_v16qi(_mm_packs_epi16((__m128i)(__v8hi){130, -200, 127, -128, 300, -1000, 42, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128));
__m128i test_mm_packs_epi32(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_packs_epi32
// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_packs_epi32(A, B);
}
+TEST_CONSTEXPR(match_v8hi(_mm_packs_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767));
__m128i test_mm_packus_epi16(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_packus_epi16
// CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_packus_epi16(A, B);
}
+TEST_CONSTEXPR(match_v16qi(_mm_packus_epi16((__m128i)(__v8hi){-1, 0, 1, 127, 300, -1000, 255, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 0, 0, 1, 127, -1, 0, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0));
void test_mm_pause(void) {
// CHECK-LABEL: test_mm_pause
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 500b780d49057..7bebcfd764383 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -357,6 +357,7 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
// CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_packus_epi32(x, y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), -25536, 0, 32767, 0, 0, 1, 0, -1));
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
More information about the cfe-commits
mailing list