[clang] [X86][ByteCode] Allow PSHUFB intrinsics to be used in constexpr #156612 (PR #163148)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Oct 15 01:45:55 PDT 2025
https://github.com/shashank1545 updated https://github.com/llvm/llvm-project/pull/163148
>From 8d16fb7a55101dd282678bc06f91f8fca3d246ae Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Mon, 13 Oct 2025 14:06:45 +0530
Subject: [PATCH 1/4] [X86][ByteCode] Allow PSHUFB intrinsics to be used in
constexpr #156612
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The PSHUFB instruction shuffles bytes within each 128-bit lane: for each
control byte, if bit 7 is set, the output byte is zeroed; otherwise, the
low 4 bits select a source byte (0–15) from the same lane.
---
clang/include/clang/Basic/BuiltinsX86.td | 8 +--
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 40 +++++++++++++++
clang/lib/AST/ExprConstant.cpp | 54 ++++++++++++++++++++
clang/lib/Headers/avx2intrin.h | 5 +-
clang/lib/Headers/avx512bwintrin.h | 15 +++---
clang/lib/Headers/avx512vlbwintrin.h | 20 +++-----
clang/lib/Headers/tmmintrin.h | 21 ++++----
clang/test/CodeGen/X86/avx2-builtins.c | 2 +
clang/test/CodeGen/X86/avx512bw-builtins.c | 9 ++++
clang/test/CodeGen/X86/avx512vlbw-builtins.c | 13 +++++
clang/test/CodeGen/X86/mmx-builtins.c | 2 +
clang/test/CodeGen/X86/ssse3-builtins.c | 2 +
12 files changed, 153 insertions(+), 38 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 217589d7add1d..d5e6e94ce6010 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -124,7 +124,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
}
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
- def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
@@ -132,6 +131,7 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
+ def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}
}
@@ -588,7 +588,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
- def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
@@ -627,6 +626,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
+ def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+
def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
@@ -1318,7 +1319,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
- def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
@@ -1326,6 +1326,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+
+ def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 3811fb072fa16..db3bc98452f0f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2714,6 +2714,41 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call) {
+ assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
+ const Pointer &Control = S.Stk.pop<Pointer>();
+ const Pointer &Src = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ unsigned NumElems = Dst.getNumElems();
+ PrimType ElemT = Dst.getFieldDesc()->getPrimType();
+ unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);
+
+ assert(NumElems == 16 || NumElems == 32 || NumElems == 64);
+ assert(NumElems == Control.getNumElems());
+ assert(NumElems == Dst.getNumElems());
+
+ if (ElemBits != 8)
+ return false;
+
+ for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
+ uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));
+
+ if (Ctlb & 0x80) {
+ Dst.elem<int8_t>(Idx) = 0;
+ } else {
+ unsigned LaneBase = (Idx / 16) * 16;
+ unsigned SrcOffset = Ctlb & 0x0F;
+ unsigned SrcIdx = LaneBase + SrcOffset;
+
+ Dst.elem<int8_t>(Idx) = Src.elem<int8_t>(SrcIdx);
+ }
+ }
+ Dst.initializeAllElements();
+ return true;
+}
+
static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
const CallExpr *Call, bool IsShufHW) {
assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
@@ -3739,6 +3774,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);
+ case X86::BI__builtin_ia32_pshufb128:
+ case X86::BI__builtin_ia32_pshufb256:
+ case X86::BI__builtin_ia32_pshufb512:
+ return interp__builtin_ia32_pshufb(S, OpPC, Call);
+
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 35a866ea5010f..2e5027d0397c4 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11615,6 +11615,51 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
return true;
}
+static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
+ APValue &Out) {
+ APValue SrcVec, ControlVec;
+ if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
+ return false;
+ if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
+ return false;
+
+ const auto *VT = Call->getType()->getAs<VectorType>();
+ if (!VT)
+ return false;
+
+ QualType ElemT = VT->getElementType();
+ unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
+
+ if (ElemBits != 8)
+ return false;
+ unsigned NumElts = VT->getNumElements();
+ if (NumElts != 16 && NumElts != 32 && NumElts != 64)
+ return false;
+
+ SmallVector<APValue, 64> ResultElements;
+ ResultElements.reserve(NumElts);
+
+ for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+ APValue CtlVal = ControlVec.getVectorElt(Idx);
+ APSInt CtlByte = CtlVal.getInt();
+ uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue() & 0xFF);
+
+ if (Ctl & 0x80) {
+ APSInt Zero(ElemBits, /*isUnsigned*/ false);
+ Zero = 0;
+ ResultElements.push_back(APValue(Zero));
+ } else {
+ unsigned LaneBase = (Idx / 16) * 16;
+ unsigned SrcOffset = Ctl & 0x0F;
+ unsigned SrcIdx = LaneBase + SrcOffset;
+
+ ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
+ }
+ }
+ Out = APValue(ResultElements.data(), ResultElements.size());
+ return true;
+}
+
static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
bool IsShufHW, APValue &Out) {
APValue Vec;
@@ -12189,6 +12234,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_pshufb128:
+ case X86::BI__builtin_ia32_pshufb256:
+ case X86::BI__builtin_ia32_pshufb512: {
+ APValue R;
+ if (!evalPshufbBuiltin(Info, E, R))
+ return false;
+ return Success(R, E);
+ }
+
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512: {
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 4aaca2db8236a..f586c1c1d7439 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1858,9 +1858,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b)
/// control byte specify the index (within the same 128-bit half) of \a __a
/// to copy to the result byte.
/// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 473fe94af65d8..23b2d290422b1 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_shuffle_epi8(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 81e4cbb9615c1..639fb60f476c6 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
(__v16hi)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 3fc9f9834baa9..386c3704a2251 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -603,10 +603,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shuffle_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_shuffle_epi8(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}
/// Copies the 8-bit integers from a 64-bit integer vector to the
@@ -628,13 +627,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// destination. \n
/// Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_shuffle_pi8(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_pshufb128(
- (__v16qi)__builtin_shufflevector(
- (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
- (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_shuffle_pi8(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_pshufb128(
+ (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
+ 0, 1, 0, 1),
+ (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){},
+ 0, 1, 0, 1)));
}
/// For each 8-bit integer in the first source operand, perform one of
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 55f18f947b96f..7826de484f2c7 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1106,6 +1106,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
return _mm256_shuffle_epi8(a, b);
}
+TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));
+
__m256i test_mm256_shuffle_epi32(__m256i a) {
// CHECK-LABEL: test_mm256_shuffle_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index af1c904a6de48..568adaa5a49f4 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.pshuf.b.512
return _mm512_shuffle_epi8(__A,__B);
}
+
+TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));
+
__m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_shuffle_epi8(__W,__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48));
+
__m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_shuffle_epi8(__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48));
+
__m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_subs_epi8
// CHECK: @llvm.ssub.sat.v64i8
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index c0e46de8b81bf..d569283928a0a 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask_shuffle_epi8(__W,__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8));
+
__m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_shuffle_epi8
// CHECK: @llvm.x86.ssse3.pshuf.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_shuffle_epi8(__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0));
+
__m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_shuffle_epi8(__W,__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16));
+
+
__m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_shuffle_epi8(__U,__A,__B);
}
+
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));
+
__m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_subs_epi8
// CHECK: @llvm.ssub.sat.v16i8
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 2b45b920cf2bc..9f2c2f302d515 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -583,6 +583,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
return _mm_shuffle_pi8(a, b);
}
+TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0));
+
__m64 test_mm_shuffle_pi16(__m64 a) {
// CHECK-LABEL: test_mm_shuffle_pi16
// CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 588576879903b..509eec6d05baa 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -110,6 +110,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
return _mm_shuffle_epi8(a, b);
}
+TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+
__m128i test_mm_sign_epi8(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sign_epi8
// CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
>From ac77cd12eef7d94b3727e2b2336816f164f8fbbd Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Tue, 14 Oct 2025 17:54:23 +0530
Subject: [PATCH 2/4] Address review comments for PSHUFB constexpr evaluation
---
clang/lib/AST/ByteCode/InterpBuiltin.cpp | 7 -------
clang/lib/AST/ExprConstant.cpp | 10 +++-------
clang/lib/Headers/tmmintrin.h | 3 +--
3 files changed, 4 insertions(+), 16 deletions(-)
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index db3bc98452f0f..f24417c2f9503 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2722,16 +2722,9 @@ static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
const Pointer &Dst = S.Stk.peek<Pointer>();
unsigned NumElems = Dst.getNumElems();
- PrimType ElemT = Dst.getFieldDesc()->getPrimType();
- unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);
-
- assert(NumElems == 16 || NumElems == 32 || NumElems == 64);
assert(NumElems == Control.getNumElems());
assert(NumElems == Dst.getNumElems());
- if (ElemBits != 8)
- return false;
-
for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2e5027d0397c4..a21a3b25bb429 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11629,12 +11629,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
QualType ElemT = VT->getElementType();
unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
-
- if (ElemBits != 8)
- return false;
unsigned NumElts = VT->getNumElements();
- if (NumElts != 16 && NumElts != 32 && NumElts != 64)
- return false;
+ bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType();
SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);
@@ -11642,10 +11638,10 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
APValue CtlVal = ControlVec.getVectorElt(Idx);
APSInt CtlByte = CtlVal.getInt();
- uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue() & 0xFF);
+ uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
if (Ctl & 0x80) {
- APSInt Zero(ElemBits, /*isUnsigned*/ false);
+ APSInt Zero(ElemBits, DestUnsigned);
Zero = 0;
ResultElements.push_back(APValue(Zero));
} else {
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 386c3704a2251..8bead94bae945 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -632,8 +632,7 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b) {
return __trunc64(__builtin_ia32_pshufb128(
(__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
0, 1, 0, 1),
- (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){},
- 0, 1, 0, 1)));
+ (__v16qi)__zext128(__b)));
}
/// For each 8-bit integer in the first source operand, perform one of
>From 709e3164f48b6438267119b8afaa1aeefb700b71 Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Tue, 14 Oct 2025 18:47:51 +0530
Subject: [PATCH 3/4] Simplify Zero initialization
---
clang/lib/AST/ExprConstant.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a21a3b25bb429..194eb433e5ec8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11641,9 +11641,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
if (Ctl & 0x80) {
- APSInt Zero(ElemBits, DestUnsigned);
- Zero = 0;
- ResultElements.push_back(APValue(Zero));
+ APValue Zero(Info.Ctx.MakeIntValue(0, ElemT));
+ ResultElements.push_back(Zero);
} else {
unsigned LaneBase = (Idx / 16) * 16;
unsigned SrcOffset = Ctl & 0x0F;
>From df3cf14c7260c3e7e83ce1dba1ce0d2d61f0ef9e Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Wed, 15 Oct 2025 11:17:52 +0530
Subject: [PATCH 4/4] Removed unused variables and made test more extensive
---
clang/lib/AST/ExprConstant.cpp | 2 --
clang/test/CodeGen/X86/avx2-builtins.c | 2 +-
clang/test/CodeGen/X86/avx512bw-builtins.c | 2 +-
clang/test/CodeGen/X86/ssse3-builtins.c | 2 +-
4 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 194eb433e5ec8..0cefe6f3b9b31 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,9 +11628,7 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
return false;
QualType ElemT = VT->getElementType();
- unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
unsigned NumElts = VT->getNumElements();
- bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType();
SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 7826de484f2c7..39ae5e1bf3fc0 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1106,7 +1106,7 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
return _mm256_shuffle_epi8(a, b);
}
-TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));
+TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qs){0,33,2,35,4,37,6,-39,8,41,10,43,12,45,14,-47,16,49,18,51,20,53,22,-55,24,57,26,59,28,61,30,-63}), 0,1,2,3,4,5,6,0,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,0,24,25,26,27,28,29,30,0));
__m256i test_mm256_shuffle_epi32(__m256i a) {
// CHECK-LABEL: test_mm256_shuffle_epi32
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 568adaa5a49f4..fddf17d524310 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1467,7 +1467,7 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
return _mm512_shuffle_epi8(__A,__B);
}
-TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));
+TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,-79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,-95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,0,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,0));
__m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_shuffle_epi8
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 509eec6d05baa..fb569be398111 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -110,7 +110,7 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
return _mm_shuffle_epi8(a, b);
}
-TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qs){0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15}, (__m128i)(__v16qs){15,-14,13,-12,11,-10,9,-8,7,-6,5,-4,3,-2,1,0}), -15,0,-13,0,-11,0,-9,0,-7,0,-5,0,-3,0,-1,0));
__m128i test_mm_sign_epi8(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sign_epi8
More information about the cfe-commits
mailing list