[clang] [X86][ByteCode] Allow PSHUFB intrinsics to be used in constexpr #156612 (PR #163148)

via cfe-commits cfe-commits at lists.llvm.org
Wed Oct 15 01:45:55 PDT 2025


https://github.com/shashank1545 updated https://github.com/llvm/llvm-project/pull/163148

>From 8d16fb7a55101dd282678bc06f91f8fca3d246ae Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Mon, 13 Oct 2025 14:06:45 +0530
Subject: [PATCH 1/4] [X86][ByteCode] Allow PSHUFB intrinsics to be used in
 constexpr #156612
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PSHUFB instruction shuffles bytes within each 128-bit lane: for each
control byte, if bit 7 is set, the output byte is zeroed; otherwise, the
low 4 bits select a source byte (0–15) from the same lane.
---
 clang/include/clang/Basic/BuiltinsX86.td     |  8 +--
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 40 +++++++++++++++
 clang/lib/AST/ExprConstant.cpp               | 54 ++++++++++++++++++++
 clang/lib/Headers/avx2intrin.h               |  5 +-
 clang/lib/Headers/avx512bwintrin.h           | 15 +++---
 clang/lib/Headers/avx512vlbwintrin.h         | 20 +++-----
 clang/lib/Headers/tmmintrin.h                | 21 ++++----
 clang/test/CodeGen/X86/avx2-builtins.c       |  2 +
 clang/test/CodeGen/X86/avx512bw-builtins.c   |  9 ++++
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 13 +++++
 clang/test/CodeGen/X86/mmx-builtins.c        |  2 +
 clang/test/CodeGen/X86/ssse3-builtins.c      |  2 +
 12 files changed, 153 insertions(+), 38 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 217589d7add1d..d5e6e94ce6010 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -124,7 +124,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
     }
 
     def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-    def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
     def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
     def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
     def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
@@ -132,6 +131,7 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
 
   let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
     def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
+    def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
   }
 }
 
@@ -588,7 +588,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
   def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
   def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
-  def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
   def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
   def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
   def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
@@ -627,6 +626,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
   def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
 
+  def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
+
   def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
   def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
   def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
@@ -1318,7 +1319,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
 
 let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
-  def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
 }
 
 let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
@@ -1326,6 +1326,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
   def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
   def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
   def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
+
+  def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
 }
 
 let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 3811fb072fa16..db3bc98452f0f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2714,6 +2714,41 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
+                                        const CallExpr *Call) {
+  assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
+  const Pointer &Control = S.Stk.pop<Pointer>();
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  unsigned NumElems = Dst.getNumElems();
+  PrimType ElemT = Dst.getFieldDesc()->getPrimType();
+  unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);
+
+  assert(NumElems == 16 || NumElems == 32 || NumElems == 64);
+  assert(NumElems == Control.getNumElems());
+  assert(NumElems == Dst.getNumElems());
+
+  if (ElemBits != 8)
+    return false;
+
+  for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
+    uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));
+
+    if (Ctlb & 0x80) {
+      Dst.elem<int8_t>(Idx) = 0;
+    } else {
+      unsigned LaneBase = (Idx / 16) * 16;
+      unsigned SrcOffset = Ctlb & 0x0F;
+      unsigned SrcIdx = LaneBase + SrcOffset;
+
+      Dst.elem<int8_t>(Idx) = Src.elem<int8_t>(SrcIdx);
+    }
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
                                        const CallExpr *Call, bool IsShufHW) {
   assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
@@ -3739,6 +3774,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_selectpd_512:
     return interp__builtin_select(S, OpPC, Call);
 
+  case X86::BI__builtin_ia32_pshufb128:
+  case X86::BI__builtin_ia32_pshufb256:
+  case X86::BI__builtin_ia32_pshufb512:
+    return interp__builtin_ia32_pshufb(S, OpPC, Call);
+
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
   case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 35a866ea5010f..2e5027d0397c4 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11615,6 +11615,51 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
   return true;
 }
 
+static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
+                              APValue &Out) {
+  APValue SrcVec, ControlVec;
+  if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
+    return false;
+  if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
+    return false;
+
+  const auto *VT = Call->getType()->getAs<VectorType>();
+  if (!VT)
+    return false;
+
+  QualType ElemT = VT->getElementType();
+  unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
+
+  if (ElemBits != 8)
+    return false;
+  unsigned NumElts = VT->getNumElements();
+  if (NumElts != 16 && NumElts != 32 && NumElts != 64)
+    return false;
+
+  SmallVector<APValue, 64> ResultElements;
+  ResultElements.reserve(NumElts);
+
+  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
+    APValue CtlVal = ControlVec.getVectorElt(Idx);
+    APSInt CtlByte = CtlVal.getInt();
+    uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue() & 0xFF);
+
+    if (Ctl & 0x80) {
+      APSInt Zero(ElemBits, /*isUnsigned*/ false);
+      Zero = 0;
+      ResultElements.push_back(APValue(Zero));
+    } else {
+      unsigned LaneBase = (Idx / 16) * 16;
+      unsigned SrcOffset = Ctl & 0x0F;
+      unsigned SrcIdx = LaneBase + SrcOffset;
+
+      ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
+    }
+  }
+  Out = APValue(ResultElements.data(), ResultElements.size());
+  return true;
+}
+
 static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
                              bool IsShufHW, APValue &Out) {
   APValue Vec;
@@ -12189,6 +12234,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case X86::BI__builtin_ia32_pshufb128:
+  case X86::BI__builtin_ia32_pshufb256:
+  case X86::BI__builtin_ia32_pshufb512: {
+    APValue R;
+    if (!evalPshufbBuiltin(Info, E, R))
+      return false;
+    return Success(R, E);
+  }
+
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
   case X86::BI__builtin_ia32_pshuflw512: {
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 4aaca2db8236a..f586c1c1d7439 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1858,9 +1858,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b)
 ///    control byte specify the index (within the same 128-bit half) of \a __a
 ///    to copy to the result byte.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
 
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 473fe94af65d8..23b2d290422b1 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
                                             (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_shuffle_epi8(__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                          (__v64qi)_mm512_shuffle_epi8(__A, __B),
                                          (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                          (__v64qi)_mm512_shuffle_epi8(__A, __B),
                                          (__v64qi)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 81e4cbb9615c1..639fb60f476c6 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
                                             (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                             (__v16qi)_mm_shuffle_epi8(__A, __B),
                                             (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                             (__v16qi)_mm_shuffle_epi8(__A, __B),
                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                          (__v32qi)_mm256_shuffle_epi8(__A, __B),
                                          (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                          (__v32qi)_mm256_shuffle_epi8(__A, __B),
                                          (__v32qi)_mm256_setzero_si256());
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 3fc9f9834baa9..386c3704a2251 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -603,10 +603,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 ///    Bits [6:4] Reserved.  \n
 ///    Bits [3:0] select the source byte to be copied.
 /// \returns A 128-bit integer vector containing the copied or cleared values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shuffle_epi8(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_shuffle_epi8(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }
 
 /// Copies the 8-bit integers from a 64-bit integer vector to the
@@ -628,13 +627,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    destination. \n
 ///    Bits [2:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_shuffle_pi8(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pshufb128(
-        (__v16qi)__builtin_shufflevector(
-            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
-        (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_shuffle_pi8(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pshufb128(
+      (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
+                                       0, 1, 0, 1),
+      (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){},
+                                       0, 1, 0, 1)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 55f18f947b96f..7826de484f2c7 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1106,6 +1106,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
   return _mm256_shuffle_epi8(a, b);
 }
 
+TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));
+
 __m256i test_mm256_shuffle_epi32(__m256i a) {
   // CHECK-LABEL: test_mm256_shuffle_epi32
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index af1c904a6de48..568adaa5a49f4 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   return _mm512_shuffle_epi8(__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));
+
 __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_epi8
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_shuffle_epi8(__W,__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48));
+
 __m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_epi8
   // CHECK: @llvm.x86.avx512.pshuf.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_shuffle_epi8(__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48));
+
 __m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_subs_epi8
   // CHECK: @llvm.ssub.sat.v64i8
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index c0e46de8b81bf..d569283928a0a 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_shuffle_epi8(__W,__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8));
+
 __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_shuffle_epi8
   // CHECK: @llvm.x86.ssse3.pshuf.b
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_shuffle_epi8(__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0));
+
 __m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_epi8
   // CHECK: @llvm.x86.avx2.pshuf.b
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_shuffle_epi8(__W,__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16));
+                
+
 __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_epi8
   // CHECK: @llvm.x86.avx2.pshuf.b
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_shuffle_epi8(__U,__A,__B); 
 }
+
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));
+
 __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_subs_epi8
   // CHECK: @llvm.ssub.sat.v16i8
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 2b45b920cf2bc..9f2c2f302d515 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -583,6 +583,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
   return _mm_shuffle_pi8(a, b);
 }
 
+TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0));
+
 __m64 test_mm_shuffle_pi16(__m64 a) {
   // CHECK-LABEL: test_mm_shuffle_pi16
   // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 588576879903b..509eec6d05baa 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -110,6 +110,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
   return _mm_shuffle_epi8(a, b);
 }
 
+TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+
 __m128i test_mm_sign_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi8
   // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})

>From ac77cd12eef7d94b3727e2b2336816f164f8fbbd Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Tue, 14 Oct 2025 17:54:23 +0530
Subject: [PATCH 2/4] Address review comments for PSHUFB constexpr evaluation

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp |  7 -------
 clang/lib/AST/ExprConstant.cpp           | 10 +++-------
 clang/lib/Headers/tmmintrin.h            |  3 +--
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index db3bc98452f0f..f24417c2f9503 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2722,16 +2722,9 @@ static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
   unsigned NumElems = Dst.getNumElems();
-  PrimType ElemT = Dst.getFieldDesc()->getPrimType();
-  unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);
-
-  assert(NumElems == 16 || NumElems == 32 || NumElems == 64);
   assert(NumElems == Control.getNumElems());
   assert(NumElems == Dst.getNumElems());
 
-  if (ElemBits != 8)
-    return false;
-
   for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
     uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2e5027d0397c4..a21a3b25bb429 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11629,12 +11629,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
 
   QualType ElemT = VT->getElementType();
   unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
-
-  if (ElemBits != 8)
-    return false;
   unsigned NumElts = VT->getNumElements();
-  if (NumElts != 16 && NumElts != 32 && NumElts != 64)
-    return false;
+  bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType();
 
   SmallVector<APValue, 64> ResultElements;
   ResultElements.reserve(NumElts);
@@ -11642,10 +11638,10 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     APValue CtlVal = ControlVec.getVectorElt(Idx);
     APSInt CtlByte = CtlVal.getInt();
-    uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue() & 0xFF);
+    uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
 
     if (Ctl & 0x80) {
-      APSInt Zero(ElemBits, /*isUnsigned*/ false);
+      APSInt Zero(ElemBits, DestUnsigned);
       Zero = 0;
       ResultElements.push_back(APValue(Zero));
     } else {
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 386c3704a2251..8bead94bae945 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -632,8 +632,7 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b) {
   return __trunc64(__builtin_ia32_pshufb128(
       (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
                                        0, 1, 0, 1),
-      (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){},
-                                       0, 1, 0, 1)));
+      (__v16qi)__zext128(__b)));
 }
 
 /// For each 8-bit integer in the first source operand, perform one of

>From 709e3164f48b6438267119b8afaa1aeefb700b71 Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Tue, 14 Oct 2025 18:47:51 +0530
Subject: [PATCH 3/4] Simplify Zero initialization

---
 clang/lib/AST/ExprConstant.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a21a3b25bb429..194eb433e5ec8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11641,9 +11641,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
     uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());
 
     if (Ctl & 0x80) {
-      APSInt Zero(ElemBits, DestUnsigned);
-      Zero = 0;
-      ResultElements.push_back(APValue(Zero));
+      APValue Zero(Info.Ctx.MakeIntValue(0, ElemT));
+      ResultElements.push_back(Zero);
     } else {
       unsigned LaneBase = (Idx / 16) * 16;
       unsigned SrcOffset = Ctl & 0x0F;

>From df3cf14c7260c3e7e83ce1dba1ce0d2d61f0ef9e Mon Sep 17 00:00:00 2001
From: shashank1545 <shashank1545 at gmail.com>
Date: Wed, 15 Oct 2025 11:17:52 +0530
Subject: [PATCH 4/4] Removed unused variables and made test more extensive

---
 clang/lib/AST/ExprConstant.cpp             | 2 --
 clang/test/CodeGen/X86/avx2-builtins.c     | 2 +-
 clang/test/CodeGen/X86/avx512bw-builtins.c | 2 +-
 clang/test/CodeGen/X86/ssse3-builtins.c    | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 194eb433e5ec8..0cefe6f3b9b31 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,9 +11628,7 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
     return false;
 
   QualType ElemT = VT->getElementType();
-  unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
   unsigned NumElts = VT->getNumElements();
-  bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType();
 
   SmallVector<APValue, 64> ResultElements;
   ResultElements.reserve(NumElts);
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 7826de484f2c7..39ae5e1bf3fc0 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1106,7 +1106,7 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
   return _mm256_shuffle_epi8(a, b);
 }
 
-TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));
+TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qs){0,33,2,35,4,37,6,-39,8,41,10,43,12,45,14,-47,16,49,18,51,20,53,22,-55,24,57,26,59,28,61,30,-63}), 0,1,2,3,4,5,6,0,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,0,24,25,26,27,28,29,30,0));
 
 __m256i test_mm256_shuffle_epi32(__m256i a) {
   // CHECK-LABEL: test_mm256_shuffle_epi32
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 568adaa5a49f4..fddf17d524310 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1467,7 +1467,7 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
   return _mm512_shuffle_epi8(__A,__B); 
 }
 
-TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));
+TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,-79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,-95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,0,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,0));
 
 __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_epi8
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 509eec6d05baa..fb569be398111 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -110,7 +110,7 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
   return _mm_shuffle_epi8(a, b);
 }
 
-TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qs){0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15}, (__m128i)(__v16qs){15,-14,13,-12,11,-10,9,-8,7,-6,5,-4,3,-2,1,0}), -15,0,-13,0,-11,0,-9,0,-7,0,-5,0,-3,0,-1,0));
 
 __m128i test_mm_sign_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi8



More information about the cfe-commits mailing list