[clang] [Clang] Allow VDBPSADBW intrinsics in constexpr (PR #188887)

Sun Mar 29 10:49:20 PDT 2026

https://github.com/pierluigilenoci updated https://github.com/llvm/llvm-project/pull/188887

>From fa3f5ac7567fde45327eeaa6fa429bcfd4150592 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Fri, 27 Mar 2026 01:43:36 +0100
Subject: [PATCH 1/4] [Clang] Allow VDBPSADBW intrinsics to be used in
 constexpr

Add constexpr evaluation support for the VDBPSADBW (Double Block Packed
Sum-Absolute-Differences) intrinsics (__builtin_ia32_dbpsadbw128/256/512)
in both the tree-based constant evaluator (ExprConstant.cpp) and the
bytecode constexpr interpreter (InterpBuiltin.cpp).

The VDBPSADBW instruction computes the sum of absolute differences of
groups of 4 unsigned bytes from the second source against two 4-byte
reference blocks selected from the first source by the immediate operand.
Per 128-bit lane, imm8[1:0] selects blockA and imm8[3:2] selects blockB
from the first source. For each group of 4 bytes in the second source,
two SAD values are computed (one against each block), producing 8 result
words per 128-bit lane.

Care is taken to treat input bytes as unsigned (the builtin signature
uses signed char vectors) by extracting via getZExtValue() and casting
to uint8_t before computing absolute differences.

Fixes #188747

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 64 ++++++++++++++++++++
 clang/lib/AST/ExprConstant.cpp               | 57 +++++++++++++++++
 clang/test/CodeGen/X86/avx512bw-builtins.c   | 23 +++++--
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 58 +++++++++++++++---
 4 files changed, 190 insertions(+), 12 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 214013396e885..d9a14f84e4a8a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2820,6 +2820,65 @@ static bool interp__builtin_ia32_pmul(
   return true;
 }
 
+static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
+                                          const CallExpr *Call) {
+  assert(Call->getNumArgs() == 3);
+  QualType Arg2Type = Call->getArg(2)->getType();
+  APSInt ImmVal = popToAPSInt(S, Arg2Type);
+  unsigned Imm = ImmVal.getZExtValue();
+
+  const Pointer &Src2 = S.Stk.pop<Pointer>();
+  const Pointer &Src1 = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  const auto *SrcVT = Call->getArg(0)->getType()->castAs<VectorType>();
+  PrimType SrcElemT = *S.getContext().classify(SrcVT->getElementType());
+  unsigned SourceLen = SrcVT->getNumElements();
+
+  const auto *DestVT = Call->getType()->castAs<VectorType>();
+  PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
+  bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+  unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+  unsigned NumLanes = SourceLen / LaneSize;
+  unsigned BlockOffsetA = (Imm & 0x3) * 4;
+  unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+  unsigned DstIdx = 0;
+  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+    unsigned LaneStart = Lane * LaneSize;
+
+    for (unsigned J = 0; J < 4; ++J) {
+      unsigned SadA = 0;
+      unsigned SadB = 0;
+      for (unsigned K = 0; K < 4; ++K) {
+        unsigned A1Val, A2Val, BVal;
+        INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+          // Treat as unsigned bytes
+          A1Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+          A2Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+          BVal = static_cast<uint8_t>(
+              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+        });
+        SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
+        SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
+      }
+      INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
+        Dst.elem<T>(DstIdx) =
+            static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned));
+        Dst.elem<T>(DstIdx + 1) =
+            static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned));
+      });
+      DstIdx += 2;
+    }
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp_builtin_horizontal_int_binop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
@@ -4861,6 +4920,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                  (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
         });
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512:
+    return interp__builtin_ia32_dbpsadbw(S, OpPC, Call);
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 4f45fa728c605..fc4a4834b462a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12564,6 +12564,63 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512: {
+    APValue SourceA, SourceB, SourceImm;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceB) ||
+        !EvaluateAsRValue(Info, E->getArg(2), SourceImm))
+      return false;
+
+    unsigned SourceLen = SourceA.getVectorLength();
+    unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+    unsigned NumLanes = SourceLen / LaneSize;
+    unsigned Imm = SourceImm.getInt().getZExtValue();
+    unsigned BlockOffsetA = (Imm & 0x3) * 4;
+    unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+    auto *DestTy = E->getType()->castAs<VectorType>();
+    QualType DestEltTy = DestTy->getElementType();
+    bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+    SmallVector<APValue, 32> ResultElements;
+    ResultElements.reserve(SourceLen / 2);
+
+    for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+      unsigned LaneStart = Lane * LaneSize;
+
+      for (unsigned J = 0; J < 4; ++J) {
+        // Compute SAD of SourceB[4*J..4*J+3] vs blockA from SourceA
+        unsigned SadA = 0;
+        unsigned SadB = 0;
+        for (unsigned K = 0; K < 4; ++K) {
+          // Treat input bytes as unsigned
+          unsigned A = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
+                  .getInt()
+                  .getZExtValue());
+          unsigned B = static_cast<uint8_t>(
+              SourceB.getVectorElt(LaneStart + 4 * J + K)
+                  .getInt()
+                  .getZExtValue());
+          SadA += (B > A) ? (B - A) : (A - B);
+
+          unsigned A2 = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetB + K)
+                  .getInt()
+                  .getZExtValue());
+          SadB += (B > A2) ? (B - A2) : (A2 - B);
+        }
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadA), DestUnsigned)));
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadB), DestUnsigned)));
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 2020b72a649ae..488146e740db4 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -3230,21 +3230,36 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000
 __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
-  return _mm512_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm512_dbsad_epu8(__A, __B, 170);
+}
+// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane
+// Each lane behaves the same as the 128-bit case with matching data
+TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8(
+  ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m512i)(__v64qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52));
 
 __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 0ee14909ae805..098ee29b1989e 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -3676,41 +3676,83 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m
 __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
-  return _mm_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm_dbsad_epu8(__A, __B, 170);
+}
+// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7}
+// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12
+// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4
+// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20
+// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 12, 20, 4, 36, 20, 52, 36));
+// imm8=0: blockA=blockB=A[0..3]={0,1,2,3}
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52));
+// Test with byte values > 127 (these would be negative if misread as signed char)
+// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30}, all remaining bytes are 0
+// imm8=0: blockA=blockB=A[0..3]
+// J=0: SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55; J=1..3: B bytes are 0, so SAD = 200+100+50+25 = 375
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  0), 55, 55, 375, 375, 375, 375, 375, 375));
 
 __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
+// Test masked version: mask=0x55 (even elements take the computed result, odd elements pass through __W)
+TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8(
+  ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 99, 20, 99, 36, 99, 52, 99));
 
 __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
+// Test zero-masked version: mask=0xAA (odd elements take the computed result, even elements are zeroed)
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 0, 12, 0, 4, 0, 20, 0, 36));
 
 __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
-  return _mm256_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm256_dbsad_epu8(__A, __B, 170);
+}
+// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3]
+// Lane 0: same as 128-bit test above
+// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4
+TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8(
+  ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}),
+  ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52));
 
 __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 __mmask8 test_mm_movepi16_mask(__m128i __A) {
   // CHECK-LABEL: test_mm_movepi16_mask

>From 8b292ead3eb31c002f6deb2e7179b1b208f6076d Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Fri, 27 Mar 2026 18:43:08 +0100
Subject: [PATCH 2/4] fix: address reviewer feedback for constexpr VDBPSADBW

- Add Constexpr tags to BuiltinsX86.td for VDBPSADBW builtins
- Update InterpBuiltin.cpp per tbaederr's suggestions:
  - Use popToUInt64 instead of popToAPSInt for immediate value
  - Use != instead of < in loop comparison
  - Simplify element access by removing unnecessary toAPSInt().getZExtValue()
- Apply clang-format fix in ExprConstant.cpp

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/include/clang/Basic/BuiltinsX86.td |  6 +++---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 12 +++++-------
 clang/lib/AST/ExprConstant.cpp           |  8 ++++----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index f47532a63de04..e54f8d66843bf 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3197,15 +3197,15 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
   def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
   def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d9a14f84e4a8a..5d46b2c595b1f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2823,9 +2823,7 @@ static bool interp__builtin_ia32_pmul(
 static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
                                           const CallExpr *Call) {
   assert(Call->getNumArgs() == 3);
-  QualType Arg2Type = Call->getArg(2)->getType();
-  APSInt ImmVal = popToAPSInt(S, Arg2Type);
-  unsigned Imm = ImmVal.getZExtValue();
+  unsigned Imm = popToUInt64(S, Call->getArg(2));
 
   const Pointer &Src2 = S.Stk.pop<Pointer>();
   const Pointer &Src1 = S.Stk.pop<Pointer>();
@@ -2845,7 +2843,7 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
   unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
 
   unsigned DstIdx = 0;
-  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
     unsigned LaneStart = Lane * LaneSize;
 
     for (unsigned J = 0; J < 4; ++J) {
@@ -2856,11 +2854,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
           A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetA + K));
           A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetB + K));
           BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+              Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index fc4a4834b462a..2a6e1713fba4d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12599,10 +12599,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
               SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
                   .getInt()
                   .getZExtValue());
-          unsigned B = static_cast<uint8_t>(
-              SourceB.getVectorElt(LaneStart + 4 * J + K)
-                  .getInt()
-                  .getZExtValue());
+          unsigned B =
+              static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K)
+                                       .getInt()
+                                       .getZExtValue());
           SadA += (B > A) ? (B - A) : (A - B);
 
           unsigned A2 = static_cast<uint8_t>(

>From b2ef04823423b38720746a63a0a661a54852de90 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Sat, 28 Mar 2026 16:50:42 +0100
Subject: [PATCH 3/4] style: apply clang-format to modified files

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5d46b2c595b1f..4ba611cf68013 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2853,12 +2853,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         unsigned A1Val, A2Val, BVal;
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
-          A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K));
-          A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K));
-          BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K));
+          A1Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K));
+          A2Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K));
+          BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);

>From bea99a2dc693330aeacf0c7a386bee072221cd5c Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Sat, 28 Mar 2026 22:35:25 +0100
Subject: [PATCH 4/4] fix: address review feedback - split psadbw512 from
 constexpr block

Split psadbw512 out of the shared BuiltinsX86.td let-block with
dbpsadbw512 to avoid erroneously marking psadbw512 as Constexpr.
The psadbw512 builtin does not have constexpr evaluation support,
so it should not be tagged with Constexpr.

This addresses RKSimon's review feedback about missing/incorrect
Constexpr tags. The three code suggestions from tbaederr were
already addressed in a prior commit:
- Use popToUInt64 instead of popToAPSInt for the immediate value
- Use != instead of < in the lane loop comparison
- Remove unnecessary .toAPSInt().getZExtValue() from element access

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/include/clang/Basic/BuiltinsX86.td | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index e54f8d66843bf..59c79bf681103 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3207,6 +3207,9 @@ let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, Req
 
 let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
 }