[clang] [Clang] Allow VDBPSADBW intrinsics in constexpr (PR #188887)

Sun Mar 29 14:21:37 PDT 2026

https://github.com/pierluigilenoci updated https://github.com/llvm/llvm-project/pull/188887

>From fa3f5ac7567fde45327eeaa6fa429bcfd4150592 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Fri, 27 Mar 2026 01:43:36 +0100
Subject: [PATCH 1/5] [Clang] Allow VDBPSADBW intrinsics to be used in
 constexpr

Add constexpr evaluation support for the VDBPSADBW (Double Block Packed
Sum-Absolute-Differences) intrinsics (__builtin_ia32_dbpsadbw128/256/512)
in both the tree-based constant evaluator (ExprConstant.cpp) and the
bytecode constexpr interpreter (InterpBuiltin.cpp).

The VDBPSADBW instruction computes the sum of absolute differences of
groups of 4 unsigned bytes from the second source against two 4-byte
reference blocks selected from the first source by the immediate operand.
Per 128-bit lane, imm8[1:0] selects blockA and imm8[3:2] selects blockB
from the first source. For each group of 4 bytes in the second source,
two SAD values are computed (one against each block), producing 8 result
words per 128-bit lane.

Care is taken to treat input bytes as unsigned (the builtin signature
uses signed char vectors) by extracting via getZExtValue() and casting
to uint8_t before computing absolute differences.

Fixes #188747

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 64 ++++++++++++++++++++
 clang/lib/AST/ExprConstant.cpp               | 57 +++++++++++++++++
 clang/test/CodeGen/X86/avx512bw-builtins.c   | 23 +++++--
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 58 +++++++++++++++---
 4 files changed, 190 insertions(+), 12 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 214013396e885..d9a14f84e4a8a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2820,6 +2820,65 @@ static bool interp__builtin_ia32_pmul(
   return true;
 }
 
+static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
+                                          const CallExpr *Call) {
+  assert(Call->getNumArgs() == 3);
+  QualType Arg2Type = Call->getArg(2)->getType();
+  APSInt ImmVal = popToAPSInt(S, Arg2Type);
+  unsigned Imm = ImmVal.getZExtValue();
+
+  const Pointer &Src2 = S.Stk.pop<Pointer>();
+  const Pointer &Src1 = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  const auto *SrcVT = Call->getArg(0)->getType()->castAs<VectorType>();
+  PrimType SrcElemT = *S.getContext().classify(SrcVT->getElementType());
+  unsigned SourceLen = SrcVT->getNumElements();
+
+  const auto *DestVT = Call->getType()->castAs<VectorType>();
+  PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
+  bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+  unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+  unsigned NumLanes = SourceLen / LaneSize;
+  unsigned BlockOffsetA = (Imm & 0x3) * 4;
+  unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+  unsigned DstIdx = 0;
+  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+    unsigned LaneStart = Lane * LaneSize;
+
+    for (unsigned J = 0; J < 4; ++J) {
+      unsigned SadA = 0;
+      unsigned SadB = 0;
+      for (unsigned K = 0; K < 4; ++K) {
+        unsigned A1Val, A2Val, BVal;
+        INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+          // Treat as unsigned bytes
+          A1Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+          A2Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+          BVal = static_cast<uint8_t>(
+              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+        });
+        SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
+        SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
+      }
+      INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
+        Dst.elem<T>(DstIdx) =
+            static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned));
+        Dst.elem<T>(DstIdx + 1) =
+            static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned));
+      });
+      DstIdx += 2;
+    }
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp_builtin_horizontal_int_binop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
@@ -4861,6 +4920,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                  (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
         });
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512:
+    return interp__builtin_ia32_dbpsadbw(S, OpPC, Call);
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 4f45fa728c605..fc4a4834b462a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12564,6 +12564,63 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512: {
+    APValue SourceA, SourceB, SourceImm;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceB) ||
+        !EvaluateAsRValue(Info, E->getArg(2), SourceImm))
+      return false;
+
+    unsigned SourceLen = SourceA.getVectorLength();
+    unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+    unsigned NumLanes = SourceLen / LaneSize;
+    unsigned Imm = SourceImm.getInt().getZExtValue();
+    unsigned BlockOffsetA = (Imm & 0x3) * 4;
+    unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+    auto *DestTy = E->getType()->castAs<VectorType>();
+    QualType DestEltTy = DestTy->getElementType();
+    bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+    SmallVector<APValue, 32> ResultElements;
+    ResultElements.reserve(SourceLen / 2);
+
+    for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+      unsigned LaneStart = Lane * LaneSize;
+
+      for (unsigned J = 0; J < 4; ++J) {
+        // Compute SAD of SourceB[4*J..4*J+3] vs blockA from SourceA
+        unsigned SadA = 0;
+        unsigned SadB = 0;
+        for (unsigned K = 0; K < 4; ++K) {
+          // Treat input bytes as unsigned
+          unsigned A = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
+                  .getInt()
+                  .getZExtValue());
+          unsigned B = static_cast<uint8_t>(
+              SourceB.getVectorElt(LaneStart + 4 * J + K)
+                  .getInt()
+                  .getZExtValue());
+          SadA += (B > A) ? (B - A) : (A - B);
+
+          unsigned A2 = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetB + K)
+                  .getInt()
+                  .getZExtValue());
+          SadB += (B > A2) ? (B - A2) : (A2 - B);
+        }
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadA), DestUnsigned)));
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadB), DestUnsigned)));
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 2020b72a649ae..488146e740db4 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -3230,21 +3230,36 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000
 __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
-  return _mm512_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm512_dbsad_epu8(__A, __B, 170);
+}
+// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane
+// Each lane behaves the same as the 128-bit case with matching data
+TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8(
+  ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m512i)(__v64qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52));
 
 __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 0ee14909ae805..098ee29b1989e 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -3676,41 +3676,83 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m
 __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
-  return _mm_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm_dbsad_epu8(__A, __B, 170);
+}
+// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7}
+// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12
+// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4
+// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20
+// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 12, 20, 4, 36, 20, 52, 36));
+// imm8=0: blockA=blockB=A[0..3]={0,1,2,3}
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52));
+// Test with unsigned values > 127 (signed overflow territory)
+// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30}
+// imm8=0: blockA=blockB=A[0..3]
+// SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  0), 55, 55, 375, 375, 375, 375, 375, 375));
 
 __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
+// Test masked version: mask=0x55 (keep even elements, passthrough odd)
+TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8(
+  ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 99, 20, 99, 36, 99, 52, 99));
 
 __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
+// Test zero-masked version: mask=0xAA (keep odd elements, zero even)
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 0, 12, 0, 4, 0, 20, 0, 36));
 
 __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
-  return _mm256_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm256_dbsad_epu8(__A, __B, 170);
+}
+// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3]
+// Lane 0: same as 128-bit test above
+// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4
+TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8(
+  ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}),
+  ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52));
 
 __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 __mmask8 test_mm_movepi16_mask(__m128i __A) {
   // CHECK-LABEL: test_mm_movepi16_mask

>From 8b292ead3eb31c002f6deb2e7179b1b208f6076d Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Fri, 27 Mar 2026 18:43:08 +0100
Subject: [PATCH 2/5] fix: address reviewer feedback for constexpr VDBPSADBW

- Add Constexpr tags to BuiltinsX86.td for VDBPSADBW builtins
- Update InterpBuiltin.cpp per tbaederr's suggestions:
  - Use popToUInt64 instead of popToAPSInt for immediate value
  - Use != instead of < in loop comparison
  - Simplify element access by removing unnecessary toAPSInt().getZExtValue()
- Apply clang-format fix in ExprConstant.cpp

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/include/clang/Basic/BuiltinsX86.td |  6 +++---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 12 +++++-------
 clang/lib/AST/ExprConstant.cpp           |  8 ++++----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index f47532a63de04..e54f8d66843bf 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3197,15 +3197,15 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
   def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
   def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d9a14f84e4a8a..5d46b2c595b1f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2823,9 +2823,7 @@ static bool interp__builtin_ia32_pmul(
 static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
                                           const CallExpr *Call) {
   assert(Call->getNumArgs() == 3);
-  QualType Arg2Type = Call->getArg(2)->getType();
-  APSInt ImmVal = popToAPSInt(S, Arg2Type);
-  unsigned Imm = ImmVal.getZExtValue();
+  unsigned Imm = popToUInt64(S, Call->getArg(2));
 
   const Pointer &Src2 = S.Stk.pop<Pointer>();
   const Pointer &Src1 = S.Stk.pop<Pointer>();
@@ -2845,7 +2843,7 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
   unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
 
   unsigned DstIdx = 0;
-  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
     unsigned LaneStart = Lane * LaneSize;
 
     for (unsigned J = 0; J < 4; ++J) {
@@ -2856,11 +2854,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
           A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetA + K));
           A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetB + K));
           BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+              Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index fc4a4834b462a..2a6e1713fba4d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12599,10 +12599,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
               SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
                   .getInt()
                   .getZExtValue());
-          unsigned B = static_cast<uint8_t>(
-              SourceB.getVectorElt(LaneStart + 4 * J + K)
-                  .getInt()
-                  .getZExtValue());
+          unsigned B =
+              static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K)
+                                       .getInt()
+                                       .getZExtValue());
           SadA += (B > A) ? (B - A) : (A - B);
 
           unsigned A2 = static_cast<uint8_t>(

>From b2ef04823423b38720746a63a0a661a54852de90 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Sat, 28 Mar 2026 16:50:42 +0100
Subject: [PATCH 3/5] style: apply clang-format to modified files

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5d46b2c595b1f..4ba611cf68013 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2853,12 +2853,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         unsigned A1Val, A2Val, BVal;
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
-          A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K));
-          A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K));
-          BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K));
+          A1Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K));
+          A2Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K));
+          BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);

>From bea99a2dc693330aeacf0c7a386bee072221cd5c Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Sat, 28 Mar 2026 22:35:25 +0100
Subject: [PATCH 4/5] fix: address review feedback - split psadbw512 from
 constexpr block

Split psadbw512 out of the shared BuiltinsX86.td let-block with
dbpsadbw512 to avoid erroneously marking psadbw512 as Constexpr.
The psadbw512 builtin does not have constexpr evaluation support,
so it should not be tagged with Constexpr.

This addresses RKSimon's review feedback about missing/incorrect
Constexpr tags. The three code suggestions from tbaederr were
already addressed in a prior commit:
- Use popToUInt64 instead of popToAPSInt for the immediate value
- Use != instead of < in the lane loop comparison
- Remove unnecessary .toAPSInt().getZExtValue() from element access

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/include/clang/Basic/BuiltinsX86.td | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index e54f8d66843bf..59c79bf681103 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3207,6 +3207,9 @@ let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, Req
 
 let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
 }
 

>From c5f9446197ab98ad3a7415f0439c0b2bcc701b07 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
Date: Sun, 29 Mar 2026 23:21:18 +0200
Subject: [PATCH 5/5] fix: rewrite VDBPSADBW constexpr to match hardware
 behavior

The previous implementation was fundamentally incorrect: it only used
two 2-bit fields from imm8 to select two fixed blocks from src1, then
computed a simple block-vs-block SAD. The actual VDBPSADBW instruction
uses a two-phase algorithm:

Phase 1 (Shuffle): All four 2-bit fields of imm8 are used to shuffle
src2 within each 128-bit lane. Field j (imm8[2*j+1:2*j], j = 0..3)
selects which of the lane's four 4-byte src2 blocks is copied into
block position j of the shuffled lane.

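Phase 2 (Sliding SAD): Each 8-byte half of a lane produces four u16
results, each a sum of four absolute byte differences. With `base` the
byte offset of that half, results 0 and 1 compare src1[base..base+3]
against the shuffled bytes starting at base+0 and base+1, and results
2 and 3 compare src1[base+4..base+7] against the shuffled bytes
starting at base+2 and base+3. Only the shuffled-src2 windows overlap.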

The correct algorithm matches GCC's reference implementation in
gcc/testsuite/gcc.target/i386/avx512bw-vdbpsadbw-2.c and has been
verified against hardware output provided by @RKSimon:
  _mm_dbsad_epu8([0..15], [1..16], 4) = [4, 8, 4, 0, 28, 28, 44, 44]

Both ExprConstant.cpp and InterpBuiltin.cpp are updated with the same
corrected algorithm. All TEST_CONSTEXPR expected values are recomputed
to match.

Signed-off-by: Pierluigi Lenoci <pierluigi.lenoci at gmail.com>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 61 +++++++++++--------
 clang/lib/AST/ExprConstant.cpp               | 62 +++++++++++---------
 clang/test/CodeGen/X86/avx512bw-builtins.c   | 11 ++--
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 29 ++++-----
 4 files changed, 87 insertions(+), 76 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 305559b3eb025..15b5d85947433 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2840,38 +2840,51 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
   PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
   bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
 
-  unsigned LaneSize = 16; // 128-bit lane = 16 bytes
-  unsigned NumLanes = SourceLen / LaneSize;
-  unsigned BlockOffsetA = (Imm & 0x3) * 4;
-  unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
-
-  unsigned DstIdx = 0;
-  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-    unsigned LaneStart = Lane * LaneSize;
+  constexpr unsigned LaneSize = 16; // 128-bit lane = 16 bytes
 
+  // Phase 1: Shuffle Src2 using all four 2-bit fields of imm8.
+  // Within each 128-bit lane, for group j (0..3), select a 4-byte block
+  // from Src2 based on bits [2*j+1:2*j] of imm8.
+  uint8_t Shuffled[64]; // max 512-bit = 64 bytes
+  for (unsigned I = 0; I < SourceLen; I += LaneSize) {
     for (unsigned J = 0; J < 4; ++J) {
-      unsigned SadA = 0;
-      unsigned SadB = 0;
+      unsigned Part = (Imm >> (2 * J)) & 3;
       for (unsigned K = 0; K < 4; ++K) {
-        unsigned A1Val, A2Val, BVal;
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
-          // Treat as unsigned bytes
-          A1Val =
-              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K));
-          A2Val =
-              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K));
-          BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K));
+          Shuffled[I + 4 * J + K] =
+              static_cast<uint8_t>(Src2.elem<T>(I + 4 * Part + K));
         });
-        SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
-        SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
       }
+    }
+  }
+
+  // Phase 2: Sliding SAD computation.
+  // For every group of 4 output u16 values, compute absolute differences
+  // using overlapping windows into Src1 and the shuffled array.
+  unsigned Size = SourceLen / 2; // number of output u16 elements
+  unsigned DstIdx = 0;
+  for (unsigned I = 0; I < Size; I += 4) {
+    unsigned Sad[4] = {0, 0, 0, 0};
+    for (unsigned J = 0; J < 4; ++J) {
+      uint8_t A1, A2;
+      INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+        A1 = static_cast<uint8_t>(Src1.elem<T>(2 * I + J));
+        A2 = static_cast<uint8_t>(Src1.elem<T>(2 * I + J + 4));
+      });
+      uint8_t B0 = Shuffled[2 * I + J];
+      uint8_t B1 = Shuffled[2 * I + J + 1];
+      uint8_t B2 = Shuffled[2 * I + J + 2];
+      uint8_t B3 = Shuffled[2 * I + J + 3];
+      Sad[0] += (A1 > B0) ? (A1 - B0) : (B0 - A1);
+      Sad[1] += (A1 > B1) ? (A1 - B1) : (B1 - A1);
+      Sad[2] += (A2 > B2) ? (A2 - B2) : (B2 - A2);
+      Sad[3] += (A2 > B3) ? (A2 - B3) : (B3 - A2);
+    }
+    for (unsigned R = 0; R < 4; ++R) {
       INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
-        Dst.elem<T>(DstIdx) =
-            static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned));
-        Dst.elem<T>(DstIdx + 1) =
-            static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned));
+        Dst.elem<T>(DstIdx++) =
+            static_cast<T>(APSInt(APInt(16, Sad[R]), DestUnsigned));
       });
-      DstIdx += 2;
     }
   }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2a6e1713fba4d..cc8b3d8e61b93 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12574,11 +12574,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return false;
 
     unsigned SourceLen = SourceA.getVectorLength();
-    unsigned LaneSize = 16; // 128-bit lane = 16 bytes
-    unsigned NumLanes = SourceLen / LaneSize;
+    constexpr unsigned LaneSize = 16; // 128-bit lane = 16 bytes
     unsigned Imm = SourceImm.getInt().getZExtValue();
-    unsigned BlockOffsetA = (Imm & 0x3) * 4;
-    unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
 
     auto *DestTy = E->getType()->castAs<VectorType>();
     QualType DestEltTy = DestTy->getElementType();
@@ -12586,38 +12583,47 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     SmallVector<APValue, 32> ResultElements;
     ResultElements.reserve(SourceLen / 2);
 
-    for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
-      unsigned LaneStart = Lane * LaneSize;
-
+    // Phase 1: Shuffle SourceB using all four 2-bit fields of imm8.
+    // Within each 128-bit lane, for group j (0..3), select a 4-byte block
+    // from SourceB based on bits [2*j+1:2*j] of imm8.
+    SmallVector<uint8_t, 64> Shuffled(SourceLen);
+    for (unsigned I = 0; I < SourceLen; I += LaneSize) {
       for (unsigned J = 0; J < 4; ++J) {
-        // Compute SAD of SourceB[4*J..4*J+3] vs blockA from SourceA
-        unsigned SadA = 0;
-        unsigned SadB = 0;
+        unsigned Part = (Imm >> (2 * J)) & 3;
         for (unsigned K = 0; K < 4; ++K) {
-          // Treat input bytes as unsigned
-          unsigned A = static_cast<uint8_t>(
-              SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
-                  .getInt()
-                  .getZExtValue());
-          unsigned B =
-              static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K)
-                                       .getInt()
-                                       .getZExtValue());
-          SadA += (B > A) ? (B - A) : (A - B);
-
-          unsigned A2 = static_cast<uint8_t>(
-              SourceA.getVectorElt(LaneStart + BlockOffsetB + K)
+          Shuffled[I + 4 * J + K] = static_cast<uint8_t>(
+              SourceB.getVectorElt(I + 4 * Part + K)
                   .getInt()
                   .getZExtValue());
-          SadB += (B > A2) ? (B - A2) : (A2 - B);
         }
-        ResultElements.push_back(
-            APValue(APSInt(APInt(16, SadA), DestUnsigned)));
-        ResultElements.push_back(
-            APValue(APSInt(APInt(16, SadB), DestUnsigned)));
       }
     }
 
+    // Phase 2: Sliding SAD computation.
+    // For every group of 4 output u16 values, compute absolute differences
+    // using overlapping windows into SourceA and the shuffled array.
+    unsigned Size = SourceLen / 2; // number of output u16 elements
+    for (unsigned I = 0; I < Size; I += 4) {
+      unsigned Sad[4] = {0, 0, 0, 0};
+      for (unsigned J = 0; J < 4; ++J) {
+        uint8_t A1 = static_cast<uint8_t>(
+            SourceA.getVectorElt(2 * I + J).getInt().getZExtValue());
+        uint8_t A2 = static_cast<uint8_t>(
+            SourceA.getVectorElt(2 * I + J + 4).getInt().getZExtValue());
+        uint8_t B0 = Shuffled[2 * I + J];
+        uint8_t B1 = Shuffled[2 * I + J + 1];
+        uint8_t B2 = Shuffled[2 * I + J + 2];
+        uint8_t B3 = Shuffled[2 * I + J + 3];
+        Sad[0] += (A1 > B0) ? (A1 - B0) : (B0 - A1);
+        Sad[1] += (A1 > B1) ? (A1 - B1) : (B1 - A1);
+        Sad[2] += (A2 > B2) ? (A2 - B2) : (B2 - A2);
+        Sad[3] += (A2 > B3) ? (A2 - B3) : (B3 - A2);
+      }
+      for (unsigned R = 0; R < 4; ++R)
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, Sad[R]), DestUnsigned)));
+    }
+
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 488146e740db4..b9fc4fa3f7ab9 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -3232,8 +3232,7 @@ __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   return _mm512_dbsad_epu8(__A, __B, 170);
 }
-// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane
-// Each lane behaves the same as the 128-bit case with matching data
+// 512-bit: 4 lanes, imm8=0: every shuffle slot takes its lane's src2 block 0
 TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8(
   ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -3243,10 +3242,10 @@ TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8(
                      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
-  0), 4, 4, 20, 20, 36, 36, 52, 52,
-      4, 4, 20, 20, 36, 36, 52, 52,
-      4, 4, 20, 20, 36, 36, 52, 52,
-      4, 4, 20, 20, 36, 36, 52, 52));
+  0), 4, 8, 12, 12, 28, 28, 44, 44,
+      4, 8, 12, 12, 28, 28, 44, 44,
+      4, 8, 12, 12, 28, 28, 44, 44,
+      4, 8, 12, 12, 28, 28, 44, 44));
 
 __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 098ee29b1989e..2e148278a7cf2 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -3678,28 +3678,23 @@ __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   return _mm_dbsad_epu8(__A, __B, 170);
 }
-// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7}
-// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12
-// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4
-// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20
-// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36
+// imm8=4 (0b00000100): shuffle selects src2 blocks [0,1,0,0] per lane, giving
+// tmp={1,2,3,4,5,6,7,8,1,2,3,4,1,2,3,4}; sliding SADs of src1 vs tmp follow.
 TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
   ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
   ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
-  4), 4, 12, 20, 4, 36, 20, 52, 36));
-// imm8=0: blockA=blockB=A[0..3]={0,1,2,3}
+  4), 4, 8, 4, 0, 28, 28, 44, 44));
+// imm8=0: all four 2-bit fields select block 0 from src2
 TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
   ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
   ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
-  0), 4, 4, 20, 20, 36, 36, 52, 52));
+  0), 4, 8, 12, 12, 28, 28, 44, 44));
 // Test with unsigned values > 127 (signed overflow territory)
-// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30}
-// imm8=0: blockA=blockB=A[0..3]
-// SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55
+// imm8=0: all shuffle groups select src2[0..3]={180,120,40,30}
 TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
   ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
   ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
-  0), 55, 55, 375, 375, 375, 375, 375, 375));
+  0), 55, 315, 370, 370, 370, 370, 370, 370));
 
 __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dbsad_epu8
@@ -3712,7 +3707,7 @@ TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8(
   ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55,
   ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
   ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
-  4), 4, 99, 20, 99, 36, 99, 52, 99));
+  4), 4, 99, 4, 99, 28, 99, 44, 99));
 
 __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dbsad_epu8
@@ -3724,22 +3719,20 @@ __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
 TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA,
   ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
   ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
-  4), 0, 12, 0, 4, 0, 20, 0, 36));
+  4), 0, 8, 0, 0, 0, 28, 0, 44));
 
 __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   return _mm256_dbsad_epu8(__A, __B, 170);
 }
-// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3]
-// Lane 0: same as 128-bit test above
-// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4
+// 256-bit: 2 lanes, imm8=0: every shuffle slot takes its lane's src2 block 0
 TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8(
   ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}),
   ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                      17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),
-  0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52));
+  0), 4, 8, 12, 12, 28, 28, 44, 44, 4, 8, 12, 12, 28, 28, 44, 44));
 
 __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dbsad_epu8