[clang] [161340] constexpr support for VNNI Intrinsics (PR #190525)

Akash Deo via cfe-commits cfe-commits at lists.llvm.org
Sun Apr 5 07:08:56 PDT 2026


https://github.com/AkashDeoNU created https://github.com/llvm/llvm-project/pull/190525

None

>From 650bd9f12f24a7dc6c7d8d9b25fba7111092fed9 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 17:27:23 -0500
Subject: [PATCH 01/18] modifying builtins.td, avx512vnniintrin.h, and part of
 interpreter

---
 clang/include/clang/Basic/BuiltinsX86.td | 24 ++++-----
 clang/lib/AST/ExprConstant.cpp           | 69 ++++++++++++++++++++++++
 clang/lib/Headers/avx512vnniintrin.h     | 29 +++++-----
 3 files changed, 97 insertions(+), 25 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 09b4d1c9970fd..27950ce8a263c 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1075,51 +1075,51 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto
   def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 429fef0a1afa8..3c79ca0db1bbf 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14546,6 +14546,75 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
             }))
       return false;
     return Success(R, E);
+  }
+	// Signed
+  case X86::BI__builtin_ia32_vpdpwssd128:
+  case X86::BI__builtin_ia32_vpdpwssd256:
+  case X86::BI__builtin_ia32_vpdpwssd512:
+  case X86::BI__builtin_ia32_vpdpwssds128:
+  case X86::BI__builtin_ia32_vpdpwssds256:
+  case X86::BI__builtin_ia32_vpdpwssds512:
+	// Get the number of elements from the 0th arg
+	// Get the bitmask over each lane
+	// Result SmallVector
+	// return the Result SmallVector
+
+	// Unsigned
+  case X86::BI__builtin_ia32_vpdpbusds128:
+  case X86::BI__builtin_ia32_vpdpbusds256:
+  case X86::BI__builtin_ia32_vpdpbusds512:
+  case X86::BI__builtin_ia32_vpdpbusd128:
+  case X86::BI__builtin_ia32_vpdpbusd256:
+  case X86::BI__builtin_ia32_vpdpbusd512: {
+	llvm::outs() << "DEBUG: constexpr evaluator is firing properly\n";
+    
+    APValue Source, OperandA, OperandB;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Source) ||
+        !EvaluateAsRValue(Info, E->getArg(1), OperandA) ||
+        !EvaluateAsRValue(Info, E->getArg(2), OperandB)) {
+	  llvm::outs() << "DEBUG: an operand was rejected when evaluated\n";
+	  return false;
+    }
+    
+    unsigned NumElements = Source.getVectorLength();
+
+    SmallVector<APValue, 64> Result;
+    Result.reserve(NumElements);
+
+    for (unsigned I = 0; I < NumElements; ++I) {
+	  Result.push_back(APValue(0)); 
+    }
+    
+	return Success(APValue(Result.data(), Result.size()), E);
+
+    /**
+ case clang::X86::BI__builtin_ia32_addsubps256: {
+    // Addsub: alternates between subtraction and addition
+    // Result[i] = (i % 2 == 0) ? (a[i] - b[i]) : (a[i] + b[i])
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    unsigned NumElems = SourceLHS.getVectorLength();
+    SmallVector<APValue, 8> ResultElements;
+    ResultElements.reserve(NumElems);
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
+
+    for (unsigned I = 0; I != NumElems; ++I) {
+      APFloat LHS = SourceLHS.getVectorElt(I).getFloat();
+      APFloat RHS = SourceRHS.getVectorElt(I).getFloat();
+      if (I % 2 == 0) {
+        // Even indices: subtract
+        LHS.subtract(RHS, RM);
+      } else {
+        // Odd indices: add
+        LHS.add(RHS, RM);
+      }
+      ResultElements.push_back(APValue(LHS));
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+	 */
   }
   }
 }
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index 2ce88efe4a04f..2d74987df15c6 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -19,14 +19,16 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"),     \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qu)__A,
-                                             (__v64qi)__B);
+                                             (__v64qi)__B;)
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -34,7 +36,7 @@ _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                     (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -42,14 +44,14 @@ _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                     (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v64qu)__A,
                                               (__v64qi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -57,7 +59,7 @@ _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                    (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -65,14 +67,14 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                    (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
                                              (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -80,7 +82,7 @@ _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                     (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -88,14 +90,14 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                     (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
                                               (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -103,7 +105,7 @@ _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                    (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -112,5 +114,6 @@ _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif

>From 74fc954c9b2c7076083c50e1cac9e3170928abe0 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 17:56:26 -0500
Subject: [PATCH 02/18] quick dummy fix

---
 clang/lib/AST/ExprConstant.cpp       | 2 +-
 clang/lib/Headers/avx512vnniintrin.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3c79ca0db1bbf..f9a62a01460e2 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14582,7 +14582,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     Result.reserve(NumElements);
 
     for (unsigned I = 0; I < NumElements; ++I) {
-	  Result.push_back(APValue(0)); 
+	  Result.push_back(APValue(APSInt(APInt(32, 0))));
     }
     
 	return Success(APValue(Result.data(), Result.size()), E);
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index 2d74987df15c6..f7f017ee26dfa 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -25,7 +25,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qu)__A,
-                                             (__v64qi)__B;)
+                                             (__v64qi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR

>From ff2a8765d7dffd816575de27d4494377d061fe66 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 18:57:39 -0500
Subject: [PATCH 03/18] core logic is done

---
 clang/lib/AST/ExprConstant.cpp | 66 ++++++++++++++--------------------
 1 file changed, 26 insertions(+), 40 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index f9a62a01460e2..8fd95d557ca7b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14547,34 +14547,43 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return false;
     return Success(R, E);
   }
-	// Signed
   case X86::BI__builtin_ia32_vpdpwssd128:
   case X86::BI__builtin_ia32_vpdpwssd256:
   case X86::BI__builtin_ia32_vpdpwssd512:
   case X86::BI__builtin_ia32_vpdpwssds128:
   case X86::BI__builtin_ia32_vpdpwssds256:
   case X86::BI__builtin_ia32_vpdpwssds512:
-	// Get the number of elements from the 0th arg
-	// Get the bitmask over each lane
-	// Result SmallVector
-	// return the Result SmallVector
-
-	// Unsigned
   case X86::BI__builtin_ia32_vpdpbusds128:
   case X86::BI__builtin_ia32_vpdpbusds256:
   case X86::BI__builtin_ia32_vpdpbusds512:
   case X86::BI__builtin_ia32_vpdpbusd128:
   case X86::BI__builtin_ia32_vpdpbusd256:
   case X86::BI__builtin_ia32_vpdpbusd512: {
-	llvm::outs() << "DEBUG: constexpr evaluator is firing properly\n";
-    
+    // TODO: Before I make a PR, I should encapsulate all this into a
+    // lambda at the top of the function. 
+    bool IsByteDot =
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd128  ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd256  ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd512  ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
+    bool IsSaturating =
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds128 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds256 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds512 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
+        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
+
     APValue Source, OperandA, OperandB;
     if (!EvaluateAsRValue(Info, E->getArg(0), Source) ||
         !EvaluateAsRValue(Info, E->getArg(1), OperandA) ||
         !EvaluateAsRValue(Info, E->getArg(2), OperandB)) {
-	  llvm::outs() << "DEBUG: an operand was rejected when evaluated\n";
 	  return false;
     }
+
+    // Assume IsByteDot == true and IsSaturating == false
     
     unsigned NumElements = Source.getVectorLength();
 
@@ -14582,39 +14591,16 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     Result.reserve(NumElements);
 
     for (unsigned I = 0; I < NumElements; ++I) {
-	  Result.push_back(APValue(APSInt(APInt(32, 0))));
+	  APSInt DotProduct = Source.getVectorElt(I).getInt();
+      for (unsigned J = 0; J < 4; ++J) {
+        APSInt OpA = APSInt(OperandA.getVectorElt(4*I+J).getInt().zext(16), false);
+        APSInt OpB = APSInt(OperandB.getVectorElt(4*I+J).getInt().sext(16), false);
+		DotProduct += (OpA * OpB);
+	  }
+	  Result.push_back(APValue(DotProduct));
     }
     
 	return Success(APValue(Result.data(), Result.size()), E);
-
-    /**
- case clang::X86::BI__builtin_ia32_addsubps256: {
-    // Addsub: alternates between subtraction and addition
-    // Result[i] = (i % 2 == 0) ? (a[i] - b[i]) : (a[i] + b[i])
-    APValue SourceLHS, SourceRHS;
-    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
-        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
-      return false;
-    unsigned NumElems = SourceLHS.getVectorLength();
-    SmallVector<APValue, 8> ResultElements;
-    ResultElements.reserve(NumElems);
-    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
-
-    for (unsigned I = 0; I != NumElems; ++I) {
-      APFloat LHS = SourceLHS.getVectorElt(I).getFloat();
-      APFloat RHS = SourceRHS.getVectorElt(I).getFloat();
-      if (I % 2 == 0) {
-        // Even indices: subtract
-        LHS.subtract(RHS, RM);
-      } else {
-        // Odd indices: add
-        LHS.add(RHS, RM);
-      }
-      ResultElements.push_back(APValue(LHS));
-    }
-    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
-  }
-	 */
   }
   }
 }

>From 2c219f03d42e9983ea4c4a1cde6320a23460e858 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 19:20:33 -0500
Subject: [PATCH 04/18] saturating

---
 clang/lib/AST/ExprConstant.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 8fd95d557ca7b..615c7b6ea5b4f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14591,15 +14591,22 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     Result.reserve(NumElements);
 
     for (unsigned I = 0; I < NumElements; ++I) {
-	  APSInt DotProduct = Source.getVectorElt(I).getInt();
+      APSInt DotProduct = Source.getVectorElt(I).getInt();
+      if (IsSaturating) {
+		DotProduct = DotProduct.sext(64);
+      }
       for (unsigned J = 0; J < 4; ++J) {
         APSInt OpA = APSInt(OperandA.getVectorElt(4*I+J).getInt().zext(16), false);
-        APSInt OpB = APSInt(OperandB.getVectorElt(4*I+J).getInt().sext(16), false);
-		DotProduct += (OpA * OpB);
+        APSInt OpB =
+            APSInt(OperandB.getVectorElt(4 * I + J).getInt().sext(16), false);
+        DotProduct += APSInt((OpA * OpB).sext(64), false);
+      }
+      if (IsSaturating) {
+        DotProduct = APSInt(DotProduct.truncSSat(32), false);
 	  }
 	  Result.push_back(APValue(DotProduct));
     }
-    
+
 	return Success(APValue(Result.data(), Result.size()), E);
   }
   }

>From ec0bbaf82b38a1189df29c3a7f4b2a22c4992ae9 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 19:22:29 -0500
Subject: [PATCH 05/18] the isbyte / word thing is done too, but it's unreadable

---
 clang/lib/AST/ExprConstant.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 615c7b6ea5b4f..c8c1b3a9a8816 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14595,10 +14595,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       if (IsSaturating) {
 		DotProduct = DotProduct.sext(64);
       }
-      for (unsigned J = 0; J < 4; ++J) {
-        APSInt OpA = APSInt(OperandA.getVectorElt(4*I+J).getInt().zext(16), false);
-        APSInt OpB =
-            APSInt(OperandB.getVectorElt(4 * I + J).getInt().sext(16), false);
+      unsigned Iters = IsByteDot ? 4 : 2;
+      for (unsigned J = 0; J < Iters; ++J) {
+        APSInt OpA = IsByteDot
+            ? APSInt(OperandA.getVectorElt(Iters*I+J).getInt().zext(16), false)
+            : APSInt(OperandA.getVectorElt(Iters*I+J).getInt(), false);
+        APSInt OpB = APSInt(OperandB.getVectorElt(Iters*I+J).getInt().sext(16), false);
         DotProduct += APSInt((OpA * OpB).sext(64), false);
       }
       if (IsSaturating) {

>From 50ec2ab1fe74681964b4c07b2da3982646e6b7a5 Mon Sep 17 00:00:00 2001
From: AkashDeoNU <AkashDeo2025 at u.Northwestern.edu>
Date: Tue, 10 Mar 2026 21:29:54 -0500
Subject: [PATCH 06/18] draft of the bytecode interpreter code

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 40 ++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index abc746af81306..936fefcad2309 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -5953,6 +5953,46 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return EvalScalarMinMaxFp(A, B, RoundingMode, /*IsMin=*/false);
         },
         /*IsScalar=*/true);
+  case X86::BI__builtin_ia32_vpdpwssd128:
+  case X86::BI__builtin_ia32_vpdpwssd256:
+  case X86::BI__builtin_ia32_vpdpwssd512:
+  case X86::BI__builtin_ia32_vpdpwssds128:
+  case X86::BI__builtin_ia32_vpdpwssds256:
+  case X86::BI__builtin_ia32_vpdpwssds512:
+  case X86::BI__builtin_ia32_vpdpbusds128:
+  case X86::BI__builtin_ia32_vpdpbusds256:
+  case X86::BI__builtin_ia32_vpdpbusds512:
+  case X86::BI__builtin_ia32_vpdpbusd128:
+  case X86::BI__builtin_ia32_vpdpbusd256:
+  case X86::BI__builtin_ia32_vpdpbusd512: {
+	bool IsByteDot =
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd128  ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd256  ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd512  ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
+    bool IsSaturating =
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds128 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds256 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds512 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
+	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
+    return interp__builtin_elementwise_triop(
+        S, OpPC, Call,
+        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
+          APSInt DotProduct = Source;
+          unsigned Iters = 4;
+          unsigned Shift = 8;
+          for (unsigned J = 0; J < Iters; ++J) {
+            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).zext(16), false);
+            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(16), false);
+			DotProduct += APSInt((OpA * OpB).sext(32), false);
+		  }
+          return DotProduct;
+        });
+  }
 
   default:
     S.FFDiag(S.Current->getLocation(OpPC),

>From e3a9857ccfb26c981336b95737c6cd367cce07ab Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Wed, 11 Mar 2026 22:18:51 -0500
Subject: [PATCH 07/18] interim changes to InterpBuiltin.cpp and
 ExprConstant.cpp

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 61 ++++++++++++++++++------
 clang/lib/AST/ExprConstant.cpp           |  2 +-
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 936fefcad2309..657d83fd39e9b 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -5956,30 +5956,61 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_vpdpwssd128:
   case X86::BI__builtin_ia32_vpdpwssd256:
   case X86::BI__builtin_ia32_vpdpwssd512:
+	// !IsByte, !IsSaturated
+	return interp__builtin_elementwise_triop(
+        S, OpPC, Call,
+        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
+          APSInt DotProduct = Source;
+          unsigned Iters = 2;
+          unsigned Shift = 16;
+          for (unsigned J = 0; J < Iters; ++J) {
+            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).sext(32), false);
+            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(32), false);
+			DotProduct += APSInt((OpA * OpB).sext(32), false);
+		  }
+          return DotProduct;
+        });
   case X86::BI__builtin_ia32_vpdpwssds128:
   case X86::BI__builtin_ia32_vpdpwssds256:
   case X86::BI__builtin_ia32_vpdpwssds512:
+	// !IsByte, IsSaturated
+	return interp__builtin_elementwise_triop(
+        S, OpPC, Call,
+        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
+          APSInt DotProduct = APSInt(Source.sext(64), false);
+          unsigned Iters = 2;
+          unsigned Shift = 16;
+          for (unsigned J = 0; J < Iters; ++J) {
+            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).sext(32), false);
+            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(32), false);
+			DotProduct += APSInt((OpA * OpB).sext(32), false);
+          }
+		  DotProduct = DotProduct.truncSSat(32);
+          return DotProduct;
+        });
   case X86::BI__builtin_ia32_vpdpbusds128:
   case X86::BI__builtin_ia32_vpdpbusds256:
   case X86::BI__builtin_ia32_vpdpbusds512:
+	// IsByte, IsSaturated
+	return interp__builtin_elementwise_triop(
+        S, OpPC, Call,
+        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
+          APSInt DotProduct = APSInt(Source.sext(64), false);
+          unsigned Iters = 4;
+          unsigned Shift = 8;
+          for (unsigned J = 0; J < Iters; ++J) {
+            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).zext(16), false);
+            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(16), false);
+			DotProduct += APSInt((OpA * OpB).sext(32), false);
+          }
+		  DotProduct = DotProduct.truncSSat(32);
+          return DotProduct;
+        });
   case X86::BI__builtin_ia32_vpdpbusd128:
   case X86::BI__builtin_ia32_vpdpbusd256:
   case X86::BI__builtin_ia32_vpdpbusd512: {
-	bool IsByteDot =
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd128  ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd256  ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd512  ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
-    bool IsSaturating =
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds128 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds256 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds512 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
-	  Call->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
-    return interp__builtin_elementwise_triop(
+	// IsByte, !IsSaturated
+	return interp__builtin_elementwise_triop(
         S, OpPC, Call,
         [](const APSInt &Source, const APSInt &A, const APSInt &B) {
           APSInt DotProduct = Source;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index c8c1b3a9a8816..da204b2f924dd 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14587,7 +14587,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     
     unsigned NumElements = Source.getVectorLength();
 
-    SmallVector<APValue, 64> Result;
+    SmallVector<APValue, 16> Result;
     Result.reserve(NumElements);
 
     for (unsigned I = 0; I < NumElements; ++I) {

>From a1bfc241c33b0882f4567e8459af7297def80b50 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 11:52:39 -0500
Subject: [PATCH 08/18] expr constant cpp

---
 clang/lib/AST/ExprConstant.cpp | 61 +++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 23 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 345e2073304ac..6bac9e3b54b10 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14646,22 +14646,36 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
   case X86::BI__builtin_ia32_vpdpbusd128:
   case X86::BI__builtin_ia32_vpdpbusd256:
   case X86::BI__builtin_ia32_vpdpbusd512: {
-    // TODO: Before I make a PR, I should encapsulate all this into a
-    // lambda at the top of the function. 
-    bool IsByteDot =
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd128  ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd256  ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusd512  ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
-    bool IsSaturating =
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds128 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds256 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpwssds512 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds128 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds256 ||
-        E->getBuiltinCallee() == X86::BI__builtin_ia32_vpdpbusds512;
+    unsigned BuiltinID = E->getBuiltinCallee();
+    bool IsDottingWord = false;
+    bool IsSaturating = false;
+    switch (BuiltinID) {
+    case X86::BI__builtin_ia32_vpdpwssd128:
+	case X86::BI__builtin_ia32_vpdpwssd256:
+	case X86::BI__builtin_ia32_vpdpwssd512:
+	  IsDottingWord = true;
+	  IsSaturating = false;
+	  break;
+	case X86::BI__builtin_ia32_vpdpwssds128:
+	case X86::BI__builtin_ia32_vpdpwssds256:
+	case X86::BI__builtin_ia32_vpdpwssds512:
+	  IsDottingWord = true;
+	  IsSaturating = true;
+	  break;
+	case X86::BI__builtin_ia32_vpdpbusds128:
+	case X86::BI__builtin_ia32_vpdpbusds256:
+	case X86::BI__builtin_ia32_vpdpbusds512:
+	  IsDottingWord = false;
+	  IsSaturating = true;
+	  break;
+	case X86::BI__builtin_ia32_vpdpbusd128:
+	case X86::BI__builtin_ia32_vpdpbusd256:
+	case X86::BI__builtin_ia32_vpdpbusd512:
+	  IsDottingWord = false;
+	  IsSaturating = false;
+	  break;
+    }
+	
 
     APValue Source, OperandA, OperandB;
     if (!EvaluateAsRValue(Info, E->getArg(0), Source) ||
@@ -14670,23 +14684,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 	  return false;
     }
 
-    // Assume IsByteDot == true and IsSaturating == false
-    
+
     unsigned NumElements = Source.getVectorLength();
 
     SmallVector<APValue, 16> Result;
     Result.reserve(NumElements);
-
+	unsigned Iters = IsDottingWord ? 2 : 4;
     for (unsigned I = 0; I < NumElements; ++I) {
       APSInt DotProduct = Source.getVectorElt(I).getInt();
       if (IsSaturating) {
 		DotProduct = DotProduct.sext(64);
       }
-      unsigned Iters = IsByteDot ? 4 : 2;
       for (unsigned J = 0; J < Iters; ++J) {
-        APSInt OpA = IsByteDot
-            ? APSInt(OperandA.getVectorElt(Iters*I+J).getInt().zext(16), false)
-            : APSInt(OperandA.getVectorElt(Iters*I+J).getInt(), false);
+        APSInt OpA;
+        if (IsDottingWord) {
+		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt(), false);
+        } else {
+		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt().zext(16), true);
+		}
         APSInt OpB = APSInt(OperandB.getVectorElt(Iters*I+J).getInt().sext(16), false);
         DotProduct += APSInt((OpA * OpB).sext(64), false);
       }

>From 76f53fdfcf007c0844dbb29743448d2846c93054 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 11:53:01 -0500
Subject: [PATCH 09/18] make custom helper function

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 144 ++++++++++++++---------
 1 file changed, 86 insertions(+), 58 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 435e10c190bdf..a5a8d678da82d 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4189,6 +4189,61 @@ static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+/// Implements VNNI dot-product builtins (vpdpbusd, vpdpwssd, and their
+/// saturating variants). Unlike interp__builtin_elementwise_triop, this
+/// handles operands with different element types and counts: the accumulator
+/// (Arg0) has i32 elements, while the dot-product operands (Arg1, Arg2) have
+/// smaller element types (i8/u8 or i16).
+static bool interp__builtin_ia32_vpdpwssd(InterpState &S, CodePtr OpPC,
+                                           const CallExpr *Call,
+                                           bool IsDottingWord,
+                                           bool IsSaturating) {
+  const auto *SrcVecT = Call->getArg(0)->getType()->castAs<VectorType>();
+  const auto *OpAVecT = Call->getArg(1)->getType()->castAs<VectorType>();
+  const auto *OpBVecT = Call->getArg(2)->getType()->castAs<VectorType>();
+  PrimType SrcElemT = *S.getContext().classify(SrcVecT->getElementType());
+  PrimType OpAElemT = *S.getContext().classify(OpAVecT->getElementType());
+  PrimType OpBElemT = *S.getContext().classify(OpBVecT->getElementType());
+  unsigned NumElements = SrcVecT->getNumElements();
+  unsigned Iters = IsDottingWord ? 2 : 4;
+
+  const Pointer &OpBPtr = S.Stk.pop<Pointer>();
+  const Pointer &OpAPtr = S.Stk.pop<Pointer>();
+  const Pointer &SrcPtr = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  for (unsigned I = 0; I < NumElements; ++I) {
+    APSInt Acc;
+    INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+      Acc = SrcPtr.elem<T>(I).toAPSInt();
+    });
+    if (IsSaturating)
+      Acc = Acc.sext(64);
+    for (unsigned J = 0; J < Iters; ++J) {
+      APSInt OpA, OpB;
+      INT_TYPE_SWITCH_NO_BOOL(OpAElemT, {
+        OpA = OpAPtr.elem<T>(Iters * I + J).toAPSInt();
+      });
+      INT_TYPE_SWITCH_NO_BOOL(OpBElemT, {
+        OpB = OpBPtr.elem<T>(Iters * I + J).toAPSInt();
+      });
+      if (IsDottingWord)
+        OpA = APSInt(OpA.sext(64), false);
+      else
+        OpA = APSInt(OpA.zext(64), true);
+      OpB = APSInt(OpB.sext(64), false);
+      Acc += APSInt((OpA * OpB).sext(64), false);
+    }
+    if (IsSaturating)
+      Acc = APSInt(Acc.truncSSat(32), false);
+    INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+      Dst.elem<T>(I) = static_cast<T>(Acc);
+    });
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -6052,73 +6107,46 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_vpdpwssd128:
   case X86::BI__builtin_ia32_vpdpwssd256:
   case X86::BI__builtin_ia32_vpdpwssd512:
-	// !IsByte, !IsSaturated
-	return interp__builtin_elementwise_triop(
-        S, OpPC, Call,
-        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
-          APSInt DotProduct = Source;
-          unsigned Iters = 2;
-          unsigned Shift = 16;
-          for (unsigned J = 0; J < Iters; ++J) {
-            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).sext(32), false);
-            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(32), false);
-			DotProduct += APSInt((OpA * OpB).sext(32), false);
-		  }
-          return DotProduct;
-        });
   case X86::BI__builtin_ia32_vpdpwssds128:
   case X86::BI__builtin_ia32_vpdpwssds256:
   case X86::BI__builtin_ia32_vpdpwssds512:
-	// !IsByte, IsSaturated
-	return interp__builtin_elementwise_triop(
-        S, OpPC, Call,
-        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
-          APSInt DotProduct = APSInt(Source.sext(64), false);
-          unsigned Iters = 2;
-          unsigned Shift = 16;
-          for (unsigned J = 0; J < Iters; ++J) {
-            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).sext(32), false);
-            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(32), false);
-			DotProduct += APSInt((OpA * OpB).sext(32), false);
-          }
-		  DotProduct = DotProduct.truncSSat(32);
-          return DotProduct;
-        });
   case X86::BI__builtin_ia32_vpdpbusds128:
   case X86::BI__builtin_ia32_vpdpbusds256:
   case X86::BI__builtin_ia32_vpdpbusds512:
-	// IsByte, IsSaturated
-	return interp__builtin_elementwise_triop(
-        S, OpPC, Call,
-        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
-          APSInt DotProduct = APSInt(Source.sext(64), false);
-          unsigned Iters = 4;
-          unsigned Shift = 8;
-          for (unsigned J = 0; J < Iters; ++J) {
-            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).zext(16), false);
-            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(16), false);
-			DotProduct += APSInt((OpA * OpB).sext(32), false);
-          }
-		  DotProduct = DotProduct.truncSSat(32);
-          return DotProduct;
-        });
   case X86::BI__builtin_ia32_vpdpbusd128:
   case X86::BI__builtin_ia32_vpdpbusd256:
   case X86::BI__builtin_ia32_vpdpbusd512: {
-	// IsByte, !IsSaturated
-	return interp__builtin_elementwise_triop(
-        S, OpPC, Call,
-        [](const APSInt &Source, const APSInt &A, const APSInt &B) {
-          APSInt DotProduct = Source;
-          unsigned Iters = 4;
-          unsigned Shift = 8;
-          for (unsigned J = 0; J < Iters; ++J) {
-            APSInt OpA = APSInt(APSInt(A.lshr(J * Shift).trunc(Shift)).zext(16), false);
-            APSInt OpB = APSInt(APSInt(B.lshr(J * Shift).trunc(Shift)).sext(16), false);
-			DotProduct += APSInt((OpA * OpB).sext(32), false);
-		  }
-          return DotProduct;
-        });
+    unsigned BuiltinID = Call->getBuiltinCallee();
+    bool IsDottingWord;
+    bool IsSaturating;
+    switch (BuiltinID) {
+    case X86::BI__builtin_ia32_vpdpwssd128:
+    case X86::BI__builtin_ia32_vpdpwssd256:
+    case X86::BI__builtin_ia32_vpdpwssd512:
+      IsDottingWord = true;
+      IsSaturating = false;
+      break;
+    case X86::BI__builtin_ia32_vpdpwssds128:
+    case X86::BI__builtin_ia32_vpdpwssds256:
+    case X86::BI__builtin_ia32_vpdpwssds512:
+      IsDottingWord = true;
+      IsSaturating = true;
+      break;
+    case X86::BI__builtin_ia32_vpdpbusds128:
+    case X86::BI__builtin_ia32_vpdpbusds256:
+    case X86::BI__builtin_ia32_vpdpbusds512:
+      IsDottingWord = false;
+      IsSaturating = true;
+      break;
+    case X86::BI__builtin_ia32_vpdpbusd128:
+    case X86::BI__builtin_ia32_vpdpbusd256:
+    case X86::BI__builtin_ia32_vpdpbusd512:
+      IsDottingWord = false;
+      IsSaturating = false;
+      break;
+    }
+    return interp__builtin_ia32_vpdpwssd(S, OpPC, Call, IsDottingWord,
+                                         IsSaturating);
   }
 
   default:

>From 557c3f67a108939abbf8e52004194f8b8b7f1296 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:20:46 -0500
Subject: [PATCH 10/18] make extensions for the operands really generous

---
 clang/lib/AST/ExprConstant.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 6bac9e3b54b10..dfa8707f04285 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14698,11 +14698,11 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       for (unsigned J = 0; J < Iters; ++J) {
         APSInt OpA;
         if (IsDottingWord) {
-		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt(), false);
+		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt().sext(64), false);
         } else {
-		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt().zext(16), true);
+		  OpA = APSInt(OperandA.getVectorElt(Iters*I+J).getInt().zext(64), true);
 		}
-        APSInt OpB = APSInt(OperandB.getVectorElt(Iters*I+J).getInt().sext(16), false);
+        APSInt OpB = APSInt(OperandB.getVectorElt(Iters*I+J).getInt().sext(64), false);
         DotProduct += APSInt((OpA * OpB).sext(64), false);
       }
       if (IsSaturating) {

>From 206f48b8a98a6209862a898015141d4bcb593c41 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:21:29 -0500
Subject: [PATCH 11/18] formatting changes on the interp builtin vpdp helper

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 30 ++++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index a5a8d678da82d..21ab67237591a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4189,21 +4189,18 @@ static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-/// Implements VNNI dot-product builtins (vpdpbusd, vpdpwssd, and their
-/// saturating variants). Unlike interp__builtin_elementwise_triop, this
-/// handles operands with different element types and counts: the accumulator
-/// (Arg0) has i32 elements, while the dot-product operands (Arg1, Arg2) have
-/// smaller element types (i8/u8 or i16).
-static bool interp__builtin_ia32_vpdpwssd(InterpState &S, CodePtr OpPC,
+static bool interp__builtin_ia32_vpdp(InterpState &S, CodePtr OpPC,
                                            const CallExpr *Call,
                                            bool IsDottingWord,
                                            bool IsSaturating) {
   const auto *SrcVecT = Call->getArg(0)->getType()->castAs<VectorType>();
   const auto *OpAVecT = Call->getArg(1)->getType()->castAs<VectorType>();
   const auto *OpBVecT = Call->getArg(2)->getType()->castAs<VectorType>();
+
   PrimType SrcElemT = *S.getContext().classify(SrcVecT->getElementType());
   PrimType OpAElemT = *S.getContext().classify(OpAVecT->getElementType());
   PrimType OpBElemT = *S.getContext().classify(OpBVecT->getElementType());
+
   unsigned NumElements = SrcVecT->getNumElements();
   unsigned Iters = IsDottingWord ? 2 : 4;
 
@@ -4217,25 +4214,28 @@ static bool interp__builtin_ia32_vpdpwssd(InterpState &S, CodePtr OpPC,
     INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
       Acc = SrcPtr.elem<T>(I).toAPSInt();
     });
-    if (IsSaturating)
+    if (IsSaturating) {
       Acc = Acc.sext(64);
+	}
     for (unsigned J = 0; J < Iters; ++J) {
       APSInt OpA, OpB;
       INT_TYPE_SWITCH_NO_BOOL(OpAElemT, {
-        OpA = OpAPtr.elem<T>(Iters * I + J).toAPSInt();
+        OpA = OpAPtr.elem<T>(Iters*I+J).toAPSInt();
       });
       INT_TYPE_SWITCH_NO_BOOL(OpBElemT, {
-        OpB = OpBPtr.elem<T>(Iters * I + J).toAPSInt();
+        OpB = OpBPtr.elem<T>(Iters*I+J).toAPSInt();
       });
-      if (IsDottingWord)
+      if (IsDottingWord) {
         OpA = APSInt(OpA.sext(64), false);
-      else
-        OpA = APSInt(OpA.zext(64), true);
+	  } else {
+		OpA = APSInt(OpA.zext(64), true);
+	  }
       OpB = APSInt(OpB.sext(64), false);
       Acc += APSInt((OpA * OpB).sext(64), false);
     }
-    if (IsSaturating)
+    if (IsSaturating) {
       Acc = APSInt(Acc.truncSSat(32), false);
+	}
     INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
       Dst.elem<T>(I) = static_cast<T>(Acc);
     });
@@ -6145,8 +6145,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
       IsSaturating = false;
       break;
     }
-    return interp__builtin_ia32_vpdpwssd(S, OpPC, Call, IsDottingWord,
-                                         IsSaturating);
+    return interp__builtin_ia32_vpdp(S, OpPC, Call, IsDottingWord,
+									 IsSaturating);
   }
 
   default:

>From 4165161139868cdfeac294f8f425371f7b19899a Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:22:30 -0500
Subject: [PATCH 12/18] add c++ guard to avx512vnniintrin.h

---
 clang/lib/Headers/avx512vnniintrin.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index f7f017ee26dfa..25ea12ed51c7a 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -19,7 +19,11 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"),     \
                  __min_vector_width__(512)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
 #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)

>From 2645d19777f614d68caf0f82e5c273d43f782436 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:22:45 -0500
Subject: [PATCH 13/18] add one example test

---
 clang/test/CodeGen/X86/avx512vnni-builtins.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index 6b8465206eedb..e35cf67d17aad 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -2,8 +2,11 @@
 //  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
+//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
+//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpbusd_epi32
@@ -24,6 +27,11 @@ __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_dpbusd_epi32(__S, __A, __B);
 }
+// Each lane: 1*1 + 1*1 + 1*1 + 1*1 = 4
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4));
 
 __m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpbusds_epi32

>From 4839f4ceb2509290819b7901291dc0ae72ba22f2 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:32:27 -0500
Subject: [PATCH 14/18] avxvnniintrin.h constexpr header update

---
 clang/lib/Headers/avxvnniintrin.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index 1d2e8c906effc..661c70751b65b 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -43,6 +43,14 @@
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
 
+#if defined(__cplusplus) && (__cpluspluc >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FNATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FNATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
@@ -60,7 +68,7 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v32qu)__A,
@@ -84,7 +92,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v32qu)__A,
@@ -106,7 +114,7 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A,
@@ -128,7 +136,7 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A,
@@ -152,7 +160,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v16qu)__A,
@@ -176,7 +184,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v16qu)__A,
@@ -198,7 +206,7 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A,
@@ -220,7 +228,7 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A,

>From c7b065221a4e30fc4345aba30547cebe26f9809b Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:38:54 -0500
Subject: [PATCH 15/18] avx512vlvnniintrin.h constexpr header update

---
 clang/lib/Headers/avx512vlvnniintrin.h | 40 +++++++++++++++-----------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index 4b8a199af32e5..ff4c85c8b2a62 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -24,6 +24,14 @@
                  __target__("avx512vl,avx512vnni"),                            \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
@@ -179,7 +187,7 @@
 #define _mm_dpwssds_epi32(S, A, B)                                             \
   ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -187,7 +195,7 @@ _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                      (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -195,7 +203,7 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -203,7 +211,7 @@ _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                     (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -211,7 +219,7 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -219,7 +227,7 @@ _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                      (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -227,7 +235,7 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -235,7 +243,7 @@ _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                     (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPRZ1
 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -243,7 +251,7 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -251,7 +259,7 @@ _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                         (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -259,7 +267,7 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -267,7 +275,7 @@ _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                        (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -275,7 +283,7 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                        (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -283,7 +291,7 @@ _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                         (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -291,7 +299,7 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -299,7 +307,7 @@ _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                        (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,

>From 1aefa73905d6793b57480332d16fd52ee27feb12 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 16:53:29 -0500
Subject: [PATCH 16/18] fix headers

---
 clang/lib/Headers/avx512vlvnniintrin.h | 2 +-
 clang/lib/Headers/avxvnniintrin.h      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index ff4c85c8b2a62..34d6abd4bbff8 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -243,7 +243,7 @@ _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                     (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPRZ1
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index 661c70751b65b..16df0f2def108 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -43,11 +43,11 @@
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
 
-#if defined(__cplusplus) && (__cpluspluc >= 201103L)
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FNATTRS256 constexpr
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
 #else
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FNATTRS256
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
 #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
 #endif
 

>From 7f23bea7fac9228dfed922b17f6e407299e847ab Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sat, 4 Apr 2026 21:05:23 -0500
Subject: [PATCH 17/18] ton of tests

---
 .../test/CodeGen/X86/avx512vlvnni-builtins.c  | 344 ++++++++++++++++++
 clang/test/CodeGen/X86/avx512vnni-builtins.c  | 156 +++++++-
 clang/test/CodeGen/X86/avxvnni-builtins.c     | 286 +++++++++++++++
 3 files changed, 784 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index 11dbd717a9f77..ae068073c8f8e 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -3,7 +3,13 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpbusd_epi32
@@ -11,6 +17,14 @@ __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B);
 }
+// mask 0x55: odd lanes keep src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpbusd_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0x55,
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400, 504, 600, 704, 800));
 
 __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpbusd_epi32
@@ -18,12 +32,48 @@ __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: first 4 lanes updated (src=10, dot=4); last 4 must be zeroed, not left at src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpbusd_epi32(
+    (__mmask8)0x0F,
+    (__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  14, 14, 14, 14, 0, 0, 0, 0));
 
 __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// accumulation: src=10, dot=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  14, 14, 14, 14, 14, 14, 14, 14));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpbusds_epi32
@@ -31,6 +81,14 @@ __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAA: even lanes keep src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpbusds_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xAA,
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 204, 300, 404, 500, 604, 700, 804));
 
 __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpbusds_epi32
@@ -38,12 +96,41 @@ __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFF: all lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpbusds_epi32(
+    (__mmask8)0xFF,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
 
 __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssd_epi32
@@ -51,6 +138,14 @@ __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0xF0: last 4 lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpwssd_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xF0,
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 200, 300, 400, 502, 602, 702, 802));
 
 __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32
@@ -58,12 +153,55 @@ __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: first 4 lanes updated (src=10, dot=2); last 4 must be zeroed, not left at src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpwssd_epi32(
+    (__mmask8)0x0F,
+    (__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  12, 12, 12, 12, 0, 0, 0, 0));
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  12, 12, 12, 12, 12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssds_epi32
@@ -71,6 +209,14 @@ __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAA: odd lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpwssds_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xAA,
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402, 500, 602, 700, 802));
 
 __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32
@@ -78,12 +224,41 @@ __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFF: all lanes
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpwssds_epi32(
+    (__mmask8)0xFF,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m256i)(__v16hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpbusd_epi32
@@ -91,6 +266,14 @@ __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpbusd_epi32(__S, __U, __A, __B);
 }
+// mask 0x05: lanes 0,2 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpbusd_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x05,
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400));
 
 __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpbusd_epi32
@@ -98,12 +281,48 @@ __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x03: first 2 lanes updated (src=10, dot=4); last 2 must be zeroed, not left at src
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpbusd_epi32(
+    (__mmask8)0x03,
+    (__m128i)(__v4si){10, 10, 10, 10},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  14, 14, 0, 0));
 
 __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// accumulation: src=10, dot=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){10, 10, 10, 10}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  14, 14, 14, 14));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpbusds_epi32
@@ -111,6 +330,14 @@ __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0x0A: lanes 1,3 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpbusds_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x0A,
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 204, 300, 404));
 
 __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpbusds_epi32
@@ -118,12 +345,41 @@ __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: all 4 lanes
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpbusds_epi32(
+    (__mmask8)0x0F,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
 
 __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssd_epi32
@@ -131,6 +387,14 @@ __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0x05: lanes 0,2 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpwssd_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x05,
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  102, 200, 302, 400));
 
 __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssd_epi32
@@ -138,12 +402,55 @@ __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x03: first 2 lanes updated (src=10, dot=2); last 2 must be zeroed, not left at src
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpwssd_epi32(
+    (__mmask8)0x03,
+    (__m128i)(__v4si){10, 10, 10, 10},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  12, 12, 0, 0));
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){10, 10, 10, 10}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssds_epi32
@@ -151,6 +458,14 @@ __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0x0A: lanes 1,3 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpwssds_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x0A,
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402));
 
 __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssds_epi32
@@ -158,10 +473,39 @@ __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: all 4 lanes
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpwssds_epi32(
+    (__mmask8)0x0F,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m128i)(__v8hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index e35cf67d17aad..a0e35039a135e 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -2,8 +2,11 @@
 //  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
-//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+//  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+//  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
 #include "builtin_test_helpers.h"
@@ -27,18 +30,50 @@ __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_dpbusd_epi32(__S, __A, __B);
 }
-// Each lane: 1*1 + 1*1 + 1*1 + 1*1 = 4
+// basic: src=0, each lane is 1*1 + 1*1 + 1*1 + 1*1 = 4
 TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
   (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
   (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
   4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4));
 
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5));
+
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  (__m512i)(__v64qu){1,2,3,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){5,6,7,8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  72, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  (__m512i)(__v64qu){255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255},
+  (__m512i)(__v64qi){-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+
+// two's complement wrap: INT32_MAX + 1 wraps to INT32_MIN (written -2147483647-1 so the constant stays an int)
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusd_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v64qu){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0},
+    (__m512i)(__v64qi){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1,
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
+
 __m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpbusds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0x5555: even lanes updated, odd lanes keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpbusds_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0x5555,
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400, 504, 600, 704, 800, 904, 1000, 1104, 1200, 1304, 1400, 1504, 1600));
 
 __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpbusds_epi32
@@ -46,12 +81,41 @@ __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x00FF: first 8 lanes updated, last 8 zeroed
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpbusds_epi32(
+    (__mmask16)0x00FF,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpbusds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=4
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN: src=MIN, dot=255*(-1)*4=-1020
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1},
+    (__m512i)(__v64qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m512i)(__v64qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssd_epi32
@@ -59,6 +123,14 @@ __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0xFF00: last 8 lanes updated, first 8 keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpwssd_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0xFF00,
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 200, 300, 400, 500, 600, 700, 800, 902, 1002, 1102, 1202, 1302, 1402, 1502, 1602));
 
 __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32
@@ -66,12 +138,55 @@ __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, _
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x000F: first 4 lanes updated, rest zeroed
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpwssd_epi32(
+    (__mmask16)0x000F,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssd_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0},
+    (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssds_epi32
@@ -79,6 +194,14 @@ __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, _
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAAAA: odd lanes updated, even lanes keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpwssds_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0xAAAA,
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402, 500, 602, 700, 802, 900, 1002, 1100, 1202, 1300, 1402, 1500, 1602));
 
 __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32
@@ -86,10 +209,39 @@ __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFFFF: all lanes updated
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpwssds_epi32(
+    (__mmask16)0xFFFF,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
 
 __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN: src=MIN, dot=(-32768)*32767*2=-2147418112
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1},
+    (__m512i)(__v32hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index 6557a26807eb2..0a12a43e6cee7 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -3,100 +3,386 @@
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0},
+    (__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0},
+    (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0},
+    (__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v8hi){1,0,1,0,1,0,1,0},
+    (__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647));

>From ddfab9e4040afc9930522addfda26e6716cc2a76 Mon Sep 17 00:00:00 2001
From: Akash Deo <AkashDeo2025 at u.Northwestern.edu>
Date: Sun, 5 Apr 2026 09:03:36 -0500
Subject: [PATCH 18/18] add constexpr support for vnni intrinsics [161340]

---
 clang/include/clang/Basic/BuiltinsX86.td      |  24 +-
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  99 +++++
 clang/lib/AST/ExprConstant.cpp                |  79 ++++
 clang/lib/Headers/avx512vlvnniintrin.h        |  40 +-
 clang/lib/Headers/avx512vnniintrin.h          |  31 +-
 clang/lib/Headers/avxvnniintrin.h             |  24 +-
 .../test/CodeGen/X86/avx512vlvnni-builtins.c  | 344 ++++++++++++++++++
 clang/test/CodeGen/X86/avx512vnni-builtins.c  | 160 ++++++++
 clang/test/CodeGen/X86/avxvnni-builtins.c     | 286 +++++++++++++++
 9 files changed, 1039 insertions(+), 48 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0cab8c77d465d..342a23e1f2aab 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1075,51 +1075,51 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto
   def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index e7b3ef6ce1510..21ab67237591a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4189,6 +4189,70 @@ static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+/// Constant-evaluates the x86 VNNI dot-product builtins (vpdpbusd, vpdpbusds,
+/// vpdpwssd and vpdpwssds at every vector width).
+///
+/// The call takes (Src, A, B). Each 32-bit lane I of the result is Src[I] plus
+/// the sum of A[K] * B[K] over the Iters consecutive narrow elements
+/// K = Iters * I ... Iters * I + Iters - 1 belonging to that lane.
+///
+/// \param IsDottingWord true for the word forms (two products per lane, both
+///        operands sign-extended); false for the byte forms (four products
+///        per lane, A zero-extended and B sign-extended).
+/// \param IsSaturating true to clamp the lane result to the signed 32-bit
+///        range; false to keep only its low 32 bits.
+static bool interp__builtin_ia32_vpdp(InterpState &S, CodePtr OpPC,
+                                      const CallExpr *Call, bool IsDottingWord,
+                                      bool IsSaturating) {
+  const auto *SrcVecT = Call->getArg(0)->getType()->castAs<VectorType>();
+  const auto *OpAVecT = Call->getArg(1)->getType()->castAs<VectorType>();
+  const auto *OpBVecT = Call->getArg(2)->getType()->castAs<VectorType>();
+
+  PrimType SrcElemT = *S.getContext().classify(SrcVecT->getElementType());
+  PrimType OpAElemT = *S.getContext().classify(OpAVecT->getElementType());
+  PrimType OpBElemT = *S.getContext().classify(OpBVecT->getElementType());
+
+  unsigned NumElements = SrcVecT->getNumElements();
+  unsigned Iters = IsDottingWord ? 2 : 4;
+
+  // Arguments are popped in reverse order; the destination stays on the stack.
+  const Pointer &OpBPtr = S.Stk.pop<Pointer>();
+  const Pointer &OpAPtr = S.Stk.pop<Pointer>();
+  const Pointer &SrcPtr = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  for (unsigned I = 0; I < NumElements; ++I) {
+    APSInt Acc;
+    INT_TYPE_SWITCH_NO_BOOL(SrcElemT,
+                            { Acc = SrcPtr.elem<T>(I).toAPSInt(); });
+    // Always accumulate as a signed 64-bit value: APInt arithmetic asserts on
+    // mismatched bit widths and APSInt arithmetic on mismatched signedness,
+    // and 64 bits cannot overflow for these 8/16-bit operands.
+    Acc = APSInt(Acc.sext(64), /*isUnsigned=*/false);
+    for (unsigned J = 0; J < Iters; ++J) {
+      APSInt OpA, OpB;
+      INT_TYPE_SWITCH_NO_BOOL(
+          OpAElemT, { OpA = OpAPtr.elem<T>(Iters * I + J).toAPSInt(); });
+      INT_TYPE_SWITCH_NO_BOOL(
+          OpBElemT, { OpB = OpBPtr.elem<T>(Iters * I + J).toAPSInt(); });
+      // Bytes of A are unsigned; a zero-extended byte is non-negative, so it
+      // can be multiplied as a signed 64-bit value without changing it.
+      APInt WideA = IsDottingWord ? OpA.sext(64) : OpA.zext(64);
+      APInt WideB = OpB.sext(64);
+      Acc += APSInt(WideA * WideB, /*isUnsigned=*/false);
+    }
+    // Narrow back to the 32-bit lane: wrap by default, clamp for the
+    // saturating forms.
+    APSInt Lane = Acc.trunc(32);
+    if (IsSaturating)
+      Lane = APSInt(Acc.truncSSat(32), /*isUnsigned=*/false);
+    INT_TYPE_SWITCH_NO_BOOL(SrcElemT,
+                            { Dst.elem<T>(I) = static_cast<T>(Lane); });
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -6049,6 +6104,29 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return EvalScalarMinMaxFp(A, B, RoundingMode, /*IsMin=*/false);
         },
         /*IsScalar=*/true);
+  // VNNI dot products. The helper takes (IsDottingWord, IsSaturating): the
+  // "w" forms multiply 16-bit words, the "b" forms multiply bytes, and a
+  // trailing "s" selects signed saturation of the 32-bit result.
+  case X86::BI__builtin_ia32_vpdpwssd128:
+  case X86::BI__builtin_ia32_vpdpwssd256:
+  case X86::BI__builtin_ia32_vpdpwssd512:
+    return interp__builtin_ia32_vpdp(S, OpPC, Call, /*IsDottingWord=*/true,
+                                     /*IsSaturating=*/false);
+  case X86::BI__builtin_ia32_vpdpwssds128:
+  case X86::BI__builtin_ia32_vpdpwssds256:
+  case X86::BI__builtin_ia32_vpdpwssds512:
+    return interp__builtin_ia32_vpdp(S, OpPC, Call, /*IsDottingWord=*/true,
+                                     /*IsSaturating=*/true);
+  case X86::BI__builtin_ia32_vpdpbusds128:
+  case X86::BI__builtin_ia32_vpdpbusds256:
+  case X86::BI__builtin_ia32_vpdpbusds512:
+    return interp__builtin_ia32_vpdp(S, OpPC, Call, /*IsDottingWord=*/false,
+                                     /*IsSaturating=*/true);
+  case X86::BI__builtin_ia32_vpdpbusd128:
+  case X86::BI__builtin_ia32_vpdpbusd256:
+  case X86::BI__builtin_ia32_vpdpbusd512:
+    return interp__builtin_ia32_vpdp(S, OpPC, Call, /*IsDottingWord=*/false,
+                                     /*IsSaturating=*/false);
 
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 4f45fa728c605..dfa8707f04285 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14634,6 +14634,80 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
       return false;
     return Success(R, E);
   }
+  case X86::BI__builtin_ia32_vpdpwssd128:
+  case X86::BI__builtin_ia32_vpdpwssd256:
+  case X86::BI__builtin_ia32_vpdpwssd512:
+  case X86::BI__builtin_ia32_vpdpwssds128:
+  case X86::BI__builtin_ia32_vpdpwssds256:
+  case X86::BI__builtin_ia32_vpdpwssds512:
+  case X86::BI__builtin_ia32_vpdpbusds128:
+  case X86::BI__builtin_ia32_vpdpbusds256:
+  case X86::BI__builtin_ia32_vpdpbusds512:
+  case X86::BI__builtin_ia32_vpdpbusd128:
+  case X86::BI__builtin_ia32_vpdpbusd256:
+  case X86::BI__builtin_ia32_vpdpbusd512: {
+    // VNNI dot product: each 32-bit lane of the result is the matching lane
+    // of argument 0 plus the products of the 2 (word forms) or 4 (byte forms)
+    // element pairs of arguments 1 and 2 that belong to that lane.
+    bool IsDottingWord = false; // Word forms (vpdpwss*); otherwise bytes.
+    bool IsSaturating = false;  // Saturating forms (vpdp*s).
+    switch (E->getBuiltinCallee()) {
+    case X86::BI__builtin_ia32_vpdpwssd128:
+    case X86::BI__builtin_ia32_vpdpwssd256:
+    case X86::BI__builtin_ia32_vpdpwssd512:
+      IsDottingWord = true;
+      break;
+    case X86::BI__builtin_ia32_vpdpwssds128:
+    case X86::BI__builtin_ia32_vpdpwssds256:
+    case X86::BI__builtin_ia32_vpdpwssds512:
+      IsDottingWord = true;
+      IsSaturating = true;
+      break;
+    case X86::BI__builtin_ia32_vpdpbusds128:
+    case X86::BI__builtin_ia32_vpdpbusds256:
+    case X86::BI__builtin_ia32_vpdpbusds512:
+      IsSaturating = true;
+      break;
+    default: // vpdpbusd*: byte operands, no saturation.
+      break;
+    }
+
+    APValue Source, OperandA, OperandB;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Source) ||
+        !EvaluateAsRValue(Info, E->getArg(1), OperandA) ||
+        !EvaluateAsRValue(Info, E->getArg(2), OperandB))
+      return false;
+
+    unsigned NumElements = Source.getVectorLength();
+    unsigned Iters = IsDottingWord ? 2 : 4;
+
+    SmallVector<APValue, 16> Result;
+    Result.reserve(NumElements);
+    for (unsigned I = 0; I < NumElements; ++I) {
+      // Always accumulate as a signed 64-bit value: APInt arithmetic asserts
+      // on mismatched bit widths and APSInt arithmetic on mismatched
+      // signedness, and 64 bits cannot overflow for these 8/16-bit operands.
+      APSInt Acc(Source.getVectorElt(I).getInt().sext(64),
+                 /*isUnsigned=*/false);
+      for (unsigned J = 0; J < Iters; ++J) {
+        const APSInt &EltA = OperandA.getVectorElt(Iters * I + J).getInt();
+        const APSInt &EltB = OperandB.getVectorElt(Iters * I + J).getInt();
+        // Bytes of A are unsigned; a zero-extended byte is non-negative, so
+        // it can be multiplied as a signed 64-bit value without changing it.
+        APInt WideA = IsDottingWord ? EltA.sext(64) : EltA.zext(64);
+        APInt WideB = EltB.sext(64);
+        Acc += APSInt(WideA * WideB, /*isUnsigned=*/false);
+      }
+      // Narrow back to the 32-bit lane: wrap by default, clamp for the
+      // saturating forms.
+      APSInt Lane = Acc.trunc(32);
+      if (IsSaturating)
+        Lane = APSInt(Acc.truncSSat(32), /*isUnsigned=*/false);
+      Result.push_back(APValue(Lane));
+    }
+
+    return Success(APValue(Result.data(), Result.size()), E);
+  }
   }
 }
 
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index 4b8a199af32e5..34d6abd4bbff8 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -24,6 +24,14 @@
                  __target__("avx512vl,avx512vnni"),                            \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
@@ -179,7 +187,7 @@
 #define _mm_dpwssds_epi32(S, A, B)                                             \
   ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -187,7 +195,7 @@ _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                      (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -195,7 +203,7 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -203,7 +211,7 @@ _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                     (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -211,7 +219,7 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -219,7 +227,7 @@ _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                      (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -227,7 +235,7 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -235,7 +243,7 @@ _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
                                     (__v8si)__S);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -243,7 +251,7 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -251,7 +259,7 @@ _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                         (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -259,7 +267,7 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -267,7 +275,7 @@ _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                        (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -275,7 +283,7 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                        (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -283,7 +291,7 @@ _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                         (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -291,7 +299,7 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -299,7 +307,7 @@ _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
                                        (__v4si)__S);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectd_128(__U,
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index 2ce88efe4a04f..25ea12ed51c7a 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -19,14 +19,20 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"),     \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qu)__A,
                                              (__v64qi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -34,7 +40,7 @@ _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                     (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -42,14 +48,14 @@ _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                     (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v64qu)__A,
                                               (__v64qi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -57,7 +63,7 @@ _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                    (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -65,14 +71,14 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                    (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
                                              (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -80,7 +86,7 @@ _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                     (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -88,14 +94,14 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
                                     (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
                                               (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -103,7 +109,7 @@ _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
                                    (__v16si)__S);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectd_512(__U,
@@ -112,5 +118,6 @@ _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index 1d2e8c906effc..16df0f2def108 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -43,6 +43,14 @@
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#else
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#endif
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
 /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
@@ -60,7 +68,7 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v32qu)__A,
@@ -84,7 +92,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v32qu)__A,
@@ -106,7 +114,7 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A,
@@ -128,7 +136,7 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A,
@@ -152,7 +160,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v16qu)__A,
@@ -176,7 +184,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v16qu)__A,
@@ -198,7 +206,7 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A,
@@ -220,7 +228,7 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A,
diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index 11dbd717a9f77..ae068073c8f8e 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -3,7 +3,13 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s
 
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpbusd_epi32
@@ -11,6 +17,14 @@ __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B);
 }
+// mask 0x55: odd lanes keep src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpbusd_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0x55,
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400, 504, 600, 704, 800));
 
 __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpbusd_epi32
@@ -18,12 +32,48 @@ __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: first 4 lanes updated, last 4 zeroed
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpbusd_epi32(
+    (__mmask8)0x0F,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 0, 0, 0, 0));
 
 __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// accumulation: src=10, dot=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  14, 14, 14, 14, 14, 14, 14, 14));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpbusds_epi32
@@ -31,6 +81,14 @@ __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAA: odd lanes (1,3,5,7) updated, even lanes keep src
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpbusds_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xAA,
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 204, 300, 404, 500, 604, 700, 804));
 
 __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpbusds_epi32
@@ -38,12 +96,41 @@ __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFF: all lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpbusds_epi32(
+    (__mmask8)0xFF,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
 
 __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssd_epi32
@@ -51,6 +138,14 @@ __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0xF0: last 4 lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpwssd_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xF0,
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 200, 300, 400, 502, 602, 702, 802));
 
 __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32
@@ -58,12 +153,55 @@ __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: first 4 lanes updated, last 4 zeroed
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpwssd_epi32(
+    (__mmask8)0x0F,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 0, 0, 0, 0));
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  12, 12, 12, 12, 12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssds_epi32
@@ -71,6 +209,14 @@ __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAA: odd lanes updated
+TEST_CONSTEXPR(match_v8si(
+  _mm256_mask_dpwssds_epi32(
+    (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800},
+    (__mmask8)0xAA,
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402, 500, 602, 700, 802));
 
 __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32
@@ -78,12 +224,41 @@ __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFF: all lanes
+TEST_CONSTEXPR(match_v8si(
+  _mm256_maskz_dpwssds_epi32(
+    (__mmask8)0xFF,
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m256i)(__v16hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpbusd_epi32
@@ -91,6 +266,14 @@ __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpbusd_epi32(__S, __U, __A, __B);
 }
+// mask 0x05: lanes 0,2 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpbusd_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x05,
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400));
 
 __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpbusd_epi32
@@ -98,12 +281,48 @@ __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x03: first 2 lanes updated, last 2 zeroed
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpbusd_epi32(
+    (__mmask8)0x03,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 0, 0));
 
 __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// accumulation: src=10, dot=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){10, 10, 10, 10}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  14, 14, 14, 14));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpbusds_epi32
@@ -111,6 +330,14 @@ __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0x0A: lanes 1,3 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpbusds_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x0A,
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 204, 300, 404));
 
 __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpbusds_epi32
@@ -118,12 +345,41 @@ __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: all 4 lanes
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpbusds_epi32(
+    (__mmask8)0x0F,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
 
 __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssd_epi32
@@ -131,6 +387,14 @@ __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0x05: lanes 0,2 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpwssd_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x05,
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  102, 200, 302, 400));
 
 __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssd_epi32
@@ -138,12 +402,55 @@ __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x03: first 2 lanes
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpwssd_epi32(
+    (__mmask8)0x03,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 0, 0));
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){10, 10, 10, 10}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssds_epi32
@@ -151,6 +458,14 @@ __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0x0A: lanes 1,3 updated
+TEST_CONSTEXPR(match_v4si(
+  _mm_mask_dpwssds_epi32(
+    (__m128i)(__v4si){100, 200, 300, 400},
+    (__mmask8)0x0A,
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402));
 
 __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssds_epi32
@@ -158,10 +473,39 @@ __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x0F: all 4 lanes
+TEST_CONSTEXPR(match_v4si(
+  _mm_maskz_dpwssds_epi32(
+    (__mmask8)0x0F,
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// saturate to INT32_MAX
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}),
+    ((__m128i)(__v8hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index 6b8465206eedb..a0e35039a135e 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -3,7 +3,13 @@
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 //  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 
+//  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+//  RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+//  RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpbusd_epi32
@@ -25,12 +31,49 @@ __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   return _mm512_dpbusd_epi32(__S, __A, __B);
 }
 
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4));
+
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5));
+
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  (__m512i)(__v64qu){1,2,3,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  (__m512i)(__v64qi){5,6,7,8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  72, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5));
+
+// signedness: A=255 (u8, zero-extended), B=-1 (i8, sign-extended) -> 255*(-1)*4 = -1020 (replaces an exact duplicate of the previous vector)
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, (__m512i)(__v64qu){255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255},
+  (__m512i)(__v64qi){-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+
+// two's complement wrap: INT32_MAX + 1 wraps to INT32_MIN (written -2147483647-1 because the literal 2147483648 does not fit in int)
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusd_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v64qu){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0},
+    (__m512i)(__v64qi){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1,
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
+
 __m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpbusds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B);
 }
+// mask 0x5555: even lanes updated, odd lanes keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpbusds_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0x5555,
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  104, 200, 304, 400, 504, 600, 704, 800, 904, 1000, 1104, 1200, 1304, 1400, 1504, 1600));
 
 __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpbusds_epi32
@@ -38,12 +81,41 @@ __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B);
 }
+// maskz 0x00FF: first 8 lanes updated, last 8 zeroed
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpbusds_epi32(
+    (__mmask16)0x00FF,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpbusds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=4
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN: src=MIN, dot=255*(-1)*4=-1020
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpbusds_epi32(
+    (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1},
+    (__m512i)(__v64qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m512i)(__v64qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssd_epi32
@@ -51,6 +123,14 @@ __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
+// mask 0xFF00: last 8 lanes updated, first 8 keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpwssd_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0xFF00,
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 200, 300, 400, 500, 600, 700, 800, 902, 1002, 1102, 1202, 1302, 1402, 1502, 1602));
 
 __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32
@@ -58,12 +138,55 @@ __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, _
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
+// maskz 0x000F: first 4 lanes updated, rest zeroed
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpwssd_epi32(
+    (__mmask16)0x000F,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssd_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
+// accumulation: src=10, dot=2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssd_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0},
+    (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssds_epi32
@@ -71,6 +194,14 @@ __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, _
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
+// mask 0xAAAA: odd lanes updated, even lanes keep src
+TEST_CONSTEXPR(match_v16si(
+  _mm512_mask_dpwssds_epi32(
+    (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600},
+    (__mmask16)0xAAAA,
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  100, 202, 300, 402, 500, 602, 700, 802, 900, 1002, 1100, 1202, 1300, 1402, 1500, 1602));
 
 __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32
@@ -78,10 +209,39 @@ __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
+// maskz 0xFFFF: all lanes updated
+TEST_CONSTEXPR(match_v16si(
+  _mm512_maskz_dpwssds_epi32(
+    (__mmask16)0xFFFF,
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
 
 __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssds_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
+// saturate to INT32_MIN: src=MIN, dot=(-32768)*32767*2=-2147418112
+TEST_CONSTEXPR(match_v16si(
+  _mm512_dpwssds_epi32(
+    (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1},
+    (__m512i)(__v32hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768},
+    (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index 6557a26807eb2..0a12a43e6cee7 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -3,100 +3,386 @@
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
 
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+    ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_epi32(
+    ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}),
+    ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+    ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+    ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){0, 0, 0, 0}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+    ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})),
+  2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_epi32(
+    ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+    ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusd_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusd_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0},
+    (__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpbusds_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_dpbusds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4, 4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpbusds_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// signedness: (-1)*1 + (-1)*1 = -2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  -2, -2, -2, -2, -2, -2, -2, -2));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssd_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0},
+    (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_avx_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_avx_epi32(
+    (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2, 2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v8si(
+  _mm256_dpwssds_avx_epi32(
+    (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647},
+    (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusd_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
+// signedness: A=255(u8), B=-1(i8) -> 255*(-1)*4 = -1020
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255},
+    (__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}),
+  -1020, -1020, -1020, -1020));
+// overflow wraps: src=MAX, dot=1*1=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusd_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0},
+    (__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpbusds_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_dpbusds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1*4 = 4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  4, 4, 4, 4));
+// saturate to INT32_MAX: src=MAX, dot=1*1*4=4
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpbusds_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+    (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}),
+  2147483647, 2147483647, 2147483647, 2147483647));
 
 __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
+// large values: 32767*32767 + 32767*32767 = 2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147352578, 2147352578, 2147352578, 2147352578));
+// overflow wraps: src=MAX, dot=1*1+0*0=1 -> MAX+1 wraps to MIN
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssd_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v8hi){1,0,1,0,1,0,1,0},
+    (__m128i)(__v8hi){1,0,1,0,1,0,1,0}),
+  -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1));
 
 __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_avx_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_avx_epi32(__S, __A, __B);
 }
+// basic: 1*1 + 1*1 = 2
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_avx_epi32(
+    (__m128i)(__v4si){0, 0, 0, 0},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1},
+    (__m128i)(__v8hi){1,1,1,1,1,1,1,1}),
+  2, 2, 2, 2));
+// saturate to INT32_MAX: src=MAX, dot=32767*32767*2=2147352578
+TEST_CONSTEXPR(match_v4si(
+  _mm_dpwssds_avx_epi32(
+    (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767},
+    (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}),
+  2147483647, 2147483647, 2147483647, 2147483647));



More information about the cfe-commits mailing list