[clang] [llvm] [X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (PR #101783)

Phoebe Wang via cfe-commits cfe-commits at lists.llvm.org
Fri Aug 2 19:28:04 PDT 2024


https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/101783

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965

From 76043bc860ef0b6aa71d22662beed88891554d63 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Thu, 1 Aug 2024 11:23:34 +0800
Subject: [PATCH] [X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new
 instructions

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
---
 clang/include/clang/Basic/BuiltinsX86.def     |   69 +-
 clang/lib/Headers/avx10_2_512niintrin.h       |  279 ++++
 clang/lib/Headers/avx10_2niintrin.h           |  369 +++++
 clang/lib/Headers/avxvnniint16intrin.h        |  113 +-
 clang/lib/Headers/avxvnniint8intrin.h         |  113 +-
 .../test/CodeGen/X86/avx10_2_512ni-builtins.c |  276 ++++
 clang/test/CodeGen/X86/avx10_2ni-builtins.c   |  381 +++++
 .../test/CodeGen/X86/avxvnniint16-builtins.c  |    2 +
 clang/test/CodeGen/X86/avxvnniint8-builtins.c |    2 +
 llvm/include/llvm/IR/IntrinsicsX86.td         |   79 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |    7 +
 llvm/lib/Target/X86/X86ISelLowering.h         |   10 +-
 llvm/lib/Target/X86/X86InstrAVX10.td          |   34 +
 llvm/lib/Target/X86/X86InstrAVX512.td         |   59 +-
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  |   12 +
 llvm/lib/Target/X86/X86InstrInfo.cpp          |   54 +
 llvm/lib/Target/X86/X86InstrSSE.td            |   79 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h       |   33 +
 .../CodeGen/X86/avx10_2_512ni-intrinsics.ll   |  387 ++++-
 llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll |  563 +++++++
 .../CodeGen/X86/avxvnniint16-intrinsics.ll    |   62 +
 .../CodeGen/X86/avxvnniint8-intrinsics.ll     |  206 +++
 .../test/MC/Disassembler/X86/avx10_2ni-32.txt | 1410 +++++++++++++++++
 .../test/MC/Disassembler/X86/avx10_2ni-64.txt | 1410 +++++++++++++++++
 llvm/test/MC/X86/avx10_2ni-32-intel.s         | 1410 +++++++++++++++++
 llvm/test/MC/X86/avx10_2ni-64-att.s           | 1410 +++++++++++++++++
 llvm/test/TableGen/x86-fold-tables.inc        |  234 +++
 .../utils/TableGen/X86InstrMappingEmitter.cpp |    3 +-
 28 files changed, 8815 insertions(+), 251 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index f028711a807c0..3ea196d949d2d 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -773,18 +773,18 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512v
 TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
 
 // AVX-VNNI-INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
 
 TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
@@ -1959,6 +1959,27 @@ TARGET_HEADER_BUILTIN(__readgsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
 TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 
+// AVX10.2 VNNI FP16
+TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 VNNI INT8
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 VNNI INT16
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+
 // AVX10.2 VMPSADBW
 TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
 
@@ -1968,18 +1989,18 @@ TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx
 TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
 
 // AVX-VNNI-INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
 
 // AVX-NE-CONVERT
 TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index 5ad6993b45433..7e614f7740bff 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -16,6 +16,35 @@
 #ifndef __AVX10_2_512NIINTRIN_H
 #define __AVX10_2_512NIINTRIN_H
 
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"),    \
+                 __min_vector_width__(512)))
+
+/* VNNI FP16 */
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W,
+                                                           __m512h __A,
+                                                           __m512h __B) {
+  return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
+                                           (__v32hf)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W,
+                                                                __mmask16 __U,
+                                                                __m512h __A,
+                                                                __m512h __B) {
+  return (__m512)__builtin_ia32_selectps_512(
+      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
+                                                                 __m512 __W,
+                                                                 __m512h __A,
+                                                                 __m512h __B) {
+  return (__m512)__builtin_ia32_selectps_512(
+      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
+      (__v16sf)_mm512_setzero_ps());
+}
+
 /* VMPSADBW */
 #define _mm512_mpsadbw_epu8(A, B, imm)                                         \
   ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A),                   \
@@ -31,5 +60,255 @@
       (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)),           \
       (__v32hi)_mm512_setzero_si512()))
 
+/* VNNI INT8 */
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
+                                                                 __m512i __A,
+                                                                 __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
+                                                                  __m512i __A,
+                                                                  __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
+                                              (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
+    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
+                                                                 __m512i __A,
+                                                                 __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
+                                                                  __m512i __A,
+                                                                  __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
+                                              (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
+    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
+                                                                 __m512i __A,
+                                                                 __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
+                                                                  __m512i __A,
+                                                                  __m512i __B) {
+  return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
+                                              (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
+    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
+    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
+      (__v16si)_mm512_setzero_si512());
+}
+
+/* VNNI INT16 (NOTE(review): maskz args are A,U,B,C vs U,W,A,B for INT8) */
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
+                                                                 __m512i __B,
+                                                                 __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
+                                             (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
+                                                                  __m512i __B,
+                                                                  __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
+                                              (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
+                                                                 __m512i __B,
+                                                                 __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
+                                             (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
+                                                                  __m512i __B,
+                                                                  __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
+                                              (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
+                                                                 __m512i __B,
+                                                                 __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
+                                             (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
+                                                                  __m512i __B,
+                                                                  __m512i __C) {
+  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
+                                              (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
+      (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
+    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
+      (__v16si)_mm512_setzero_si512());
+}
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __SSE2__ */
 #endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
index 3527e0eaf5c89..07edba0e4ad1f 100644
--- a/clang/lib/Headers/avx10_2niintrin.h
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -15,6 +15,58 @@
 #ifndef __AVX10_2NIINTRIN_H
 #define __AVX10_2NIINTRIN_H
 
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
+                 __min_vector_width__(256)))
+
+/* VNNI FP16 */
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
+                                                           __m128h __A,
+                                                           __m128h __B) {
+  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
+                                           (__v8hf)__B);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
+                                                                __mmask8 __U,
+                                                                __m128h __A,
+                                                                __m128h __B) {
+  return (__m128)__builtin_ia32_selectps_128(
+      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
+                                                                 __m128 __W,
+                                                                 __m128h __A,
+                                                                 __m128h __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
+                                                              __m256h __A,
+                                                              __m256h __B) {
+  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
+                                           (__v16hf)__B);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
+  return (__m256)__builtin_ia32_selectps_256(
+      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
+      (__v8sf)_mm256_setzero_ps());
+}
+
 /* VMPSADBW */
 #define _mm_mask_mpsadbw_epu8(W, U, A, B, imm)                                 \
   ((__m128i)__builtin_ia32_selectw_128(                                        \
@@ -36,6 +88,320 @@
       (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
       (__v16hi)_mm256_setzero_si256()))
 
+/* VNNI INT8 */
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
+      (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
+      (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
+      (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
+    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
+      (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
+      (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
+      (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
+      (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
+    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
+      (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
+      (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
+      (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+  __v4si __res = (__v4si)_mm_dpbuuds_epi32(__W, __A, __B); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  __v8si __res = (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
+    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+  __v8si __res = (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+/* VNNI INT16. NOTE(review): maskz forms take __U 2nd (INT8: 1st); confirm */
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwsud_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwsud_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwsud_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwsud_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwsuds_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwsuds_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
+    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwusd_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwusd_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwusd_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwusd_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwusds_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwusds_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwusds_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
+    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwusds_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwuud_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwuud_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwuud_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwuud_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwuuds_epi32(__A, __B, __C); /* unmasked */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  __v4si __res = (__v4si)_mm_dpwuuds_epi32(__A, __B, __C); /* unmasked */
+  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero-masking operand */
+  return (__m128i)__builtin_ia32_selectd_128(__U, __res, __zero);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C); /* unmasked */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
+    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  __v8si __res = (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C); /* unmasked */
+  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero-masking operand */
+  return (__m256i)__builtin_ia32_selectd_256(__U, __res, __zero);
+}
+
 /* YMM Rounding */
 #define _mm256_add_round_pd(A, B, R)                                           \
   ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A),               \
@@ -79,5 +445,8 @@
       (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)),               \
       (__v8sf)_mm256_setzero_ps()))
 
+#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128
+
 #endif /* __AVX10_2NIINTRIN_H */
 #endif /* __SSE2__ */
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
index e4d342a8b45b1..805d249911c17 100644
--- a/clang/lib/Headers/avxvnniint16intrin.h
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -15,14 +15,6 @@
 #ifndef __AVXVNNIINT16INTRIN_H
 #define __AVXVNNIINT16INTRIN_H
 
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
-                 __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
-                 __min_vector_width__(256)))
-
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 ///    signed 16-bit results. Sum these 2 results with the corresponding
@@ -53,12 +45,9 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpwsud_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
+#define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpwusd_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -275,12 +252,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpwusds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
+#define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpwuud_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
+#define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 #endif // __AVXVNNIINT16INTRIN_H
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
index b0b6cb853f713..c211620c68f07 100644
--- a/clang/lib/Headers/avxvnniint8intrin.h
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -14,14 +14,6 @@
 #ifndef __AVXVNNIINT8INTRIN_H
 #define __AVXVNNIINT8INTRIN_H
 
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
-                 __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
-                 __min_vector_width__(128)))
-
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 ///    signed 16-bit results. Sum these 4 results with the corresponding
@@ -52,12 +44,9 @@
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpbssd_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpbssds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
+#define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpbsud_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
+#define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
-                                                                 __m128i __A,
-                                                                 __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
-                                             (__v4si)__B);
-}
+#define _mm_dpbuud_epi32(__W, __A, __B)                                        \
+  ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
+                                       (__v4si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
-                                             (__v8si)__B);
-}
+#define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
+  ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
+                                       (__v8si)(__B))) /* args used once */
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -422,14 +389,11 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
-                                                                  __m128i __A,
-                                                                  __m128i __B) {
-  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
-                                              (__v4si)__B);
-}
+#define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
+  ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
+                                        (__v4si)(__B))) /* args used once */
 
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
 ///    signed 16-bit results. Sum these 4 results with the corresponding
 ///    32-bit integer in \a __W with signed saturation, and store the packed
@@ -460,12 +424,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
-                                              (__v8si)__B);
-}
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
+#define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
+  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
+                                        (__v8si)(__B))) /* args used once */
 
 #endif // __AVXVNNIINT8INTRIN_H
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
index b7982e6ecca84..26e0d124c8284 100644
--- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -3,6 +3,28 @@
 
 #include <immintrin.h>
 
+// VNNI FP16
+__m512 test_mm512_dpph_ps(__m512 __W, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+  return _mm512_dpph_ps(__W, __A, __B); // unmasked form: only the intrinsic call is matched
+}
+
+__m512 test_mm512_mask_dpph_ps(__m512 __W, __mmask16 __U, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_mask_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_dpph_ps(__W, __U, __A, __B); // mask form: intrinsic call, then a <16 x i1> select
+}
+
+__m512 test_mm512_maskz_dpph_ps(__mmask16 __U, __m512 __W, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_dpph_ps(__U, __W, __A, __B); // maskz form: call, zeroinitializer, then select
+}
+
 // VMPSADBW
 __m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
 // CHECK-LABEL: @test_mm512_mpsadbw_epu8
@@ -23,3 +45,257 @@ __m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
 }
+
+// VNNI INT8
+__m512i test_mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+  return _mm512_dpbssd_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbssd_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+  return _mm512_dpbssds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbssds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbssds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+  return _mm512_dpbsud_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbsud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+  return _mm512_dpbsuds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbsuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbsuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+  return _mm512_dpbuud_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbuud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+  return _mm512_dpbuuds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbuuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+// VNNI INT16
+__m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwsud_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwusd_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwusds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwuud_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_dpwuuds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
index baf3a35a9a191..56785a3bc4e91 100644
--- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -3,6 +3,49 @@
 
 #include <immintrin.h>
 
+// VNNI FP16
+__m128 test_mm_dpph_ps(__m128 __W, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+  return _mm_dpph_ps(__W, __A, __B);
+}
+
+__m128 test_mm_mask_dpph_ps(__m128 __W, __mmask8 __U, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_mask_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_dpph_ps(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_dpph_ps(__mmask8 __U, __m128 __W, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_maskz_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+// CHECK: zeroinitializer
+// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_dpph_ps(__U, __W, __A, __B);
+}
+
+__m256 test_mm256_dpph_ps(__m256 __W, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+  return _mm256_dpph_ps(__W, __A, __B);
+}
+
+__m256 test_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_dpph_ps(__W, __U, __A, __B);
+}
+
+__m256 test_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+// CHECK: zeroinitializer
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_dpph_ps(__U, __W, __A, __B);
+}
+
 // VMPSADBW
 __m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
 // CHECK-LABEL: @test_mm_mpsadbw_epu8
@@ -44,6 +87,344 @@ __m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
   return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
 }
 
+// VNNI INT8
+__m128i test_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbssds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbsuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+// VNNI INT16
+__m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwsud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwsud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwsud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwsuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwusd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwusd_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwusds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwusds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwuud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwuud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwuuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
 // YMM Rounding
 __m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
 // CHECK-LABEL: @test_mm256_add_round_pd
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
index a10ca551a1514..f9feaea1e244d 100644
--- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 #include <immintrin.h>
 
diff --git a/clang/test/CodeGen/X86/avxvnniint8-builtins.c b/clang/test/CodeGen/X86/avxvnniint8-builtins.c
index cbdf443888a15..80d005c16d387 100644
--- a/clang/test/CodeGen/X86/avxvnniint8-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint8-builtins.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -ffreestanding %s -triple=i386-   -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-   -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 #include <immintrin.h>
 
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 515b0d0fcc22c..f257a8d825e59 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -4980,6 +4980,85 @@ let TargetPrefix = "x86" in {
 //===----------------------------------------------------------------------===//
 // AVX10.2 intrinsics
 let TargetPrefix = "x86" in {
+  // VNNI FP16
+  def int_x86_avx10_vdpphps_128 :
+      ClangBuiltin<"__builtin_ia32_vdpphps128">,
+      DefaultAttrsIntrinsic<[llvm_v4f32_ty],
+                            [llvm_v4f32_ty, llvm_v8f16_ty, llvm_v8f16_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vdpphps_256 :
+      ClangBuiltin<"__builtin_ia32_vdpphps256">,
+      DefaultAttrsIntrinsic<[llvm_v8f32_ty],
+                            [llvm_v8f32_ty, llvm_v16f16_ty, llvm_v16f16_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vdpphps_512 :
+      ClangBuiltin<"__builtin_ia32_vdpphps512">,
+      DefaultAttrsIntrinsic<[llvm_v16f32_ty],
+                            [llvm_v16f32_ty, llvm_v32f16_ty, llvm_v32f16_ty],
+                            [IntrNoMem]>;
+  // VNNI INT8
+  def int_x86_avx10_vpdpbssd_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbssd512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpbssds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbssds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpbsud_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbsud512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpbsuds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbsuds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpbuud_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbuud512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpbuuds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpbuuds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  // VNNI INT16
+  def int_x86_avx10_vpdpwsud_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwsud512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpwsuds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwsuds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpwusd_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwusd512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpwusds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwusds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpwuud_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwuud512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+  def int_x86_avx10_vpdpwuuds_512 :
+      ClangBuiltin<"__builtin_ia32_vpdpwuuds512">,
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [IntrNoMem]>;
+
   // VMPSADBW
   def int_x86_avx10_vmpsadbw_512 :
       ClangBuiltin<"__builtin_ia32_mpsadbw512">,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9fafb66ab0b3f..48f9cc7ab760b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34033,6 +34033,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVTNEPS2BF16)
   NODE_NAME_CASE(MCVTNEPS2BF16)
   NODE_NAME_CASE(DPBF16PS)
+  NODE_NAME_CASE(DPFP16PS)
   NODE_NAME_CASE(MPSADBW)
   NODE_NAME_CASE(LWPINS)
   NODE_NAME_CASE(MGATHER)
@@ -34058,6 +34059,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VPDPBUUDS)
   NODE_NAME_CASE(VPDPBSSD)
   NODE_NAME_CASE(VPDPBSSDS)
+  NODE_NAME_CASE(VPDPWSUD)
+  NODE_NAME_CASE(VPDPWSUDS)
+  NODE_NAME_CASE(VPDPWUSD)
+  NODE_NAME_CASE(VPDPWUSDS)
+  NODE_NAME_CASE(VPDPWUUD)
+  NODE_NAME_CASE(VPDPWUUDS)
   NODE_NAME_CASE(AESENC128KL)
   NODE_NAME_CASE(AESDEC128KL)
   NODE_NAME_CASE(AESENC256KL)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4fd320885d608..f0bfbda1c3084 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -595,6 +595,13 @@ namespace llvm {
     VPDPBSSD,
     VPDPBSSDS,
 
+    VPDPWSUD,
+    VPDPWSUDS,
+    VPDPWUSD,
+    VPDPWUSDS,
+    VPDPWUUD,
+    VPDPWUUDS,
+
     MPSADBW,
 
     // Compress and expand.
@@ -656,9 +663,10 @@ namespace llvm {
     // SRC, PASSTHRU, MASK
     MCVTNEPS2BF16,
 
-    // Dot product of BF16 pairs to accumulated into
+    // Dot product of BF16/FP16 pairs accumulated into
     // packed single precision.
     DPBF16PS,
+    DPFP16PS,
 
     // A stack checking function call. On Windows it's _chkstk call.
     DYN_ALLOCA,
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 666667895bc39..7a2a9cb2ba305 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -12,6 +12,40 @@
 //
 //===----------------------------------------------------------------------===//
 
+// VNNI FP16
+let ExeDomain = SSEPackedSingle in
+defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info,
+                                    [HasAVX10_2], [HasAVX10_2_512]>,
+                    T8, PS, EVEX_CD8<32, CD8VF>;
+
+// VNNI INT8
+defm VPDPBSSD   : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XD;
+defm VPDPBSSDS  : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XD;
+defm VPDPBSUD   : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPBSUDS  : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPBUUD   : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+defm VPDPBUUDS  : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+
+// VNNI INT16
+defm VPDPWSUD   : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPWSUDS  : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPWUSD   : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PD;
+defm VPDPWUSDS  : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PD;
+defm VPDPWUUD   : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+defm VPDPWUUDS  : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1,
+                              [HasAVX10_2], [HasAVX10_2_512]>, PS;
+
 // VMPSADBW
 defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW,
                                           avx512vl_i16_info, avx512vl_i8_info,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index da690aea43f5c..d996cee7918d9 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12390,13 +12390,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (VTI.VT (OpNode VTI.RC:$src1,
                                             VTI.RC:$src2, VTI.RC:$src3)),
                                    IsCommutable, IsCommutable>,
-                                   EVEX, VVVV, T8, PD, Sched<[sched]>;
+                                   EVEX, VVVV, T8, Sched<[sched]>;
   defm m  :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
                                    (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                                    "$src3, $src2", "$src2, $src3",
                                    (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
                                             (VTI.VT (VTI.LdFrag addr:$src3))))>,
-                                   EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, PD,
+                                   EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8,
                                    Sched<[sched.Folded, sched.ReadAfterFold,
                                           sched.ReadAfterFold]>;
   defm mb :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -12406,17 +12406,18 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (OpNode VTI.RC:$src1, VTI.RC:$src2,
                                     (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
                                    EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B,
-                                   T8, PD, Sched<[sched.Folded, sched.ReadAfterFold,
+                                   T8, Sched<[sched.Folded, sched.ReadAfterFold,
                                                 sched.ReadAfterFold]>;
   }
 }
 
 multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
-                       X86SchedWriteWidths sched, bit IsCommutable> {
-  let Predicates = [HasVNNI] in
+                       X86SchedWriteWidths sched, bit IsCommutable,
+                       list<Predicate> prds, list<Predicate> prds512> {
+  let Predicates = prds512 in
   defm Z      :   VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info,
                            IsCommutable>, EVEX_V512;
-  let Predicates = [HasVNNI, HasVLX] in {
+  let Predicates = prds in {
     defm Z256 :   VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info,
                            IsCommutable>, EVEX_V256;
     defm Z128 :   VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info,
@@ -12425,10 +12426,14 @@ multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
 }
 
 // FIXME: Is there a better scheduler class for VPDP?
-defm VPDPBUSD   : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>;
-defm VPDPBUSDS  : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>;
-defm VPDPWSSD   : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
-defm VPDPWSSDS  : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
+defm VPDPBUSD   : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0,
+                              [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPBUSDS  : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0,
+                              [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPWSSD   : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1,
+                              [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPWSSDS  : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1,
+                              [HasVNNI, HasVLX], [HasVNNI]>, PD;
 
 // Patterns to match VPDPWSSD from existing instructions/intrinsics.
 let Predicates = [HasVNNI] in {
@@ -12806,9 +12811,9 @@ let Predicates = [HasBF16] in {
 }
 
 let Constraints = "$src1 = $dst" in {
-multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                              X86FoldableSchedWrite sched,
-                              X86VectorVTInfo _, X86VectorVTInfo src_v> {
+multiclass avx512_dpf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                             X86FoldableSchedWrite sched,
+                             X86VectorVTInfo _, X86VectorVTInfo src_v> {
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins src_v.RC:$src2, src_v.RC:$src3),
                            OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -12834,25 +12839,25 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 }
 } // Constraints = "$src1 = $dst"
 
-multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                 X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
-                                 AVX512VLVectorVTInfo src_v, Predicate prd> {
-  let Predicates = [prd] in {
-    defm Z    : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512,
-                                   src_v.info512>, EVEX_V512;
+multiclass avx512_dpf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                AVX512VLVectorVTInfo _, list<Predicate> prds,
+                                list<Predicate> prds512> {
+  let Predicates = prds512 in {
+    defm Z    : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAZ,
+                                  avx512vl_f32_info.info512, _.info512>, EVEX_V512;
   }
-  let Predicates = [HasVLX, prd] in {
-    defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256,
-                                   src_v.info256>, EVEX_V256;
-    defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128,
-                                   src_v.info128>, EVEX_V128;
+  let Predicates = prds in {
+    defm Z256 : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAY,
+                                  v8f32x_info, _.info256>, EVEX_V256;
+    defm Z128 : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAX,
+                                  v4f32x_info, _.info128>, EVEX_V128;
   }
 }
 
 let ExeDomain = SSEPackedSingle in
-defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
-                                       avx512vl_f32_info, avx512vl_bf16_info,
-                                       HasBF16>, T8, XS, EVEX_CD8<32, CD8VF>;
+defm VDPBF16PS : avx512_dpf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, avx512vl_bf16_info,
+                                      [HasVLX, HasBF16], [HasBF16]>,
+                      T8, XS, EVEX_CD8<32, CD8VF>;
 
 //===----------------------------------------------------------------------===//
 // AVX512FP16
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 74596cec5c5ef..b83e3f24c6a63 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -792,6 +792,11 @@ def X86dpbf16ps :      SDNode<"X86ISD::DPBF16PS",
                                             SDTCisSameAs<0,1>,
                                             SDTCVecEltisVT<2, bf16>,
                                             SDTCisSameAs<2,3>]>>;
+def X86dpfp16ps :      SDNode<"X86ISD::DPFP16PS",
+                       SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+                                            SDTCisSameAs<0,1>,
+                                            SDTCVecEltisVT<2, f16>,
+                                            SDTCisSameAs<2,3>]>>;
 
 // galois field arithmetic
 def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
@@ -809,6 +814,13 @@ def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>;
 def X86vpdpbuud  : SDNode<"X86ISD::VPDPBUUD",  SDTVnni>;
 def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>;
 
+def X86vpdpwsud  : SDNode<"X86ISD::VPDPWSUD",  SDTVnni>;
+def X86vpdpwsuds : SDNode<"X86ISD::VPDPWSUDS", SDTVnni>;
+def X86vpdpwusd  : SDNode<"X86ISD::VPDPWUSD",  SDTVnni>;
+def X86vpdpwusds : SDNode<"X86ISD::VPDPWUSDS", SDTVnni>;
+def X86vpdpwuud  : SDNode<"X86ISD::VPDPWUUD",  SDTVnni>;
+def X86vpdpwuuds : SDNode<"X86ISD::VPDPWUUDS", SDTVnni>;
+
 def X86Vmpsadbw : SDNode<"X86ISD::MPSADBW", SDTX86PSADBW>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7fc786b1e570b..7e2e97d387a83 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2953,6 +2953,42 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case X86::VPDPBUUDSYrr:
   case X86::VPDPBUUDrr:
   case X86::VPDPBUUDYrr:
+  case X86::VPDPBSSDSZ128r:
+  case X86::VPDPBSSDSZ128rk:
+  case X86::VPDPBSSDSZ128rkz:
+  case X86::VPDPBSSDSZ256r:
+  case X86::VPDPBSSDSZ256rk:
+  case X86::VPDPBSSDSZ256rkz:
+  case X86::VPDPBSSDSZr:
+  case X86::VPDPBSSDSZrk:
+  case X86::VPDPBSSDSZrkz:
+  case X86::VPDPBSSDZ128r:
+  case X86::VPDPBSSDZ128rk:
+  case X86::VPDPBSSDZ128rkz:
+  case X86::VPDPBSSDZ256r:
+  case X86::VPDPBSSDZ256rk:
+  case X86::VPDPBSSDZ256rkz:
+  case X86::VPDPBSSDZr:
+  case X86::VPDPBSSDZrk:
+  case X86::VPDPBSSDZrkz:
+  case X86::VPDPBUUDSZ128r:
+  case X86::VPDPBUUDSZ128rk:
+  case X86::VPDPBUUDSZ128rkz:
+  case X86::VPDPBUUDSZ256r:
+  case X86::VPDPBUUDSZ256rk:
+  case X86::VPDPBUUDSZ256rkz:
+  case X86::VPDPBUUDSZr:
+  case X86::VPDPBUUDSZrk:
+  case X86::VPDPBUUDSZrkz:
+  case X86::VPDPBUUDZ128r:
+  case X86::VPDPBUUDZ128rk:
+  case X86::VPDPBUUDZ128rkz:
+  case X86::VPDPBUUDZ256r:
+  case X86::VPDPBUUDZ256rk:
+  case X86::VPDPBUUDZ256rkz:
+  case X86::VPDPBUUDZr:
+  case X86::VPDPBUUDZrk:
+  case X86::VPDPBUUDZrkz:
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128rk:
   case X86::VPDPWSSDZ128rkz:
@@ -2971,6 +3007,24 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case X86::VPDPWSSDSZr:
   case X86::VPDPWSSDSZrk:
   case X86::VPDPWSSDSZrkz:
+  case X86::VPDPWUUDZ128r:
+  case X86::VPDPWUUDZ128rk:
+  case X86::VPDPWUUDZ128rkz:
+  case X86::VPDPWUUDZ256r:
+  case X86::VPDPWUUDZ256rk:
+  case X86::VPDPWUUDZ256rkz:
+  case X86::VPDPWUUDZr:
+  case X86::VPDPWUUDZrk:
+  case X86::VPDPWUUDZrkz:
+  case X86::VPDPWUUDSZ128r:
+  case X86::VPDPWUUDSZ128rk:
+  case X86::VPDPWUUDSZ128rkz:
+  case X86::VPDPWUUDSZ256r:
+  case X86::VPDPWUUDSZ256rk:
+  case X86::VPDPWUUDSZ256rkz:
+  case X86::VPDPWUUDSZr:
+  case X86::VPDPWUUDSZrk:
+  case X86::VPDPWUUDSZrkz:
   case X86::VPMADD52HUQrr:
   case X86::VPMADD52HUQYrr:
   case X86::VPMADD52HUQZ128r:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 2fc3b6aa98858..5f9211edfa161 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8425,46 +8425,41 @@ defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS,
 defm VSM4RNDS4  : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV;
 defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV;
 
-let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
-multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
-  let isCommutable = IsCommutable in
-  def rr  : I<opc, MRMSrcReg, (outs VR128:$dst),
-              (ins VR128:$src1, VR128:$src2, VR128:$src3),
-              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-              [(set VR128:$dst,
-                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
-                        VR128:$src1, VR128:$src2, VR128:$src3)))]>,
-              VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
-
-  def rm  : I<opc, MRMSrcMem, (outs VR128:$dst),
-              (ins VR128:$src1, VR128:$src2, i128mem:$src3),
-              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-              [(set VR128:$dst,
-                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
-                        VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
-              VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
-
-  let isCommutable = IsCommutable in
-  def Yrr  : I<opc, MRMSrcReg, (outs VR256:$dst),
-               (ins VR256:$src1, VR256:$src2, VR256:$src3),
-               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-               [(set VR256:$dst,
-                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
-                         VR256:$src1, VR256:$src2, VR256:$src3)))]>,
-               VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
-
-  def Yrm  : I<opc, MRMSrcMem, (outs VR256:$dst),
-               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
-               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-               [(set VR256:$dst,
-                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
-                         VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
-               VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+let Predicates = [HasAVXVNNIINT16] in {
+  defm VPDPWSUD   : avx_dotprod_rm<0xd2,"vpdpwsud", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwsud, SchedWriteVecIMul.XMM,
+                                   0>, T8, XS;
+  defm VPDPWSUDY  : avx_dotprod_rm<0xd2,"vpdpwsud", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwsud, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8, XS;
+  defm VPDPWSUDS  : avx_dotprod_rm<0xd3,"vpdpwsuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwsuds, SchedWriteVecIMul.XMM,
+                                   0>, T8, XS;
+  defm VPDPWSUDSY : avx_dotprod_rm<0xd3,"vpdpwsuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwsuds, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8, XS;
+  defm VPDPWUSD   : avx_dotprod_rm<0xd2,"vpdpwusd", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwusd, SchedWriteVecIMul.XMM,
+                                   0>, T8, PD;
+  defm VPDPWUSDY  : avx_dotprod_rm<0xd2,"vpdpwusd", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwusd, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8, PD;
+  defm VPDPWUSDS  : avx_dotprod_rm<0xd3,"vpdpwusds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwusds, SchedWriteVecIMul.XMM,
+                                   0>, T8, PD;
+  defm VPDPWUSDSY : avx_dotprod_rm<0xd3,"vpdpwusds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwusds, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8, PD;
+  defm VPDPWUUD   : avx_dotprod_rm<0xd2,"vpdpwuud", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwuud, SchedWriteVecIMul.XMM,
+                                   1>, T8;
+  defm VPDPWUUDY  : avx_dotprod_rm<0xd2,"vpdpwuud", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwuud, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8;
+  defm VPDPWUUDS  : avx_dotprod_rm<0xd3,"vpdpwuuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpwuuds, SchedWriteVecIMul.XMM,
+                                   1>, T8;
+  defm VPDPWUUDSY : avx_dotprod_rm<0xd3,"vpdpwuuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpwuuds, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8;
 }
-
-defm VPDPWSUD   : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS;
-defm VPDPWSUDS  : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS;
-defm VPDPWUSD   : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD;
-defm VPDPWUSDS  : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD;
-defm VPDPWUUD   : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8;
-defm VPDPWUUDS  : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 000138e1837af..defbba937fe47 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -394,8 +394,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                        X86ISD::FADD_RND),
     X86_INTRINSIC_DATA(avx10_vaddps256, INTR_TYPE_2OP, ISD::FADD,
                        X86ISD::FADD_RND),
+    X86_INTRINSIC_DATA(avx10_vdpphps_128, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
+    X86_INTRINSIC_DATA(avx10_vdpphps_256, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
+    X86_INTRINSIC_DATA(avx10_vdpphps_512, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
     X86_INTRINSIC_DATA(avx10_vmpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::MPSADBW,
                        0),
+    X86_INTRINSIC_DATA(avx10_vpdpbssd_512, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpbssds_512, INTR_TYPE_3OP, X86ISD::VPDPBSSDS,
+                       0),
+    X86_INTRINSIC_DATA(avx10_vpdpbsud_512, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpbsuds_512, INTR_TYPE_3OP, X86ISD::VPDPBSUDS,
+                       0),
+    X86_INTRINSIC_DATA(avx10_vpdpbuud_512, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpbuuds_512, INTR_TYPE_3OP, X86ISD::VPDPBUUDS,
+                       0),
+    X86_INTRINSIC_DATA(avx10_vpdpwsud_512, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpwsuds_512, INTR_TYPE_3OP, X86ISD::VPDPWSUDS,
+                       0),
+    X86_INTRINSIC_DATA(avx10_vpdpwusd_512, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpwusds_512, INTR_TYPE_3OP, X86ISD::VPDPWUSDS,
+                       0),
+    X86_INTRINSIC_DATA(avx10_vpdpwuud_512, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+    X86_INTRINSIC_DATA(avx10_vpdpwuuds_512, INTR_TYPE_3OP, X86ISD::VPDPWUUDS,
+                       0),
     X86_INTRINSIC_DATA(avx2_mpsadbw, INTR_TYPE_3OP_IMM8, X86ISD::MPSADBW, 0),
     X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
     X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
@@ -456,6 +477,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
     X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
     X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwsud_128, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwsud_256, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwsuds_128, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwsuds_256, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwusd_128, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwusd_256, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwusds_128, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwusds_256, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwuud_128, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwuud_256, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwuuds_128, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0),
+    X86_INTRINSIC_DATA(avx2_vpdpwuuds_256, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0),
     X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD,
                        X86ISD::FADD_RND),
     X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD,
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index bafa52a2a83ae..07e86cb01e133 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -1,6 +1,389 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+
+; VNNI FP16
+
+define <16 x float> @test_mm512_dpph_ps(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) {
+; CHECK-LABEL: test_mm512_dpph_ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdpphps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x52,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U, <32 x half> %__A, <32 x half> %__B) {
+; X86-LABEL: test_mm512_mask_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> %__W
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__W, <32 x half> %__A, <32 x half> %__B) {
+; X86-LABEL: test_mm512_maskz_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>)
+
+; VNNI INT8
+
+define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbssd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbssd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbssd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpbssd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbssds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbssds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbssd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbssd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpbsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpbuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+; VNNI INT16
+
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpwsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwusd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpwusd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwusd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwusd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwusds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwusds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwusd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwusd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpwuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %__B = load <16 x i32>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %bst = bitcast i16 %__U to <16 x i1>
+  %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
 
 ; VMPSADBW
 
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 4080546c0c543..15ef78aa4c1a5 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -2,6 +2,569 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
+; VNNI FP16
+
+define <4 x float> @test_mm_dpph_ps(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) {
+; CHECK-LABEL: test_mm_dpph_ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdpphps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x52,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x half> %__A, <8 x half> %__B) {
+; X86-LABEL: test_mm_mask_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> %__W
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_maskz_dpph_ps(i8 zeroext %__U, <4 x float> %__W, <8 x half> %__A, <8 x half> %__B) {
+; X86-LABEL: test_mm_maskz_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_dpph_ps(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) {
+; CHECK-LABEL: test_mm256_dpph_ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdpphps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x52,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <16 x half> %__A, <16 x half> %__B) {
+; X86-LABEL: test_mm256_mask_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> %__W
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_maskz_dpph_ps(i8 zeroext %__U, <8 x float> %__W, <16 x half> %__A, <16 x half> %__B) {
+; X86-LABEL: test_mm256_maskz_dpph_ps:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpph_ps:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x half>)
+declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>)
+
+; VNNI INT8
+
+define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbssd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbssd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1>
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbssds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbssds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1>
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbssds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbssds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbssd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbssd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1>
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1>
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1>
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; VNNI INT16
+
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwsuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwsuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwsud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwsud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwusd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwusd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwusds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwusds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwusds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwusds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwusd_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwusd_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %bst = bitcast i4 %__U to <4 x i1> ; one mask bit per result lane
+  %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwuuds_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwuuds_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ; merge-masking - lanes with a clear mask bit keep %__W
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwuud_epi32:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwuud_epi32:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %bst = bitcast i8 %__U to <8 x i1> ; one mask bit per result lane
+  %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ; zero-masking - lanes with a clear mask bit become 0
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
 ; VMPSADBW
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index 999c968fa80db..8601d454215ad 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -1,12 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
 
 define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -17,6 +24,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
@@ -27,6 +39,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -37,6 +54,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
@@ -47,6 +69,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -57,6 +84,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
@@ -67,6 +99,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -77,6 +114,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
@@ -87,6 +129,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -97,6 +144,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
@@ -107,6 +159,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
   ret <4 x i32> %ret
 }
@@ -117,6 +174,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
   ret <8 x i32> %ret
 }
diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
index f9e44ac4132be..607720fbc3f33 100644
--- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64
 
 
 declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -22,6 +24,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt
 ; X64-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -48,6 +67,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p
 ; X64-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -74,6 +110,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt
 ; X64-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -100,6 +153,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; X64-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -126,6 +196,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt
 ; X64-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -152,6 +239,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p
 ; X64-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -178,6 +282,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt
 ; X64-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -204,6 +325,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; X64-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -230,6 +368,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt
 ; X64-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -256,6 +411,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p
 ; X64-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
 ; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <4 x i32>, ptr %x2p
   %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
   %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -282,6 +454,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt
 ; X64-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x18]
+; AVX10-X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x1f]
+; AVX10-X64-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -308,6 +497,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p
 ; X64-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
 ; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; AVX10-X86:       # %bb.0:
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x18]
+; AVX10-X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
+; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; AVX10-X64:       # %bb.0:
+; AVX10-X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT:    vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x1f]
+; AVX10-X64-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
+; AVX10-X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT:    retq # encoding: [0xc3]
   %x2 = load <8 x i32>, ptr %x2p
   %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
   %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
index 59457e6eec293..8abfbcabf113b 100644
--- a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
@@ -1,6 +1,1416 @@
 # RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT
 # RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
 
+# VNNI FP16
+
+# ATT:   vdpphps %xmm4, %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmm4
+0x62,0xf2,0x64,0x08,0x52,0xd4
+
+# ATT:   vdpphps %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vdpphps xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x52,0xd4
+
+# ATT:   vdpphps %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x52,0xd4
+
+# ATT:   vdpphps %ymm4, %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymm4
+0x62,0xf2,0x64,0x28,0x52,0xd4
+
+# ATT:   vdpphps %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vdpphps ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x52,0xd4
+
+# ATT:   vdpphps %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x52,0xd4
+
+# ATT:   vdpphps %zmm4, %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x52,0xd4
+
+# ATT:   vdpphps %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vdpphps zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x52,0xd4
+
+# ATT:   vdpphps %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x52,0xd4
+
+# ATT:   vdpphps  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x52,0x10
+
+# ATT:   vdpphps  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vdpphps  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f
+
+# ATT:   vdpphps  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x52,0x52,0x80
+
+# ATT:   vdpphps  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x52,0x10
+
+# ATT:   vdpphps  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vdpphps  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f
+
+# ATT:   vdpphps  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x52,0x52,0x80
+
+# ATT:   vdpphps  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x52,0x10
+
+# ATT:   vdpphps  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vdpphps  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f
+
+# ATT:   vdpphps  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x52,0x52,0x80
+
+# VNNI INT8
+
+# ATT:   vpdpbssd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x50,0xd4
+
+# ATT:   vpdpbssd %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x67,0x0f,0x50,0xd4
+
+# ATT:   vpdpbssd %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x67,0x8f,0x50,0xd4
+
+# ATT:   vpdpbssd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x50,0xd4
+
+# ATT:   vpdpbssd %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x67,0x2f,0x50,0xd4
+
+# ATT:   vpdpbssd %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x67,0xaf,0x50,0xd4
+
+# ATT:   vpdpbssd %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmm4
+0x62,0xf2,0x67,0x48,0x50,0xd4
+
+# ATT:   vpdpbssd %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x67,0x4f,0x50,0xd4
+
+# ATT:   vpdpbssd %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x67,0xcf,0x50,0xd4
+
+# ATT:   vpdpbssd  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x67,0x18,0x50,0x10
+
+# ATT:   vpdpbssd  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbssd  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f
+
+# ATT:   vpdpbssd  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x67,0x9f,0x50,0x52,0x80
+
+# ATT:   vpdpbssd  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x67,0x38,0x50,0x10
+
+# ATT:   vpdpbssd  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbssd  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f
+
+# ATT:   vpdpbssd  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x67,0xbf,0x50,0x52,0x80
+
+# ATT:   vpdpbssd  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x67,0x58,0x50,0x10
+
+# ATT:   vpdpbssd  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbssd  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f
+
+# ATT:   vpdpbssd  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x67,0xdf,0x50,0x52,0x80
+
+# ATT:   vpdpbssds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x51,0xd4
+
+# ATT:   vpdpbssds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x67,0x0f,0x51,0xd4
+
+# ATT:   vpdpbssds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x67,0x8f,0x51,0xd4
+
+# ATT:   vpdpbssds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x51,0xd4
+
+# ATT:   vpdpbssds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x67,0x2f,0x51,0xd4
+
+# ATT:   vpdpbssds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x67,0xaf,0x51,0xd4
+
+# ATT:   vpdpbssds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmm4
+0x62,0xf2,0x67,0x48,0x51,0xd4
+
+# ATT:   vpdpbssds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x67,0x4f,0x51,0xd4
+
+# ATT:   vpdpbssds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x67,0xcf,0x51,0xd4
+
+# ATT:   vpdpbssds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x67,0x18,0x51,0x10
+
+# ATT:   vpdpbssds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbssds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f
+
+# ATT:   vpdpbssds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x67,0x9f,0x51,0x52,0x80
+
+# ATT:   vpdpbssds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x67,0x38,0x51,0x10
+
+# ATT:   vpdpbssds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbssds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f
+
+# ATT:   vpdpbssds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x67,0xbf,0x51,0x52,0x80
+
+# ATT:   vpdpbssds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x67,0x58,0x51,0x10
+
+# ATT:   vpdpbssds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbssds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f
+
+# ATT:   vpdpbssds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x67,0xdf,0x51,0x52,0x80
+
+# ATT:   vpdpbsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x50,0xd4
+
+# ATT:   vpdpbsud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0x50,0xd4
+
+# ATT:   vpdpbsud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0x50,0xd4
+
+# ATT:   vpdpbsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x50,0xd4
+
+# ATT:   vpdpbsud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0x50,0xd4
+
+# ATT:   vpdpbsud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0x50,0xd4
+
+# ATT:   vpdpbsud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0x50,0xd4
+
+# ATT:   vpdpbsud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0x50,0xd4
+
+# ATT:   vpdpbsud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0x50,0xd4
+
+# ATT:   vpdpbsud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0x50,0x10
+
+# ATT:   vpdpbsud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbsud  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f
+
+# ATT:   vpdpbsud  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0x50,0x52,0x80
+
+# ATT:   vpdpbsud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0x50,0x10
+
+# ATT:   vpdpbsud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbsud  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f
+
+# ATT:   vpdpbsud  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0x50,0x52,0x80
+
+# ATT:   vpdpbsud  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0x50,0x10
+
+# ATT:   vpdpbsud  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbsud  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f
+
+# ATT:   vpdpbsud  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0x50,0x52,0x80
+
+# ATT:   vpdpbsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x51,0xd4
+
+# ATT:   vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0x51,0xd4
+
+# ATT:   vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0x51,0xd4
+
+# ATT:   vpdpbsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x51,0xd4
+
+# ATT:   vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0x51,0xd4
+
+# ATT:   vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0x51,0xd4
+
+# ATT:   vpdpbsuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0x51,0xd4
+
+# ATT:   vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0x51,0xd4
+
+# ATT:   vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0x51,0xd4
+
+# ATT:   vpdpbsuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0x51,0x10
+
+# ATT:   vpdpbsuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbsuds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f
+
+# ATT:   vpdpbsuds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0x51,0x52,0x80
+
+# ATT:   vpdpbsuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0x51,0x10
+
+# ATT:   vpdpbsuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbsuds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f
+
+# ATT:   vpdpbsuds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0x51,0x52,0x80
+
+# ATT:   vpdpbsuds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0x51,0x10
+
+# ATT:   vpdpbsuds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbsuds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f
+
+# ATT:   vpdpbsuds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0x51,0x52,0x80
+
+# ATT:   vpdpbuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x50,0xd4
+
+# ATT:   vpdpbuud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x50,0xd4
+
+# ATT:   vpdpbuud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x50,0xd4
+
+# ATT:   vpdpbuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x50,0xd4
+
+# ATT:   vpdpbuud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x50,0xd4
+
+# ATT:   vpdpbuud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x50,0xd4
+
+# ATT:   vpdpbuud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x50,0xd4
+
+# ATT:   vpdpbuud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x50,0xd4
+
+# ATT:   vpdpbuud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x50,0xd4
+
+# ATT:   vpdpbuud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x50,0x10
+
+# ATT:   vpdpbuud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbuud  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f
+
+# ATT:   vpdpbuud  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x50,0x52,0x80
+
+# ATT:   vpdpbuud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x50,0x10
+
+# ATT:   vpdpbuud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbuud  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f
+
+# ATT:   vpdpbuud  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x50,0x52,0x80
+
+# ATT:   vpdpbuud  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x50,0x10
+
+# ATT:   vpdpbuud  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbuud  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f
+
+# ATT:   vpdpbuud  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x50,0x52,0x80
+
+# ATT:   vpdpbuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x51,0xd4
+
+# ATT:   vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x51,0xd4
+
+# ATT:   vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x51,0xd4
+
+# ATT:   vpdpbuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x51,0xd4
+
+# ATT:   vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x51,0xd4
+
+# ATT:   vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x51,0xd4
+
+# ATT:   vpdpbuuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x51,0xd4
+
+# ATT:   vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x51,0xd4
+
+# ATT:   vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x51,0xd4
+
+# ATT:   vpdpbuuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x51,0x10
+
+# ATT:   vpdpbuuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbuuds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f
+
+# ATT:   vpdpbuuds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x51,0x52,0x80
+
+# ATT:   vpdpbuuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x51,0x10
+
+# ATT:   vpdpbuuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbuuds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f
+
+# ATT:   vpdpbuuds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x51,0x52,0x80
+
+# ATT:   vpdpbuuds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x51,0x10
+
+# ATT:   vpdpbuuds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbuuds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f
+
+# ATT:   vpdpbuuds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x51,0x52,0x80
+
+# VNNI INT16
+
+# ATT:   vpdpwsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd2,0xd4
+
+# ATT:   vpdpwsud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0xd2,0xd4
+
+# ATT:   vpdpwsud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0xd2,0xd4
+
+# ATT:   vpdpwsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd2,0xd4
+
+# ATT:   vpdpwsud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0xd2,0xd4
+
+# ATT:   vpdpwsud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0xd2,0xd4
+
+# ATT:   vpdpwsud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0xd2,0xd4
+
+# ATT:   vpdpwsud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0xd2,0xd4
+
+# ATT:   vpdpwsud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0xd2,0xd4
+
+# ATT:   vpdpwsud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0xd2,0x10
+
+# ATT:   vpdpwsud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsud  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f
+
+# ATT:   vpdpwsud  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80
+
+# ATT:   vpdpwsud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0xd2,0x10
+
+# ATT:   vpdpwsud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsud  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwsud  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80
+
+# ATT:   vpdpwsud  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0xd2,0x10
+
+# ATT:   vpdpwsud  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwsud  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwsud  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80
+
+# ATT:   vpdpwsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd3,0xd4
+
+# ATT:   vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0xd3,0xd4
+
+# ATT:   vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0xd3,0xd4
+
+# ATT:   vpdpwsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd3,0xd4
+
+# ATT:   vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0xd3,0xd4
+
+# ATT:   vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0xd3,0xd4
+
+# ATT:   vpdpwsuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0xd3,0xd4
+
+# ATT:   vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0xd3,0xd4
+
+# ATT:   vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0xd3,0xd4
+
+# ATT:   vpdpwsuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0xd3,0x10
+
+# ATT:   vpdpwsuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsuds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f
+
+# ATT:   vpdpwsuds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80
+
+# ATT:   vpdpwsuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0xd3,0x10
+
+# ATT:   vpdpwsuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsuds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwsuds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80
+
+# ATT:   vpdpwsuds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0xd3,0x10
+
+# ATT:   vpdpwsuds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwsuds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwsuds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80
+
+# ATT:   vpdpwusd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd2,0xd4
+
+# ATT:   vpdpwusd %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x65,0x0f,0xd2,0xd4
+
+# ATT:   vpdpwusd %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x65,0x8f,0xd2,0xd4
+
+# ATT:   vpdpwusd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd2,0xd4
+
+# ATT:   vpdpwusd %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x65,0x2f,0xd2,0xd4
+
+# ATT:   vpdpwusd %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x65,0xaf,0xd2,0xd4
+
+# ATT:   vpdpwusd %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmm4
+0x62,0xf2,0x65,0x48,0xd2,0xd4
+
+# ATT:   vpdpwusd %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x65,0x4f,0xd2,0xd4
+
+# ATT:   vpdpwusd %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x65,0xcf,0xd2,0xd4
+
+# ATT:   vpdpwusd  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x65,0x18,0xd2,0x10
+
+# ATT:   vpdpwusd  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusd  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f
+
+# ATT:   vpdpwusd  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80
+
+# ATT:   vpdpwusd  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x65,0x38,0xd2,0x10
+
+# ATT:   vpdpwusd  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusd  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwusd  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80
+
+# ATT:   vpdpwusd  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x65,0x58,0xd2,0x10
+
+# ATT:   vpdpwusd  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusd  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwusd  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80
+
+# ATT:   vpdpwusds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd3,0xd4
+
+# ATT:   vpdpwusds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x65,0x0f,0xd3,0xd4
+
+# ATT:   vpdpwusds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x65,0x8f,0xd3,0xd4
+
+# ATT:   vpdpwusds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd3,0xd4
+
+# ATT:   vpdpwusds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x65,0x2f,0xd3,0xd4
+
+# ATT:   vpdpwusds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x65,0xaf,0xd3,0xd4
+
+# ATT:   vpdpwusds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmm4
+0x62,0xf2,0x65,0x48,0xd3,0xd4
+
+# ATT:   vpdpwusds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x65,0x4f,0xd3,0xd4
+
+# ATT:   vpdpwusds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x65,0xcf,0xd3,0xd4
+
+# ATT:   vpdpwusds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x65,0x18,0xd3,0x10
+
+# ATT:   vpdpwusds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f
+
+# ATT:   vpdpwusds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80
+
+# ATT:   vpdpwusds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x65,0x38,0xd3,0x10
+
+# ATT:   vpdpwusds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwusds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80
+
+# ATT:   vpdpwusds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x65,0x58,0xd3,0x10
+
+# ATT:   vpdpwusds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwusds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80
+
+# ATT:   vpdpwuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd2,0xd4
+
+# ATT:   vpdpwuud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0xd2,0xd4
+
+# ATT:   vpdpwuud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0xd2,0xd4
+
+# ATT:   vpdpwuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd2,0xd4
+
+# ATT:   vpdpwuud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0xd2,0xd4
+
+# ATT:   vpdpwuud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0xd2,0xd4
+
+# ATT:   vpdpwuud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0xd2,0xd4
+
+# ATT:   vpdpwuud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0xd2,0xd4
+
+# ATT:   vpdpwuud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0xd2,0xd4
+
+# ATT:   vpdpwuud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0xd2,0x10
+
+# ATT:   vpdpwuud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuud  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f
+
+# ATT:   vpdpwuud  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80
+
+# ATT:   vpdpwuud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0xd2,0x10
+
+# ATT:   vpdpwuud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuud  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwuud  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80
+
+# ATT:   vpdpwuud  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0xd2,0x10
+
+# ATT:   vpdpwuud  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuud  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f
+
+# ATT:   vpdpwuud  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80
+
+# ATT:   vpdpwuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd3,0xd4
+
+# ATT:   vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0xd3,0xd4
+
+# ATT:   vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0xd3,0xd4
+
+# ATT:   vpdpwuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd3,0xd4
+
+# ATT:   vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0xd3,0xd4
+
+# ATT:   vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0xd3,0xd4
+
+# ATT:   vpdpwuuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0xd3,0xd4
+
+# ATT:   vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0xd3,0xd4
+
+# ATT:   vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0xd3,0xd4
+
+# ATT:   vpdpwuuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0xd3,0x10
+
+# ATT:   vpdpwuuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuuds  2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f
+
+# ATT:   vpdpwuuds  -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80
+
+# ATT:   vpdpwuuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0xd3,0x10
+
+# ATT:   vpdpwuuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuuds  4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwuuds  -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80
+
+# ATT:   vpdpwuuds  268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0xd3,0x10
+
+# ATT:   vpdpwuuds  -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuuds  8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f
+
+# ATT:   vpdpwuuds  -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80
+
 # VMPSADBW
 
 # ATT:   vmpsadbw $123, %xmm4, %xmm3, %xmm2
diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
index 34f8851d04d6b..fd97529af2806 100644
--- a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
@@ -1,6 +1,1416 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
 # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
 
+# VNNI FP16
+
+# ATT:   vdpphps %xmm24, %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x52,0xf0
+
+# ATT:   vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vdpphps xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x52,0xf0
+
+# ATT:   vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x52,0xf0
+
+# ATT:   vdpphps %ymm24, %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x52,0xf0
+
+# ATT:   vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vdpphps ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x52,0xf0
+
+# ATT:   vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x52,0xf0
+
+# ATT:   vdpphps %zmm24, %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x52,0xf0
+
+# ATT:   vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vdpphps zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x52,0xf0
+
+# ATT:   vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x52,0xf0
+
+# ATT:   vdpphps  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vdpphps xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vdpphps  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vdpphps  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x52,0x71,0x7f
+
+# ATT:   vdpphps  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x52,0x72,0x80
+
+# ATT:   vdpphps  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vdpphps ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vdpphps  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vdpphps  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f
+
+# ATT:   vdpphps  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x52,0x72,0x80
+
+# ATT:   vdpphps  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vdpphps  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vdpphps zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vdpphps  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vdpphps  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vdpphps  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f
+
+# ATT:   vdpphps  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x52,0x72,0x80
+
+# VNNI INT8
+
+# ATT:   vpdpbssd %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmm24
+0x62,0x82,0x47,0x00,0x50,0xf0
+
+# ATT:   vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x47,0x07,0x50,0xf0
+
+# ATT:   vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x47,0x87,0x50,0xf0
+
+# ATT:   vpdpbssd %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymm24
+0x62,0x82,0x47,0x20,0x50,0xf0
+
+# ATT:   vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x47,0x27,0x50,0xf0
+
+# ATT:   vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x47,0xa7,0x50,0xf0
+
+# ATT:   vpdpbssd %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmm24
+0x62,0x82,0x47,0x40,0x50,0xf0
+
+# ATT:   vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x47,0x47,0x50,0xf0
+
+# ATT:   vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x47,0xc7,0x50,0xf0
+
+# ATT:   vpdpbssd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssd  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbssd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x47,0x87,0x50,0x71,0x7f
+
+# ATT:   vpdpbssd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x47,0x97,0x50,0x72,0x80
+
+# ATT:   vpdpbssd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssd  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbssd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f
+
+# ATT:   vpdpbssd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x47,0xb7,0x50,0x72,0x80
+
+# ATT:   vpdpbssd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssd  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssd  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbssd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f
+
+# ATT:   vpdpbssd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x47,0xd7,0x50,0x72,0x80
+
+# ATT:   vpdpbssds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmm24
+0x62,0x82,0x47,0x00,0x51,0xf0
+
+# ATT:   vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x47,0x07,0x51,0xf0
+
+# ATT:   vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x47,0x87,0x51,0xf0
+
+# ATT:   vpdpbssds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymm24
+0x62,0x82,0x47,0x20,0x51,0xf0
+
+# ATT:   vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x47,0x27,0x51,0xf0
+
+# ATT:   vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x47,0xa7,0x51,0xf0
+
+# ATT:   vpdpbssds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmm24
+0x62,0x82,0x47,0x40,0x51,0xf0
+
+# ATT:   vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x47,0x47,0x51,0xf0
+
+# ATT:   vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x47,0xc7,0x51,0xf0
+
+# ATT:   vpdpbssds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbssds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x47,0x87,0x51,0x71,0x7f
+
+# ATT:   vpdpbssds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x47,0x97,0x51,0x72,0x80
+
+# ATT:   vpdpbssds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbssds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f
+
+# ATT:   vpdpbssds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x47,0xb7,0x51,0x72,0x80
+
+# ATT:   vpdpbssds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbssds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbssds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbssds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbssds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f
+
+# ATT:   vpdpbssds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x47,0xd7,0x51,0x72,0x80
+
+# ATT:   vpdpbsud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0x50,0xf0
+
+# ATT:   vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0x50,0xf0
+
+# ATT:   vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0x50,0xf0
+
+# ATT:   vpdpbsud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0x50,0xf0
+
+# ATT:   vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0x50,0xf0
+
+# ATT:   vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0x50,0xf0
+
+# ATT:   vpdpbsud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0x50,0xf0
+
+# ATT:   vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0x50,0xf0
+
+# ATT:   vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0x50,0xf0
+
+# ATT:   vpdpbsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsud  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0x50,0x71,0x7f
+
+# ATT:   vpdpbsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0x50,0x72,0x80
+
+# ATT:   vpdpbsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsud  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f
+
+# ATT:   vpdpbsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0x50,0x72,0x80
+
+# ATT:   vpdpbsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsud  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsud  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f
+
+# ATT:   vpdpbsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0x50,0x72,0x80
+
+# ATT:   vpdpbsuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0x51,0xf0
+
+# ATT:   vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0x51,0xf0
+
+# ATT:   vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0x51,0xf0
+
+# ATT:   vpdpbsuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0x51,0xf0
+
+# ATT:   vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0x51,0xf0
+
+# ATT:   vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0x51,0xf0
+
+# ATT:   vpdpbsuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0x51,0xf0
+
+# ATT:   vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0x51,0xf0
+
+# ATT:   vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0x51,0xf0
+
+# ATT:   vpdpbsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsuds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0x51,0x71,0x7f
+
+# ATT:   vpdpbsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0x51,0x72,0x80
+
+# ATT:   vpdpbsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsuds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f
+
+# ATT:   vpdpbsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0x51,0x72,0x80
+
+# ATT:   vpdpbsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbsuds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbsuds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f
+
+# ATT:   vpdpbsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0x51,0x72,0x80
+
+# ATT:   vpdpbuud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x50,0xf0
+
+# ATT:   vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x50,0xf0
+
+# ATT:   vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x50,0xf0
+
+# ATT:   vpdpbuud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x50,0xf0
+
+# ATT:   vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x50,0xf0
+
+# ATT:   vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x50,0xf0
+
+# ATT:   vpdpbuud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x50,0xf0
+
+# ATT:   vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x50,0xf0
+
+# ATT:   vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x50,0xf0
+
+# ATT:   vpdpbuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuud  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x50,0x71,0x7f
+
+# ATT:   vpdpbuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x50,0x72,0x80
+
+# ATT:   vpdpbuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuud  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f
+
+# ATT:   vpdpbuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x50,0x72,0x80
+
+# ATT:   vpdpbuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuud  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuud  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f
+
+# ATT:   vpdpbuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x50,0x72,0x80
+
+# ATT:   vpdpbuuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x51,0xf0
+
+# ATT:   vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x51,0xf0
+
+# ATT:   vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x51,0xf0
+
+# ATT:   vpdpbuuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x51,0xf0
+
+# ATT:   vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x51,0xf0
+
+# ATT:   vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x51,0xf0
+
+# ATT:   vpdpbuuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x51,0xf0
+
+# ATT:   vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x51,0xf0
+
+# ATT:   vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x51,0xf0
+
+# ATT:   vpdpbuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuuds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpbuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x51,0x71,0x7f
+
+# ATT:   vpdpbuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x51,0x72,0x80
+
+# ATT:   vpdpbuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuuds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpbuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f
+
+# ATT:   vpdpbuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x51,0x72,0x80
+
+# ATT:   vpdpbuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpbuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpbuuds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpbuuds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpbuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f
+
+# ATT:   vpdpbuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x51,0x72,0x80
+
+# VNNI INT16
+
+# ATT:   vpdpwsud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0xd2,0xf0
+
+# ATT:   vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0xd2,0xf0
+
+# ATT:   vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0xd2,0xf0
+
+# ATT:   vpdpwsud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0xd2,0xf0
+
+# ATT:   vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0xd2,0xf0
+
+# ATT:   vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0xd2,0xf0
+
+# ATT:   vpdpwsud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0xd2,0xf0
+
+# ATT:   vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0xd2,0xf0
+
+# ATT:   vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0xd2,0xf0
+
+# ATT:   vpdpwsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsud  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f
+
+# ATT:   vpdpwsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0xd2,0x72,0x80
+
+# ATT:   vpdpwsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsud  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80
+
+# ATT:   vpdpwsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsud  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80
+
+# ATT:   vpdpwsuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0xd3,0xf0
+
+# ATT:   vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0xd3,0xf0
+
+# ATT:   vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0xd3,0xf0
+
+# ATT:   vpdpwsuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0xd3,0xf0
+
+# ATT:   vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0xd3,0xf0
+
+# ATT:   vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0xd3,0xf0
+
+# ATT:   vpdpwsuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0xd3,0xf0
+
+# ATT:   vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0xd3,0xf0
+
+# ATT:   vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0xd3,0xf0
+
+# ATT:   vpdpwsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsuds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f
+
+# ATT:   vpdpwsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0xd3,0x72,0x80
+
+# ATT:   vpdpwsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsuds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80
+
+# ATT:   vpdpwsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsuds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80
+
+# ATT:   vpdpwusd %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmm24
+0x62,0x82,0x45,0x00,0xd2,0xf0
+
+# ATT:   vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x45,0x07,0xd2,0xf0
+
+# ATT:   vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x45,0x87,0xd2,0xf0
+
+# ATT:   vpdpwusd %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymm24
+0x62,0x82,0x45,0x20,0xd2,0xf0
+
+# ATT:   vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x45,0x27,0xd2,0xf0
+
+# ATT:   vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x45,0xa7,0xd2,0xf0
+
+# ATT:   vpdpwusd %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmm24
+0x62,0x82,0x45,0x40,0xd2,0xf0
+
+# ATT:   vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x45,0x47,0xd2,0xf0
+
+# ATT:   vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x45,0xc7,0xd2,0xf0
+
+# ATT:   vpdpwusd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusd  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f
+
+# ATT:   vpdpwusd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x45,0x97,0xd2,0x72,0x80
+
+# ATT:   vpdpwusd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusd  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwusd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80
+
+# ATT:   vpdpwusd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusd  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwusd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80
+
+# ATT:   vpdpwusds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmm24
+0x62,0x82,0x45,0x00,0xd3,0xf0
+
+# ATT:   vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x45,0x07,0xd3,0xf0
+
+# ATT:   vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x45,0x87,0xd3,0xf0
+
+# ATT:   vpdpwusds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymm24
+0x62,0x82,0x45,0x20,0xd3,0xf0
+
+# ATT:   vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x45,0x27,0xd3,0xf0
+
+# ATT:   vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x45,0xa7,0xd3,0xf0
+
+# ATT:   vpdpwusds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmm24
+0x62,0x82,0x45,0x40,0xd3,0xf0
+
+# ATT:   vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x45,0x47,0xd3,0xf0
+
+# ATT:   vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x45,0xc7,0xd3,0xf0
+
+# ATT:   vpdpwusds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f
+
+# ATT:   vpdpwusds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x45,0x97,0xd3,0x72,0x80
+
+# ATT:   vpdpwusds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwusds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80
+
+# ATT:   vpdpwusds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwusds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80
+
+# ATT:   vpdpwuud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0xd2,0xf0
+
+# ATT:   vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0xd2,0xf0
+
+# ATT:   vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0xd2,0xf0
+
+# ATT:   vpdpwuud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0xd2,0xf0
+
+# ATT:   vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0xd2,0xf0
+
+# ATT:   vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0xd2,0xf0
+
+# ATT:   vpdpwuud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0xd2,0xf0
+
+# ATT:   vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0xd2,0xf0
+
+# ATT:   vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0xd2,0xf0
+
+# ATT:   vpdpwuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuud  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f
+
+# ATT:   vpdpwuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0xd2,0x72,0x80
+
+# ATT:   vpdpwuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuud  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80
+
+# ATT:   vpdpwuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuud  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f
+
+# ATT:   vpdpwuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80
+
+# ATT:   vpdpwuuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0xd3,0xf0
+
+# ATT:   vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0xd3,0xf0
+
+# ATT:   vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0xd3,0xf0
+
+# ATT:   vpdpwuuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0xd3,0xf0
+
+# ATT:   vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0xd3,0xf0
+
+# ATT:   vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0xd3,0xf0
+
+# ATT:   vpdpwuuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0xd3,0xf0
+
+# ATT:   vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0xd3,0xf0
+
+# ATT:   vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0xd3,0xf0
+
+# ATT:   vpdpwuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuuds  -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f
+
+# ATT:   vpdpwuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0xd3,0x72,0x80
+
+# ATT:   vpdpwuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuuds  -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80
+
+# ATT:   vpdpwuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuuds  -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f
+
+# ATT:   vpdpwuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80
+
 # VMPSADBW
 
 # ATT:   vmpsadbw $123, %xmm24, %xmm23, %xmm22
diff --git a/llvm/test/MC/X86/avx10_2ni-32-intel.s b/llvm/test/MC/X86/avx10_2ni-32-intel.s
index ea9a89f316cc3..54e9f9433e003 100644
--- a/llvm/test/MC/X86/avx10_2ni-32-intel.s
+++ b/llvm/test/MC/X86/avx10_2ni-32-intel.s
@@ -1,5 +1,1415 @@
 // RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
 
+// VNNI FP16
+
+// CHECK: vdpphps xmm2, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0xd4]
+          vdpphps xmm2, xmm3, xmm4
+
+// CHECK: vdpphps xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0xd4]
+          vdpphps xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0xd4]
+          vdpphps xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vdpphps ymm2, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0xd4]
+          vdpphps ymm2, ymm3, ymm4
+
+// CHECK: vdpphps ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0xd4]
+          vdpphps ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0xd4]
+          vdpphps ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vdpphps zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0xd4]
+          vdpphps zmm2, zmm3, zmm4
+
+// CHECK: vdpphps zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x52,0xd4]
+          vdpphps zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0xd4]
+          vdpphps zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+          vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x52,0x10]
+          vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f]
+          vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x52,0x52,0x80]
+          vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+          vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x52,0x10]
+          vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f]
+          vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x52,0x52,0x80]
+          vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+          vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x52,0x10]
+          vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f]
+          vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x52,0x52,0x80]
+          vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// VNNI INT8
+
+// CHECK: vpdpbssd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4]
+          vpdpbssd xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0xd4]
+          vpdpbssd xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0xd4]
+          vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbssd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4]
+          vpdpbssd ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0xd4]
+          vpdpbssd ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0xd4]
+          vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbssd zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0xd4]
+          vpdpbssd zmm2, zmm3, zmm4
+
+// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0xd4]
+          vpdpbssd zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0xd4]
+          vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x50,0x10]
+          vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f]
+          vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x50,0x52,0x80]
+          vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x50,0x10]
+          vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f]
+          vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x50,0x52,0x80]
+          vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x50,0x10]
+          vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f]
+          vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x50,0x52,0x80]
+          vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbssds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4]
+          vpdpbssds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0xd4]
+          vpdpbssds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0xd4]
+          vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbssds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4]
+          vpdpbssds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0xd4]
+          vpdpbssds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0xd4]
+          vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbssds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0xd4]
+          vpdpbssds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0xd4]
+          vpdpbssds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0xd4]
+          vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x51,0x10]
+          vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f]
+          vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x51,0x52,0x80]
+          vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x51,0x10]
+          vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f]
+          vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x51,0x52,0x80]
+          vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x51,0x10]
+          vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f]
+          vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x51,0x52,0x80]
+          vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+          vpdpbsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0xd4]
+          vpdpbsud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0xd4]
+          vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+          vpdpbsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0xd4]
+          vpdpbsud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0xd4]
+          vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbsud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0xd4]
+          vpdpbsud zmm2, zmm3, zmm4
+
+// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0xd4]
+          vpdpbsud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0xd4]
+          vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x50,0x10]
+          vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f]
+          vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x50,0x52,0x80]
+          vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x50,0x10]
+          vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f]
+          vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x50,0x52,0x80]
+          vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x50,0x10]
+          vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f]
+          vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x50,0x52,0x80]
+          vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+          vpdpbsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0xd4]
+          vpdpbsuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0xd4]
+          vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+          vpdpbsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0xd4]
+          vpdpbsuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0xd4]
+          vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0xd4]
+          vpdpbsuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0xd4]
+          vpdpbsuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0xd4]
+          vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x51,0x10]
+          vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f]
+          vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x51,0x52,0x80]
+          vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x51,0x10]
+          vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f]
+          vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x51,0x52,0x80]
+          vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x51,0x10]
+          vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f]
+          vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x51,0x52,0x80]
+          vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+          vpdpbuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0xd4]
+          vpdpbuud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0xd4]
+          vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+          vpdpbuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0xd4]
+          vpdpbuud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0xd4]
+          vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbuud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0xd4]
+          vpdpbuud zmm2, zmm3, zmm4
+
+// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0xd4]
+          vpdpbuud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0xd4]
+          vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x50,0x10]
+          vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f]
+          vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x50,0x52,0x80]
+          vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x50,0x10]
+          vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f]
+          vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x50,0x52,0x80]
+          vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x50,0x10]
+          vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f]
+          vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x50,0x52,0x80]
+          vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+          vpdpbuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x51,0xd4]
+          vpdpbuuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0xd4]
+          vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+          vpdpbuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0xd4]
+          vpdpbuuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0xd4]
+          vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0xd4]
+          vpdpbuuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0xd4]
+          vpdpbuuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0xd4]
+          vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x51,0x10]
+          vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f]
+          vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x51,0x52,0x80]
+          vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x51,0x10]
+          vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f]
+          vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x51,0x52,0x80]
+          vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x51,0x10]
+          vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f]
+          vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x51,0x52,0x80]
+          vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// VNNI INT16
+
+// CHECK: vpdpwsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+          vpdpwsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0xd4]
+          vpdpwsud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0xd4]
+          vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+          vpdpwsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0xd4]
+          vpdpwsud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0xd4]
+          vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwsud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0xd4]
+          vpdpwsud zmm2, zmm3, zmm4
+
+// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0xd4]
+          vpdpwsud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0xd4]
+          vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd2,0x10]
+          vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f]
+          vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80]
+          vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd2,0x10]
+          vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f]
+          vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80]
+          vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd2,0x10]
+          vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f]
+          vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80]
+          vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+          vpdpwsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0xd4]
+          vpdpwsuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd3,0xd4]
+          vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+          vpdpwsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0xd4]
+          vpdpwsuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0xd4]
+          vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0xd4]
+          vpdpwsuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0xd4]
+          vpdpwsuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd3,0xd4]
+          vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd3,0x10]
+          vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f]
+          vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80]
+          vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd3,0x10]
+          vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f]
+          vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80]
+          vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd3,0x10]
+          vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f]
+          vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80]
+          vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwusd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+          vpdpwusd xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0xd4]
+          vpdpwusd xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0xd4]
+          vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwusd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+          vpdpwusd ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0xd4]
+          vpdpwusd ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0xd4]
+          vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwusd zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0xd4]
+          vpdpwusd zmm2, zmm3, zmm4
+
+// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd2,0xd4]
+          vpdpwusd zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0xd4]
+          vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd2,0x10]
+          vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f]
+          vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80]
+          vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd2,0x10]
+          vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f]
+          vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80]
+          vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd2,0x10]
+          vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f]
+          vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80]
+          vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwusds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+          vpdpwusds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0xd4]
+          vpdpwusds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0xd4]
+          vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwusds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+          vpdpwusds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0xd4]
+          vpdpwusds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd3,0xd4]
+          vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwusds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0xd4]
+          vpdpwusds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0xd4]
+          vpdpwusds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0xd4]
+          vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd3,0x10]
+          vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f]
+          vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80]
+          vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd3,0x10]
+          vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f]
+          vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80]
+          vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd3,0x10]
+          vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f]
+          vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80]
+          vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+          vpdpwuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0xd4]
+          vpdpwuud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0xd4]
+          vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+          vpdpwuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd2,0xd4]
+          vpdpwuud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0xd4]
+          vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwuud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0xd4]
+          vpdpwuud zmm2, zmm3, zmm4
+
+// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0xd4]
+          vpdpwuud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0xd4]
+          vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd2,0x10]
+          vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f]
+          vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80]
+          vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd2,0x10]
+          vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f]
+          vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80]
+          vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd2,0x10]
+          vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f]
+          vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80]
+          vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+          vpdpwuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0xd4]
+          vpdpwuuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd3,0xd4]
+          vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+          vpdpwuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0xd4]
+          vpdpwuuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0xd4]
+          vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0xd4]
+          vpdpwuuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0xd4]
+          vpdpwuuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0xd4]
+          vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd3,0x10]
+          vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f]
+          vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80]
+          vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd3,0x10]
+          vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f]
+          vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80]
+          vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd3,0x10]
+          vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f]
+          vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80]
+          vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
 // VMPSADBW
 
 // CHECK: vmpsadbw xmm2, xmm3, xmm4, 123
diff --git a/llvm/test/MC/X86/avx10_2ni-64-att.s b/llvm/test/MC/X86/avx10_2ni-64-att.s
index 8ee4bc3f64127..4e90f1dfb8c91 100644
--- a/llvm/test/MC/X86/avx10_2ni-64-att.s
+++ b/llvm/test/MC/X86/avx10_2ni-64-att.s
@@ -1,5 +1,1415 @@
 // RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
 
+// VNNI FP16
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x52,0xf0]
+          vdpphps %xmm24, %xmm23, %xmm22
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x52,0xf0]
+          vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x52,0xf0]
+          vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x52,0xf0]
+          vdpphps %ymm24, %ymm23, %ymm22
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x52,0xf0]
+          vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x52,0xf0]
+          vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x52,0xf0]
+          vdpphps %zmm24, %zmm23, %zmm22
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x52,0xf0]
+          vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x52,0xf0]
+          vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vdpphps  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vdpphps  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vdpphps  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vdpphps  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vdpphps  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00]
+          vdpphps  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vdpphps  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vdpphps  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vdpphps  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x52,0x71,0x7f]
+          vdpphps  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x52,0x72,0x80]
+          vdpphps  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vdpphps  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vdpphps  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vdpphps  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vdpphps  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00]
+          vdpphps  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vdpphps  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vdpphps  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vdpphps  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f]
+          vdpphps  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x52,0x72,0x80]
+          vdpphps  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vdpphps  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vdpphps  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vdpphps  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vdpphps  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00]
+          vdpphps  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vdpphps  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vdpphps  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vdpphps  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f]
+          vdpphps  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vdpphps  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x52,0x72,0x80]
+          vdpphps  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// VNNI INT8
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x00,0x50,0xf0]
+          vpdpbssd %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x07,0x50,0xf0]
+          vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0x87,0x50,0xf0]
+          vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x47,0x20,0x50,0xf0]
+          vpdpbssd %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x27,0x50,0xf0]
+          vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x50,0xf0]
+          vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x40,0x50,0xf0]
+          vpdpbssd %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x47,0x50,0xf0]
+          vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x50,0xf0]
+          vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbssd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssd  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssd  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbssd  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssd  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbssd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x50,0x71,0x7f]
+          vpdpbssd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x50,0x72,0x80]
+          vpdpbssd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbssd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssd  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssd  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbssd  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssd  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbssd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f]
+          vpdpbssd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x50,0x72,0x80]
+          vpdpbssd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbssd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssd  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssd  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbssd  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbssd  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbssd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f]
+          vpdpbssd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x50,0x72,0x80]
+          vpdpbssd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x00,0x51,0xf0]
+          vpdpbssds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x07,0x51,0xf0]
+          vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0x87,0x51,0xf0]
+          vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x47,0x20,0x51,0xf0]
+          vpdpbssds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x27,0x51,0xf0]
+          vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x51,0xf0]
+          vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x40,0x51,0xf0]
+          vpdpbssds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x47,0x51,0xf0]
+          vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x51,0xf0]
+          vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbssds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbssds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbssds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x51,0x71,0x7f]
+          vpdpbssds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x51,0x72,0x80]
+          vpdpbssds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbssds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbssds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbssds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f]
+          vpdpbssds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x51,0x72,0x80]
+          vpdpbssds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbssds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbssds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbssds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbssds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbssds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f]
+          vpdpbssds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x51,0x72,0x80]
+          vpdpbssds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0x50,0xf0]
+          vpdpbsud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0x50,0xf0]
+          vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0x50,0xf0]
+          vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0x50,0xf0]
+          vpdpbsud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0x50,0xf0]
+          vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x50,0xf0]
+          vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0x50,0xf0]
+          vpdpbsud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0x50,0xf0]
+          vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x50,0xf0]
+          vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsud  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsud  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbsud  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsud  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x50,0x71,0x7f]
+          vpdpbsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x50,0x72,0x80]
+          vpdpbsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsud  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsud  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbsud  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsud  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f]
+          vpdpbsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x50,0x72,0x80]
+          vpdpbsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsud  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsud  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbsud  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbsud  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f]
+          vpdpbsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x50,0x72,0x80]
+          vpdpbsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0x51,0xf0]
+          vpdpbsuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0x51,0xf0]
+          vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0x51,0xf0]
+          vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0x51,0xf0]
+          vpdpbsuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0x51,0xf0]
+          vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x51,0xf0]
+          vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0x51,0xf0]
+          vpdpbsuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0x51,0xf0]
+          vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x51,0xf0]
+          vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsuds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsuds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbsuds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsuds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x51,0x71,0x7f]
+          vpdpbsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x51,0x72,0x80]
+          vpdpbsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsuds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsuds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbsuds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsuds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f]
+          vpdpbsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x51,0x72,0x80]
+          vpdpbsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsuds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbsuds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbsuds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbsuds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f]
+          vpdpbsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x51,0x72,0x80]
+          vpdpbsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x50,0xf0]
+          vpdpbuud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x50,0xf0]
+          vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x50,0xf0]
+          vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x50,0xf0]
+          vpdpbuud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x50,0xf0]
+          vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x50,0xf0]
+          vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x50,0xf0]
+          vpdpbuud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x50,0xf0]
+          vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x50,0xf0]
+          vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuud  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuud  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbuud  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuud  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x50,0x71,0x7f]
+          vpdpbuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x50,0x72,0x80]
+          vpdpbuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuud  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuud  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbuud  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuud  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f]
+          vpdpbuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x50,0x72,0x80]
+          vpdpbuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuud  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuud  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbuud  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbuud  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f]
+          vpdpbuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x50,0x72,0x80]
+          vpdpbuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x51,0xf0]
+          vpdpbuuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x51,0xf0]
+          vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x51,0xf0]
+          vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x51,0xf0]
+          vpdpbuuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x51,0xf0]
+          vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x51,0xf0]
+          vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x51,0xf0]
+          vpdpbuuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x51,0xf0]
+          vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x51,0xf0]
+          vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuuds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuuds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbuuds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuuds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x51,0x71,0x7f]
+          vpdpbuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x51,0x72,0x80]
+          vpdpbuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuuds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuuds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbuuds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuuds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f]
+          vpdpbuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x51,0x72,0x80]
+          vpdpbuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuuds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+          vpdpbuuds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbuuds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpbuuds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f]
+          vpdpbuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x51,0x72,0x80]
+          vpdpbuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// VNNI INT16
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd2,0xf0]
+          vpdpwsud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd2,0xf0]
+          vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd2,0xf0]
+          vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd2,0xf0]
+          vpdpwsud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd2,0xf0]
+          vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd2,0xf0]
+          vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd2,0xf0]
+          vpdpwsud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0xd2,0xf0]
+          vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd2,0xf0]
+          vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsud  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsud  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwsud  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsud  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f]
+          vpdpwsud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd2,0x72,0x80]
+          vpdpwsud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsud  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsud  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwsud  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsud  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f]
+          vpdpwsud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80]
+          vpdpwsud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsud  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsud  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwsud  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwsud  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f]
+          vpdpwsud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80]
+          vpdpwsud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd3,0xf0]
+          vpdpwsuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd3,0xf0]
+          vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd3,0xf0]
+          vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd3,0xf0]
+          vpdpwsuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd3,0xf0]
+          vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd3,0xf0]
+          vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd3,0xf0]
+          vpdpwsuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0xd3,0xf0]
+          vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd3,0xf0]
+          vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsuds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsuds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwsuds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsuds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f]
+          vpdpwsuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd3,0x72,0x80]
+          vpdpwsuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsuds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsuds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwsuds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsuds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f]
+          vpdpwsuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80]
+          vpdpwsuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsuds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwsuds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwsuds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwsuds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f]
+          vpdpwsuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80]
+          vpdpwsuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd2,0xf0]
+          vpdpwusd %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd2,0xf0]
+          vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd2,0xf0]
+          vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd2,0xf0]
+          vpdpwusd %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd2,0xf0]
+          vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd2,0xf0]
+          vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd2,0xf0]
+          vpdpwusd %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd2,0xf0]
+          vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd2,0xf0]
+          vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwusd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusd  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusd  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwusd  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusd  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwusd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f]
+          vpdpwusd  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd2,0x72,0x80]
+          vpdpwusd  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwusd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusd  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusd  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwusd  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusd  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwusd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f]
+          vpdpwusd  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80]
+          vpdpwusd  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwusd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusd  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusd  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwusd  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwusd  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwusd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f]
+          vpdpwusd  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80]
+          vpdpwusd  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd3,0xf0]
+          vpdpwusds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd3,0xf0]
+          vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd3,0xf0]
+          vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd3,0xf0]
+          vpdpwusds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd3,0xf0]
+          vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd3,0xf0]
+          vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd3,0xf0]
+          vpdpwusds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd3,0xf0]
+          vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd3,0xf0]
+          vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwusds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwusds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwusds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f]
+          vpdpwusds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd3,0x72,0x80]
+          vpdpwusds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwusds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwusds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwusds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f]
+          vpdpwusds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80]
+          vpdpwusds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwusds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwusds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwusds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwusds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwusds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f]
+          vpdpwusds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80]
+          vpdpwusds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd2,0xf0]
+          vpdpwuud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd2,0xf0]
+          vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd2,0xf0]
+          vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd2,0xf0]
+          vpdpwuud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd2,0xf0]
+          vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd2,0xf0]
+          vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd2,0xf0]
+          vpdpwuud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0xd2,0xf0]
+          vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd2,0xf0]
+          vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuud  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuud  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwuud  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuud  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f]
+          vpdpwuud  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd2,0x72,0x80]
+          vpdpwuud  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuud  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuud  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwuud  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuud  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f]
+          vpdpwuud  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80]
+          vpdpwuud  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuud  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuud  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwuud  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwuud  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f]
+          vpdpwuud  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80]
+          vpdpwuud  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd3,0xf0]
+          vpdpwuuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd3,0xf0]
+          vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd3,0xf0]
+          vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd3,0xf0]
+          vpdpwuuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd3,0xf0]
+          vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd3,0xf0]
+          vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd3,0xf0]
+          vpdpwuuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0xd3,0xf0]
+          vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd3,0xf0]
+          vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds  268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds  291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuuds  (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuuds  (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwuuds  -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuuds  -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f]
+          vpdpwuuds  2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd3,0x72,0x80]
+          vpdpwuuds  -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds  268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds  291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuuds  (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuuds  (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwuuds  -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuuds  -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f]
+          vpdpwuuds  4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80]
+          vpdpwuuds  -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds  268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds  291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuuds  (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+          vpdpwuuds  (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwuuds  -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+          vpdpwuuds  -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f]
+          vpdpwuuds  8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80]
+          vpdpwuuds  -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
 // VMPSADBW
 
 // CHECK: vmpsadbw $123, %xmm24, %xmm23, %xmm22
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index f31c4baada141..f6ba46e8b7742 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -4160,6 +4160,9 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0},
   {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0},
   {X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0},
+  {X86::VDPPHPSZ128r, X86::VDPPHPSZ128m, 0},
+  {X86::VDPPHPSZ256r, X86::VDPPHPSZ256m, 0},
+  {X86::VDPPHPSZr, X86::VDPPHPSZm, 0},
   {X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0},
   {X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0},
   {X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE},
@@ -4883,12 +4886,24 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0},
   {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0},
   {X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0},
+  {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128m, 0},
+  {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256m, 0},
+  {X86::VPDPBSSDSZr, X86::VPDPBSSDSZm, 0},
   {X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0},
   {X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0},
+  {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128m, 0},
+  {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256m, 0},
+  {X86::VPDPBSSDZr, X86::VPDPBSSDZm, 0},
   {X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0},
   {X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0},
+  {X86::VPDPBSUDSZ128r, X86::VPDPBSUDSZ128m, 0},
+  {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256m, 0},
+  {X86::VPDPBSUDSZr, X86::VPDPBSUDSZm, 0},
   {X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0},
   {X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0},
+  {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128m, 0},
+  {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256m, 0},
+  {X86::VPDPBSUDZr, X86::VPDPBSUDZm, 0},
   {X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0},
   {X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0},
   {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0},
@@ -4901,8 +4916,14 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0},
   {X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0},
   {X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0},
+  {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128m, 0},
+  {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256m, 0},
+  {X86::VPDPBUUDSZr, X86::VPDPBUUDSZm, 0},
   {X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0},
   {X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0},
+  {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128m, 0},
+  {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256m, 0},
+  {X86::VPDPBUUDZr, X86::VPDPBUUDZm, 0},
   {X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0},
   {X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0},
   {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0},
@@ -4915,16 +4936,34 @@ static const X86FoldTableEntry Table3[] = {
   {X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0},
   {X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0},
   {X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0},
+  {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128m, 0},
+  {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256m, 0},
+  {X86::VPDPWSUDSZr, X86::VPDPWSUDSZm, 0},
   {X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0},
   {X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0},
+  {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128m, 0},
+  {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256m, 0},
+  {X86::VPDPWSUDZr, X86::VPDPWSUDZm, 0},
   {X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0},
   {X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0},
+  {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128m, 0},
+  {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256m, 0},
+  {X86::VPDPWUSDSZr, X86::VPDPWUSDSZm, 0},
   {X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0},
   {X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0},
+  {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128m, 0},
+  {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256m, 0},
+  {X86::VPDPWUSDZr, X86::VPDPWUSDZm, 0},
   {X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0},
   {X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0},
+  {X86::VPDPWUUDSZ128r, X86::VPDPWUUDSZ128m, 0},
+  {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256m, 0},
+  {X86::VPDPWUUDSZr, X86::VPDPWUUDSZm, 0},
   {X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0},
   {X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0},
+  {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128m, 0},
+  {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256m, 0},
+  {X86::VPDPWUUDZr, X86::VPDPWUUDZm, 0},
   {X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0},
   {X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0},
   {X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0},
@@ -5598,6 +5637,12 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0},
   {X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0},
   {X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0},
+  {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mk, 0},
+  {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mkz, 0},
+  {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mk, 0},
+  {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mkz, 0},
+  {X86::VDPPHPSZrk, X86::VDPPHPSZmk, 0},
+  {X86::VDPPHPSZrkz, X86::VDPPHPSZmkz, 0},
   {X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0},
   {X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0},
   {X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0},
@@ -6181,6 +6226,30 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0},
   {X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0},
   {X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0},
+  {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mk, 0},
+  {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mkz, 0},
+  {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mk, 0},
+  {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mkz, 0},
+  {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmk, 0},
+  {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmkz, 0},
+  {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mk, 0},
+  {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mkz, 0},
+  {X86::VPDPBSSDZ256rk, X86::VPDPBSSDZ256mk, 0},
+  {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mkz, 0},
+  {X86::VPDPBSSDZrk, X86::VPDPBSSDZmk, 0},
+  {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmkz, 0},
+  {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mk, 0},
+  {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mkz, 0},
+  {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mk, 0},
+  {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mkz, 0},
+  {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmk, 0},
+  {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmkz, 0},
+  {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mk, 0},
+  {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mkz, 0},
+  {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mk, 0},
+  {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mkz, 0},
+  {X86::VPDPBSUDZrk, X86::VPDPBSUDZmk, 0},
+  {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmkz, 0},
   {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0},
   {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0},
   {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0},
@@ -6193,6 +6262,18 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0},
   {X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0},
   {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0},
+  {X86::VPDPBUUDSZ128rk, X86::VPDPBUUDSZ128mk, 0},
+  {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mkz, 0},
+  {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mk, 0},
+  {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mkz, 0},
+  {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmk, 0},
+  {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmkz, 0},
+  {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mk, 0},
+  {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mkz, 0},
+  {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mk, 0},
+  {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mkz, 0},
+  {X86::VPDPBUUDZrk, X86::VPDPBUUDZmk, 0},
+  {X86::VPDPBUUDZrkz, X86::VPDPBUUDZmkz, 0},
   {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0},
   {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0},
   {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0},
@@ -6205,6 +6286,42 @@ static const X86FoldTableEntry Table4[] = {
   {X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0},
   {X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0},
   {X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0},
+  {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mk, 0},
+  {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mkz, 0},
+  {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mk, 0},
+  {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mkz, 0},
+  {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmk, 0},
+  {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmkz, 0},
+  {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mk, 0},
+  {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mkz, 0},
+  {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mk, 0},
+  {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mkz, 0},
+  {X86::VPDPWSUDZrk, X86::VPDPWSUDZmk, 0},
+  {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmkz, 0},
+  {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mk, 0},
+  {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mkz, 0},
+  {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mk, 0},
+  {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mkz, 0},
+  {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmk, 0},
+  {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmkz, 0},
+  {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mk, 0},
+  {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mkz, 0},
+  {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mk, 0},
+  {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mkz, 0},
+  {X86::VPDPWUSDZrk, X86::VPDPWUSDZmk, 0},
+  {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmkz, 0},
+  {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mk, 0},
+  {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mkz, 0},
+  {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mk, 0},
+  {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mkz, 0},
+  {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmk, 0},
+  {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmkz, 0},
+  {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mk, 0},
+  {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mkz, 0},
+  {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mk, 0},
+  {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mkz, 0},
+  {X86::VPDPWUUDZrk, X86::VPDPWUUDZmk, 0},
+  {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmkz, 0},
   {X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0},
   {X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0},
   {X86::VPERMBZrrk, X86::VPERMBZrmk, 0},
@@ -7835,6 +7952,9 @@ static const X86FoldTableEntry BroadcastTable3[] = {
   {X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128mb, TB_BCAST_SS},
   {X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256mb, TB_BCAST_SS},
   {X86::VDPBF16PSZr, X86::VDPBF16PSZmb, TB_BCAST_SS},
+  {X86::VDPPHPSZ128r, X86::VDPPHPSZ128mb, TB_BCAST_SS},
+  {X86::VDPPHPSZ256r, X86::VDPPHPSZ256mb, TB_BCAST_SS},
+  {X86::VDPPHPSZr, X86::VDPPHPSZmb, TB_BCAST_SS},
   {X86::VEXP2PDZrk, X86::VEXP2PDZmbk, TB_BCAST_SD},
   {X86::VEXP2PSZrk, X86::VEXP2PSZmbk, TB_BCAST_SS},
   {X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128mb, TB_BCAST_SS},
@@ -8158,18 +8278,54 @@ static const X86FoldTableEntry BroadcastTable3[] = {
   {X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmbk, TB_BCAST_Q},
   {X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmbk, TB_BCAST_Q},
   {X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmbk, TB_BCAST_Q},
+  {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128mb, TB_BCAST_D},
+  {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256mb, TB_BCAST_D},
+  {X86::VPDPBSSDSZr, X86::VPDPBSSDSZmb, TB_BCAST_D},
+  {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128mb, TB_BCAST_D},
+  {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256mb, TB_BCAST_D},
+  {X86::VPDPBSSDZr, X86::VPDPBSSDZmb, TB_BCAST_D},
+  {X86::VPDPBSUDSZ128r, X86::VPDPBSUDSZ128mb, TB_BCAST_D},
+  {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256mb, TB_BCAST_D},
+  {X86::VPDPBSUDSZr, X86::VPDPBSUDSZmb, TB_BCAST_D},
+  {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128mb, TB_BCAST_D},
+  {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256mb, TB_BCAST_D},
+  {X86::VPDPBSUDZr, X86::VPDPBSUDZmb, TB_BCAST_D},
   {X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_D},
   {X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_D},
   {X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_D},
   {X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128mb, TB_BCAST_D},
   {X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256mb, TB_BCAST_D},
   {X86::VPDPBUSDZr, X86::VPDPBUSDZmb, TB_BCAST_D},
+  {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128mb, TB_BCAST_D},
+  {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256mb, TB_BCAST_D},
+  {X86::VPDPBUUDSZr, X86::VPDPBUUDSZmb, TB_BCAST_D},
+  {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128mb, TB_BCAST_D},
+  {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256mb, TB_BCAST_D},
+  {X86::VPDPBUUDZr, X86::VPDPBUUDZmb, TB_BCAST_D},
   {X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_D},
   {X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_D},
   {X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_D},
   {X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128mb, TB_BCAST_D},
   {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256mb, TB_BCAST_D},
   {X86::VPDPWSSDZr, X86::VPDPWSSDZmb, TB_BCAST_D},
+  {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128mb, TB_BCAST_D},
+  {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256mb, TB_BCAST_D},
+  {X86::VPDPWSUDSZr, X86::VPDPWSUDSZmb, TB_BCAST_D},
+  {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128mb, TB_BCAST_D},
+  {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256mb, TB_BCAST_D},
+  {X86::VPDPWSUDZr, X86::VPDPWSUDZmb, TB_BCAST_D},
+  {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128mb, TB_BCAST_D},
+  {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256mb, TB_BCAST_D},
+  {X86::VPDPWUSDSZr, X86::VPDPWUSDSZmb, TB_BCAST_D},
+  {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128mb, TB_BCAST_D},
+  {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256mb, TB_BCAST_D},
+  {X86::VPDPWUSDZr, X86::VPDPWUSDZmb, TB_BCAST_D},
+  {X86::VPDPWUUDSZ128r, X86::VPDPWUUDSZ128mb, TB_BCAST_D},
+  {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256mb, TB_BCAST_D},
+  {X86::VPDPWUUDSZr, X86::VPDPWUUDSZmb, TB_BCAST_D},
+  {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128mb, TB_BCAST_D},
+  {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256mb, TB_BCAST_D},
+  {X86::VPDPWUUDZr, X86::VPDPWUUDZmb, TB_BCAST_D},
   {X86::VPERMDZ256rrkz, X86::VPERMDZ256rmbkz, TB_BCAST_D},
   {X86::VPERMDZrrkz, X86::VPERMDZrmbkz, TB_BCAST_D},
   {X86::VPERMI2DZ128rr, X86::VPERMI2DZ128rmb, TB_BCAST_D},
@@ -8563,6 +8719,12 @@ static const X86FoldTableEntry BroadcastTable4[] = {
   {X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mbkz, TB_BCAST_SS},
   {X86::VDPBF16PSZrk, X86::VDPBF16PSZmbk, TB_BCAST_SS},
   {X86::VDPBF16PSZrkz, X86::VDPBF16PSZmbkz, TB_BCAST_SS},
+  {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mbk, TB_BCAST_SS},
+  {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mbkz, TB_BCAST_SS},
+  {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mbk, TB_BCAST_SS},
+  {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mbkz, TB_BCAST_SS},
+  {X86::VDPPHPSZrk, X86::VDPPHPSZmbk, TB_BCAST_SS},
+  {X86::VDPPHPSZrkz, X86::VDPPHPSZmbkz, TB_BCAST_SS},
   {X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mbk, TB_BCAST_SS},
   {X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mbkz, TB_BCAST_SS},
   {X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mbk, TB_BCAST_SS},
@@ -8998,6 +9160,30 @@ static const X86FoldTableEntry BroadcastTable4[] = {
   {X86::VPANDQZ128rrk, X86::VPANDQZ128rmbk, TB_BCAST_Q},
   {X86::VPANDQZ256rrk, X86::VPANDQZ256rmbk, TB_BCAST_Q},
   {X86::VPANDQZrrk, X86::VPANDQZrmbk, TB_BCAST_Q},
+  {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmbk, TB_BCAST_D},
+  {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmbkz, TB_BCAST_D},
+  {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mbk, TB_BCAST_D},
+  {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBSSDZ256rk, X86::VPDPBSSDZ256mbk, TB_BCAST_D},
+  {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBSSDZrk, X86::VPDPBSSDZmbk, TB_BCAST_D},
+  {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmbkz, TB_BCAST_D},
+  {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmbk, TB_BCAST_D},
+  {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmbkz, TB_BCAST_D},
+  {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mbk, TB_BCAST_D},
+  {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mbk, TB_BCAST_D},
+  {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBSUDZrk, X86::VPDPBSUDZmbk, TB_BCAST_D},
+  {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmbkz, TB_BCAST_D},
   {X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_D},
   {X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_D},
   {X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_D},
@@ -9010,6 +9196,18 @@ static const X86FoldTableEntry BroadcastTable4[] = {
   {X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mbkz, TB_BCAST_D},
   {X86::VPDPBUSDZrk, X86::VPDPBUSDZmbk, TB_BCAST_D},
   {X86::VPDPBUSDZrkz, X86::VPDPBUSDZmbkz, TB_BCAST_D},
+  {X86::VPDPBUUDSZ128rk, X86::VPDPBUUDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmbk, TB_BCAST_D},
+  {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmbkz, TB_BCAST_D},
+  {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mbk, TB_BCAST_D},
+  {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mbk, TB_BCAST_D},
+  {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPBUUDZrk, X86::VPDPBUUDZmbk, TB_BCAST_D},
+  {X86::VPDPBUUDZrkz, X86::VPDPBUUDZmbkz, TB_BCAST_D},
   {X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_D},
   {X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_D},
   {X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_D},
@@ -9022,6 +9220,42 @@ static const X86FoldTableEntry BroadcastTable4[] = {
   {X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mbkz, TB_BCAST_D},
   {X86::VPDPWSSDZrk, X86::VPDPWSSDZmbk, TB_BCAST_D},
   {X86::VPDPWSSDZrkz, X86::VPDPWSSDZmbkz, TB_BCAST_D},
+  {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmbk, TB_BCAST_D},
+  {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmbkz, TB_BCAST_D},
+  {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mbk, TB_BCAST_D},
+  {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mbk, TB_BCAST_D},
+  {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWSUDZrk, X86::VPDPWSUDZmbk, TB_BCAST_D},
+  {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmbkz, TB_BCAST_D},
+  {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmbk, TB_BCAST_D},
+  {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmbkz, TB_BCAST_D},
+  {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mbk, TB_BCAST_D},
+  {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mbk, TB_BCAST_D},
+  {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWUSDZrk, X86::VPDPWUSDZmbk, TB_BCAST_D},
+  {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmbkz, TB_BCAST_D},
+  {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mbk, TB_BCAST_D},
+  {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mbk, TB_BCAST_D},
+  {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmbk, TB_BCAST_D},
+  {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmbkz, TB_BCAST_D},
+  {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mbk, TB_BCAST_D},
+  {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mbkz, TB_BCAST_D},
+  {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mbk, TB_BCAST_D},
+  {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mbkz, TB_BCAST_D},
+  {X86::VPDPWUUDZrk, X86::VPDPWUUDZmbk, TB_BCAST_D},
+  {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmbkz, TB_BCAST_D},
   {X86::VPERMDZ256rrk, X86::VPERMDZ256rmbk, TB_BCAST_D},
   {X86::VPERMDZrrk, X86::VPERMDZrmbk, TB_BCAST_D},
   {X86::VPERMI2DZ128rrk, X86::VPERMI2DZ128rmbk, TB_BCAST_D},
diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
index f967344135553..60b1a48721653 100644
--- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
+++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
@@ -242,7 +242,8 @@ void X86InstrMappingEmitter::emitCompressEVEXTable(
     auto It = llvm::find_if(Predicates, [](const Record *R) {
       StringRef Name = R->getName();
       return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" ||
-             Name == "HasAVXIFMA";
+             Name == "HasAVXIFMA" || Name == "HasAVXVNNIINT8" ||
+             Name == "HasAVXVNNIINT16";
     });
     if (It != Predicates.end())
       PredicateInsts[(*It)->getValueAsString("CondString")].push_back(NewInst);



More information about the cfe-commits mailing list