[clang] b032920 - [X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (#101783)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Aug 5 03:57:49 PDT 2024
Author: Phoebe Wang
Date: 2024-08-05T18:57:42+08:00
New Revision: b0329206db8e66fe180c504115103b27ca50f64e
URL: https://github.com/llvm/llvm-project/commit/b0329206db8e66fe180c504115103b27ca50f64e
DIFF: https://github.com/llvm/llvm-project/commit/b0329206db8e66fe180c504115103b27ca50f64e.diff
LOG: [X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (#101783)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
Added:
Modified:
clang/include/clang/Basic/BuiltinsX86.def
clang/lib/Headers/avx10_2_512niintrin.h
clang/lib/Headers/avx10_2niintrin.h
clang/lib/Headers/avxvnniint16intrin.h
clang/lib/Headers/avxvnniint8intrin.h
clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
clang/test/CodeGen/X86/avx10_2ni-builtins.c
clang/test/CodeGen/X86/avxvnniint16-builtins.c
clang/test/CodeGen/X86/avxvnniint8-builtins.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrAVX10.td
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrSSE.td
llvm/lib/Target/X86/X86IntrinsicsInfo.h
llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
llvm/test/MC/X86/avx10_2ni-32-intel.s
llvm/test/MC/X86/avx10_2ni-64-att.s
llvm/test/TableGen/x86-fold-tables.inc
llvm/utils/TableGen/X86InstrMappingEmitter.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index c49b5c36da4fc..55551f688c14b 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -773,18 +773,18 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512v
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")
// AVX-VNNI-INT8
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
-TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
@@ -1959,6 +1959,27 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX10.2 VNNI FP16
+TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 VNNI INT8
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 VNNI INT16
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
+
// AVX10.2 VMPSADBW
TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
@@ -2088,18 +2109,18 @@ TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx
TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
// AVX-VNNI-INT16
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
-TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
// AVX-NE-CONVERT
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index 5ad6993b45433..7e614f7740bff 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -16,6 +16,35 @@
#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
+ __min_vector_width__(512)))
+
+/* VNNI FP16 */
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W,
+ __m512h __A,
+ __m512h __B) {
+ return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
+ (__v32hf)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W,
+ __mmask16 __U,
+ __m512h __A,
+ __m512h __B) {
+ return (__m512)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
+ __m512 __W,
+ __m512h __A,
+ __m512h __B) {
+ return (__m512)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
@@ -31,5 +60,255 @@
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
(__v32hi)_mm512_setzero_si512()))
+/* VNNI INT8 */
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
+ __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
+ __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
+ __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
+ __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
+ __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+/* VNNI INT16 */
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
+ __m512i __B,
+ __m512i __C) {
+ return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
+ __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+#undef __DEFAULT_FN_ATTRS
+
#endif /* __SSE2__ */
#endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
index 42b24d2b5b18f..c91a7b57c7527 100644
--- a/clang/lib/Headers/avx10_2niintrin.h
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -15,6 +15,58 @@
#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
+ __min_vector_width__(256)))
+
+/* VNNI FP16 */
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
+ (__v8hf)__B);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
+ __m128 __W,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_dpph_ps(__W, __A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
+ __m256h __A,
+ __m256h __B) {
+ return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
+ (__v16hf)__B);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
+ return (__m256)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
((__m128i)__builtin_ia32_selectw_128( \
@@ -36,6 +88,320 @@
(__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256()))
+/* VNNI INT8 */
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
+ __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
+ __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
+ __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+/* VNNI INT16 */
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
+ __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
+ __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
+ __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
/* YMM Rounding */
#define _mm256_add_round_pd(A, B, R) \
((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
@@ -1702,5 +2068,8 @@
(__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
(__v8sf)_mm256_setzero_ps()))
+#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128
+
#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
index e4d342a8b45b1..805d249911c17 100644
--- a/clang/lib/Headers/avxvnniint16intrin.h
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -15,14 +15,6 @@
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
- __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
- __min_vector_width__(256)))
-
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@@ -53,12 +45,9 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwsud_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpwsud_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwsuds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpwsuds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwusd_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpwusd_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -275,12 +252,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwusds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpwusds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwuud_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpwuud_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpwuuds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
+#define _mm256_dpwuuds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
#endif // __AVXVNNIINT16INTRIN_H
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
index b0b6cb853f713..c211620c68f07 100644
--- a/clang/lib/Headers/avxvnniint8intrin.h
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -14,14 +14,6 @@
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
- __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
- __min_vector_width__(128)))
-
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
@@ -52,12 +44,9 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbssd_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpbssd_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbssds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
@@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpbssds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbsud_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpbsud_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbsuds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpbsuds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbuud_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
+#define _mm256_dpbuud_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
@@ -422,14 +389,10 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
- __m128i __A,
- __m128i __B) {
- return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
- (__v4si)__B);
-}
+#define _mm_dpbuuds_epi32(__W, __A, __B) \
+ ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \
+ (__v4si)(__B)))
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
@@ -460,12 +423,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
- (__v8si)__B);
-}
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
+#define _mm256_dpbuuds_epi32(__W, __A, __B) \
+ ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \
+ (__v8si)(__B)))
#endif // __AVXVNNIINT8INTRIN_H
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
index b7982e6ecca84..26e0d124c8284 100644
--- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -3,6 +3,28 @@
#include <immintrin.h>
+// VNNI FP16
+__m512 test_mm512_dpph_ps(__m512 __W, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+ return _mm512_dpph_ps(__W, __A, __B);
+}
+
+__m512 test_mm512_mask_dpph_ps(__m512 __W, __mmask16 __U, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_mask_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_dpph_ps(__W, __U, __A, __B);
+}
+
+__m512 test_mm512_maskz_dpph_ps(__mmask16 __U, __m512 __W, __m512h __A, __m512h __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpph_ps(
+// CHECK: call <16 x float> @llvm.x86.avx10.vdpphps.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_maskz_dpph_ps(__U, __W, __A, __B);
+}
+
// VMPSADBW
__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mpsadbw_epu8
@@ -23,3 +45,257 @@ __m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
}
+
+// VNNI INT8
+__m512i test_mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+ return _mm512_dpbssd_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbssd_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbssd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssd.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+ return _mm512_dpbssds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbssds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbssds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbssds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbssds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+ return _mm512_dpbsud_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbsud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsud.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+ return _mm512_dpbsuds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbsuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbsuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+ return _mm512_dpbuud_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbuud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuud.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m512i test_mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+ return _mm512_dpbuuds_epi32(__W, __A, __B);
+}
+
+__m512i test_mm512_mask_dpbuuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_dpbuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+// VNNI INT16
+__m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwsud_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwusd_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwusds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwuud_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
+// CHECK-LABEL: @test_mm512_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+ return _mm512_dpwuuds_epi32(__A, __B, __C);
+}
+
+__m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m512i test_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+// CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32(
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: zeroinitializer
+// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
index ace3b7e30c7f6..d06a008c09e71 100644
--- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -3,6 +3,49 @@
#include <immintrin.h>
+// VNNI FP16
+__m128 test_mm_dpph_ps(__m128 __W, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+ return _mm_dpph_ps(__W, __A, __B);
+}
+
+__m128 test_mm_mask_dpph_ps(__m128 __W, __mmask8 __U, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_mask_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_mask_dpph_ps(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_dpph_ps(__mmask8 __U, __m128 __W, __m128h __A, __m128h __B) {
+// CHECK-LABEL: @test_mm_maskz_dpph_ps(
+// CHECK: call <4 x float> @llvm.x86.avx10.vdpphps.128
+// CHECK: zeroinitializer
+// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+ return _mm_maskz_dpph_ps(__U, __W, __A, __B);
+}
+
+__m256 test_mm256_dpph_ps(__m256 __W, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+ return _mm256_dpph_ps(__W, __A, __B);
+}
+
+__m256 test_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_dpph_ps(__W, __U, __A, __B);
+}
+
+__m256 test_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpph_ps(
+// CHECK: call <8 x float> @llvm.x86.avx10.vdpphps.256
+// CHECK: zeroinitializer
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_maskz_dpph_ps(__U, __W, __A, __B);
+}
+
// VMPSADBW
__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mpsadbw_epu8
@@ -44,6 +87,344 @@ __m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
}
+// VNNI INT8
+__m128i test_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m128i test_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m128i test_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.128
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbssd_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbssd_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssd.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbssd_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbssds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbssds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbssds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbssds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbssds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbsud_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbsud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbsud_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbsuds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbsuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbsuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbsuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbsuds_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbuud_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbuud_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuud.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbuud_epi32(__U, __W, __A, __B);
+}
+
+__m256i test_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpbuuds_epi32(__W, __U, __A, __B);
+}
+
+__m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_dpbuuds_epi32
+// CHECK: @llvm.x86.avx2.vpdpbuuds.256
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpbuuds_epi32(__U, __W, __A, __B);
+}
+
+// VNNI INT16
+__m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwsud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwsud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwsud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwsud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwsuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwusd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwusd_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwusd_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwusds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwusds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwusds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwuud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwuud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwuud_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_mask_dpwuuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m128i test_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+// CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ return _mm_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
+__m256i test_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+// CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ return _mm256_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+}
+
// YMM Rounding
__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_add_round_pd
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
index a10ca551a1514..f9feaea1e244d 100644
--- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
#include <immintrin.h>
diff --git a/clang/test/CodeGen/X86/avxvnniint8-builtins.c b/clang/test/CodeGen/X86/avxvnniint8-builtins.c
index cbdf443888a15..80d005c16d387 100644
--- a/clang/test/CodeGen/X86/avxvnniint8-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint8-builtins.c
@@ -1,5 +1,7 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avx10.2-256 -emit-llvm -o - -Wall -Werror | FileCheck %s
#include <immintrin.h>
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 7160c8dfa7600..eb2cb3fbfce8e 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -4980,6 +4980,85 @@ let TargetPrefix = "x86" in {
//===----------------------------------------------------------------------===//
// AVX10.2 intrinsics
let TargetPrefix = "x86" in {
+ // VNNI FP16
+ def int_x86_avx10_vdpphps_128 :
+ ClangBuiltin<"__builtin_ia32_vdpphps128">,
+ DefaultAttrsIntrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v8f16_ty, llvm_v8f16_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vdpphps_256 :
+ ClangBuiltin<"__builtin_ia32_vdpphps256">,
+ DefaultAttrsIntrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v16f16_ty, llvm_v16f16_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vdpphps_512 :
+ ClangBuiltin<"__builtin_ia32_vdpphps512">,
+ DefaultAttrsIntrinsic<[llvm_v16f32_ty],
+ [llvm_v16f32_ty, llvm_v32f16_ty, llvm_v32f16_ty],
+ [IntrNoMem]>;
+ // VNNI INT8
+ def int_x86_avx10_vpdpbssd_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbssd512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpbssds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbssds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpbsud_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbsud512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpbsuds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbsuds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpbuud_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbuud512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpbuuds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpbuuds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ // VNNI INT16
+ def int_x86_avx10_vpdpwsud_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwsud512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpwsuds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwsuds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpwusd_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwusd512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpwusds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwusds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpwuud_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwuud512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx10_vpdpwuuds_512 :
+ ClangBuiltin<"__builtin_ia32_vpdpwuuds512">,
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty],
+ [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+ [IntrNoMem]>;
+
// VMPSADBW
def int_x86_avx10_vmpsadbw_512 :
ClangBuiltin<"__builtin_ia32_mpsadbw512">,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fe1865409a265..fff65a1bd967c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34033,6 +34033,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVTNEPS2BF16)
NODE_NAME_CASE(MCVTNEPS2BF16)
NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(DPFP16PS)
NODE_NAME_CASE(MPSADBW)
NODE_NAME_CASE(LWPINS)
NODE_NAME_CASE(MGATHER)
@@ -34058,6 +34059,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VPDPBUUDS)
NODE_NAME_CASE(VPDPBSSD)
NODE_NAME_CASE(VPDPBSSDS)
+ NODE_NAME_CASE(VPDPWSUD)
+ NODE_NAME_CASE(VPDPWSUDS)
+ NODE_NAME_CASE(VPDPWUSD)
+ NODE_NAME_CASE(VPDPWUSDS)
+ NODE_NAME_CASE(VPDPWUUD)
+ NODE_NAME_CASE(VPDPWUUDS)
NODE_NAME_CASE(VMINMAX)
NODE_NAME_CASE(VMINMAX_SAE)
NODE_NAME_CASE(VMINMAXS)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 7642a528fb22e..b985f7529ea2a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -595,6 +595,13 @@ namespace llvm {
VPDPBSSD,
VPDPBSSDS,
+ VPDPWSUD,
+ VPDPWSUDS,
+ VPDPWUSD,
+ VPDPWUSDS,
+ VPDPWUUD,
+ VPDPWUUDS,
+
VMINMAX,
VMINMAX_SAE,
VMINMAXS,
@@ -661,9 +668,10 @@ namespace llvm {
// SRC, PASSTHRU, MASK
MCVTNEPS2BF16,
- // Dot product of BF16 pairs to accumulated into
+ // Dot product of BF16/FP16 pairs accumulated into
// packed single precision.
DPBF16PS,
+ DPFP16PS,
// A stack checking function call. On Windows it's _chkstk call.
DYN_ALLOCA,
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 920317ded15c6..8e4586f2002d9 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -12,6 +12,40 @@
//
//===----------------------------------------------------------------------===//
+// VNNI FP16
+let ExeDomain = SSEPackedSingle in
+defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info,
+ [HasAVX10_2], [HasAVX10_2_512]>,
+ T8, PS, EVEX_CD8<32, CD8VF>;
+
+// VNNI INT8
+defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, XD;
+defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, XD;
+defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, PS;
+defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, PS;
+
+// VNNI INT16
+defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, XS;
+defm VPDPWUSD : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, PD;
+defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0,
+ [HasAVX10_2], [HasAVX10_2_512]>, PD;
+defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, PS;
+defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1,
+ [HasAVX10_2], [HasAVX10_2_512]>, PS;
+
// VMPSADBW
defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW,
avx512vl_i16_info, avx512vl_i8_info,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index f9b8cb689694e..e616a8a37c648 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12390,13 +12390,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
(VTI.VT (OpNode VTI.RC:$src1,
VTI.RC:$src2, VTI.RC:$src3)),
IsCommutable, IsCommutable>,
- EVEX, VVVV, T8, PD, Sched<[sched]>;
+ EVEX, VVVV, T8, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, PD,
+ EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8,
Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -12406,17 +12406,18 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B,
- T8, PD, Sched<[sched.Folded, sched.ReadAfterFold,
+ T8, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
- X86SchedWriteWidths sched, bit IsCommutable> {
- let Predicates = [HasVNNI] in
+ X86SchedWriteWidths sched, bit IsCommutable,
+ list<Predicate> prds, list<Predicate> prds512> {
+ let Predicates = prds512 in
defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info,
IsCommutable>, EVEX_V512;
- let Predicates = [HasVNNI, HasVLX] in {
+ let Predicates = prds in {
defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info,
IsCommutable>, EVEX_V256;
defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info,
@@ -12425,10 +12426,14 @@ multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
}
// FIXME: Is there a better scheduler class for VPDP?
-defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>;
-defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>;
-defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
-defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0,
+ [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0,
+ [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1,
+ [HasVNNI, HasVLX], [HasVNNI]>, PD;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1,
+ [HasVNNI, HasVLX], [HasVNNI]>, PD;
// Patterns to match VPDPWSSD from existing instructions/intrinsics.
let Predicates = [HasVNNI] in {
@@ -12806,9 +12811,9 @@ let Predicates = [HasBF16] in {
}
let Constraints = "$src1 = $dst" in {
-multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
- X86VectorVTInfo _, X86VectorVTInfo src_v> {
+multiclass avx512_dpf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _, X86VectorVTInfo src_v> {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins src_v.RC:$src2, src_v.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -12834,25 +12839,25 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
} // Constraints = "$src1 = $dst"
-multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
- AVX512VLVectorVTInfo src_v, Predicate prd> {
- let Predicates = [prd] in {
- defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512,
- src_v.info512>, EVEX_V512;
+multiclass avx512_dpf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _, list<Predicate> prds,
+ list<Predicate> prds512> {
+ let Predicates = prds512 in {
+ defm Z : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAZ,
+ avx512vl_f32_info.info512, _.info512>, EVEX_V512;
}
- let Predicates = [HasVLX, prd] in {
- defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256,
- src_v.info256>, EVEX_V256;
- defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128,
- src_v.info128>, EVEX_V128;
+ let Predicates = prds in {
+ defm Z256 : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAY,
+ v8f32x_info, _.info256>, EVEX_V256;
+ defm Z128 : avx512_dpf16ps_rm<opc, OpcodeStr, OpNode, WriteFMAX,
+ v4f32x_info, _.info128>, EVEX_V128;
}
}
let ExeDomain = SSEPackedSingle in
-defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
- avx512vl_f32_info, avx512vl_bf16_info,
- HasBF16>, T8, XS, EVEX_CD8<32, CD8VF>;
+defm VDPBF16PS : avx512_dpf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, avx512vl_bf16_info,
+ [HasVLX, HasBF16], [HasBF16]>,
+ T8, XS, EVEX_CD8<32, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX512FP16
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11b75240b2504..78c76cacbfef3 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -802,6 +802,11 @@ def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS",
SDTCisSameAs<0,1>,
SDTCVecEltisVT<2, bf16>,
SDTCisSameAs<2,3>]>>;
+def X86dpfp16ps : SDNode<"X86ISD::DPFP16PS",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+ SDTCisSameAs<0,1>,
+ SDTCVecEltisVT<2, f16>,
+ SDTCisSameAs<2,3>]>>;
// galois field arithmetic
def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
@@ -819,6 +824,13 @@ def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>;
def X86vpdpbuud : SDNode<"X86ISD::VPDPBUUD", SDTVnni>;
def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>;
+def X86vpdpwsud : SDNode<"X86ISD::VPDPWSUD", SDTVnni>;
+def X86vpdpwsuds : SDNode<"X86ISD::VPDPWSUDS", SDTVnni>;
+def X86vpdpwusd : SDNode<"X86ISD::VPDPWUSD", SDTVnni>;
+def X86vpdpwusds : SDNode<"X86ISD::VPDPWUSDS", SDTVnni>;
+def X86vpdpwuud : SDNode<"X86ISD::VPDPWUUD", SDTVnni>;
+def X86vpdpwuuds : SDNode<"X86ISD::VPDPWUUDS", SDTVnni>;
+
def X86Vmpsadbw : SDNode<"X86ISD::MPSADBW", SDTX86PSADBW>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7fc786b1e570b..7e2e97d387a83 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2953,6 +2953,42 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPDPBUUDSYrr:
case X86::VPDPBUUDrr:
case X86::VPDPBUUDYrr:
+ case X86::VPDPBSSDSZ128r:
+ case X86::VPDPBSSDSZ128rk:
+ case X86::VPDPBSSDSZ128rkz:
+ case X86::VPDPBSSDSZ256r:
+ case X86::VPDPBSSDSZ256rk:
+ case X86::VPDPBSSDSZ256rkz:
+ case X86::VPDPBSSDSZr:
+ case X86::VPDPBSSDSZrk:
+ case X86::VPDPBSSDSZrkz:
+ case X86::VPDPBSSDZ128r:
+ case X86::VPDPBSSDZ128rk:
+ case X86::VPDPBSSDZ128rkz:
+ case X86::VPDPBSSDZ256r:
+ case X86::VPDPBSSDZ256rk:
+ case X86::VPDPBSSDZ256rkz:
+ case X86::VPDPBSSDZr:
+ case X86::VPDPBSSDZrk:
+ case X86::VPDPBSSDZrkz:
+ case X86::VPDPBUUDSZ128r:
+ case X86::VPDPBUUDSZ128rk:
+ case X86::VPDPBUUDSZ128rkz:
+ case X86::VPDPBUUDSZ256r:
+ case X86::VPDPBUUDSZ256rk:
+ case X86::VPDPBUUDSZ256rkz:
+ case X86::VPDPBUUDSZr:
+ case X86::VPDPBUUDSZrk:
+ case X86::VPDPBUUDSZrkz:
+ case X86::VPDPBUUDZ128r:
+ case X86::VPDPBUUDZ128rk:
+ case X86::VPDPBUUDZ128rkz:
+ case X86::VPDPBUUDZ256r:
+ case X86::VPDPBUUDZ256rk:
+ case X86::VPDPBUUDZ256rkz:
+ case X86::VPDPBUUDZr:
+ case X86::VPDPBUUDZrk:
+ case X86::VPDPBUUDZrkz:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:
@@ -2971,6 +3007,24 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPDPWSSDSZr:
case X86::VPDPWSSDSZrk:
case X86::VPDPWSSDSZrkz:
+ case X86::VPDPWUUDZ128r:
+ case X86::VPDPWUUDZ128rk:
+ case X86::VPDPWUUDZ128rkz:
+ case X86::VPDPWUUDZ256r:
+ case X86::VPDPWUUDZ256rk:
+ case X86::VPDPWUUDZ256rkz:
+ case X86::VPDPWUUDZr:
+ case X86::VPDPWUUDZrk:
+ case X86::VPDPWUUDZrkz:
+ case X86::VPDPWUUDSZ128r:
+ case X86::VPDPWUUDSZ128rk:
+ case X86::VPDPWUUDSZ128rkz:
+ case X86::VPDPWUUDSZ256r:
+ case X86::VPDPWUUDSZ256rk:
+ case X86::VPDPWUUDSZ256rkz:
+ case X86::VPDPWUUDSZr:
+ case X86::VPDPWUUDSZrk:
+ case X86::VPDPWUUDSZrkz:
case X86::VPMADD52HUQrr:
case X86::VPMADD52HUQYrr:
case X86::VPMADD52HUQZ128r:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 2fc3b6aa98858..5f9211edfa161 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8425,46 +8425,41 @@ defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS,
defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV;
defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV;
-let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
-multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
- let isCommutable = IsCommutable in
- def rr : I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
- VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
-
- def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
- VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
- VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
-
- let isCommutable = IsCommutable in
- def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst,
- (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
- VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
-
- def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst,
- (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
- VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
- VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+let Predicates = [HasAVXVNNIINT16] in {
+ defm VPDPWSUD : avx_dotprod_rm<0xd2,"vpdpwsud", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwsud, SchedWriteVecIMul.XMM,
+ 0>, T8, XS;
+ defm VPDPWSUDY : avx_dotprod_rm<0xd2,"vpdpwsud", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwsud, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8, XS;
+ defm VPDPWSUDS : avx_dotprod_rm<0xd3,"vpdpwsuds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwsuds, SchedWriteVecIMul.XMM,
+ 0>, T8, XS;
+ defm VPDPWSUDSY : avx_dotprod_rm<0xd3,"vpdpwsuds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwsuds, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8, XS;
+ defm VPDPWUSD : avx_dotprod_rm<0xd2,"vpdpwusd", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwusd, SchedWriteVecIMul.XMM,
+ 0>, T8, PD;
+ defm VPDPWUSDY : avx_dotprod_rm<0xd2,"vpdpwusd", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwusd, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8, PD;
+ defm VPDPWUSDS : avx_dotprod_rm<0xd3,"vpdpwusds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwusds, SchedWriteVecIMul.XMM,
+ 0>, T8, PD;
+ defm VPDPWUSDSY : avx_dotprod_rm<0xd3,"vpdpwusds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwusds, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8, PD;
+ defm VPDPWUUD : avx_dotprod_rm<0xd2,"vpdpwuud", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwuud, SchedWriteVecIMul.XMM,
+ 1>, T8;
+ defm VPDPWUUDY : avx_dotprod_rm<0xd2,"vpdpwuud", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwuud, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8;
+ defm VPDPWUUDS : avx_dotprod_rm<0xd3,"vpdpwuuds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpwuuds, SchedWriteVecIMul.XMM,
+ 1>, T8;
+ defm VPDPWUUDSY : avx_dotprod_rm<0xd3,"vpdpwuuds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpwuuds, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8;
}
-
-defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS;
-defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS;
-defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD;
-defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD;
-defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8;
-defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index a7473e495330b..536391da295dd 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -552,6 +552,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx10_vdivps256, INTR_TYPE_2OP, ISD::FDIV,
X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx10_vdpphps_128, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
+ X86_INTRINSIC_DATA(avx10_vdpphps_256, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
+ X86_INTRINSIC_DATA(avx10_vdpphps_512, INTR_TYPE_3OP, X86ISD::DPFP16PS, 0),
X86_INTRINSIC_DATA(avx10_vfmaddpd256, INTR_TYPE_3OP, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx10_vfmaddph256, INTR_TYPE_3OP, ISD::FMA,
@@ -590,6 +593,24 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx10_vmulps256, INTR_TYPE_2OP, ISD::FMUL,
X86ISD::FMUL_RND),
+ X86_INTRINSIC_DATA(avx10_vpdpbssd_512, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpbssds_512, INTR_TYPE_3OP, X86ISD::VPDPBSSDS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vpdpbsud_512, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpbsuds_512, INTR_TYPE_3OP, X86ISD::VPDPBSUDS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vpdpbuud_512, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpbuuds_512, INTR_TYPE_3OP, X86ISD::VPDPBUUDS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwsud_512, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwsuds_512, INTR_TYPE_3OP, X86ISD::VPDPWSUDS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwusd_512, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwusds_512, INTR_TYPE_3OP, X86ISD::VPDPWUSDS,
+ 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwuud_512, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+ X86_INTRINSIC_DATA(avx10_vpdpwuuds_512, INTR_TYPE_3OP, X86ISD::VPDPWUUDS,
+ 0),
X86_INTRINSIC_DATA(avx10_vsqrtpd256, INTR_TYPE_1OP, ISD::FSQRT,
X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx10_vsqrtph256, INTR_TYPE_1OP, ISD::FSQRT,
@@ -662,6 +683,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwsud_128, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwsud_256, INTR_TYPE_3OP, X86ISD::VPDPWSUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwsuds_128, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwsuds_256, INTR_TYPE_3OP, X86ISD::VPDPWSUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwusd_128, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwusd_256, INTR_TYPE_3OP, X86ISD::VPDPWUSD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwusds_128, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwusds_256, INTR_TYPE_3OP, X86ISD::VPDPWUSDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwuud_128, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwuud_256, INTR_TYPE_3OP, X86ISD::VPDPWUUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwuuds_128, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpwuuds_256, INTR_TYPE_3OP, X86ISD::VPDPWUUDS, 0),
X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD,
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index bafa52a2a83ae..07e86cb01e133 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -1,6 +1,389 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+
+; VNNI FP16
+
+define <16 x float> @test_mm512_dpph_ps(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B) {
+; CHECK-LABEL: test_mm512_dpph_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vdpphps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U, <32 x half> %__A, <32 x half> %__B) {
+; X86-LABEL: test_mm512_mask_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> %__W
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__W, <32 x half> %__A, <32 x half> %__B) {
+; X86-LABEL: test_mm512_maskz_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float> %__W, <32 x half> %__A, <32 x half> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x float> %dph, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>)
+
+; VNNI INT8
+
+define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbssd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpbssd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbssd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpbssd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbssds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbssds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbssd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbssd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpbsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpbsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpbuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpbuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpbuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpbuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpbuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpbuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpbuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpbuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+; VNNI INT16
+
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpwsud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwusd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpwusd (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwusd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwusd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwusds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwusds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwusd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwusd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+; X86-LABEL: test_mm512_dpwuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpdpwuud (%eax), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x00]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_dpwuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+ %__B = load <16 x i32>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_mask_dpwuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_mask_dpwuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+; X86-LABEL: test_mm512_maskz_dpwuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm512_maskz_dpwuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %bst = bitcast i16 %__U to <16 x i1>
+ %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
; VMPSADBW
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 34d740302d744..31cec891c4cf3 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -2,6 +2,569 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; VNNI FP16
+
+define <4 x float> @test_mm_dpph_ps(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B) {
+; CHECK-LABEL: test_mm_dpph_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vdpphps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x half> %__A, <8 x half> %__B) {
+; X86-LABEL: test_mm_mask_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> %__W
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_maskz_dpph_ps(i8 zeroext %__U, <4 x float> %__W, <8 x half> %__A, <8 x half> %__B) {
+; X86-LABEL: test_mm_maskz_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float> %__W, <8 x half> %__A, <8 x half> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %ext = shufflevector <8 x i1> %bst, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %ext, <4 x float> %dph, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_dpph_ps(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B) {
+; CHECK-LABEL: test_mm256_dpph_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vdpphps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <16 x half> %__A, <16 x half> %__B) {
+; X86-LABEL: test_mm256_mask_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> %__W
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_maskz_dpph_ps(i8 zeroext %__U, <8 x float> %__W, <16 x half> %__A, <16 x half> %__B) {
+; X86-LABEL: test_mm256_maskz_dpph_ps:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpph_ps:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vdpphps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dph = tail call <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float> %__W, <16 x half> %__A, <16 x half> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x float> %dph, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x half>)
+declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>)
+
+; VNNI INT8
+
+define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbssd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbssd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbssds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbssds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbssds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbssds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbssd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbssd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpbuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpbuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpbuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpbuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpbuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpbuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpbuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpbuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; VNNI INT16
+
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwsuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwsuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwsud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwsud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwusd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwusd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwusds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwusds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwusds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwusds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwusd_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwusd_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_mask_dpwuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_dpwuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+; X86-LABEL: test_mm_maskz_dpwuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_dpwuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %bst = bitcast i4 %__U to <4 x i1>
+ %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_maskz_dpwuuds_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_maskz_dpwuuds_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+; X86-LABEL: test_mm256_mask_dpwuud_epi32:
+; X86: # %bb.0:
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mm256_mask_dpwuud_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %bst = bitcast i8 %__U to <8 x i1>
+ %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
; VMPSADBW
define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index 999c968fa80db..8601d454215ad 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -1,12 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -17,6 +24,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
@@ -27,6 +39,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -37,6 +54,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
@@ -47,6 +69,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -57,6 +84,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
@@ -67,6 +99,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -77,6 +114,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
@@ -87,6 +129,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -97,6 +144,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
@@ -107,6 +159,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
ret <4 x i32> %ret
}
@@ -117,6 +174,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
ret <8 x i32> %ret
}
diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
index f9e44ac4132be..607720fbc3f33 100644
--- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64
declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
@@ -22,6 +24,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt
; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -48,6 +67,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p
; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -74,6 +110,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt
; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -100,6 +153,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p
; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -126,6 +196,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt
; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -152,6 +239,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p
; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -178,6 +282,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt
; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -204,6 +325,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p
; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -230,6 +368,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt
; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -256,6 +411,23 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p
; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <4 x i32>, ptr %x2p
%1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
%2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
@@ -282,6 +454,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt
; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x18]
+; AVX10-X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x1f]
+; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
@@ -308,6 +497,23 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p
; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
+;
+; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; AVX10-X86: # %bb.0:
+; AVX10-X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X86-NEXT: vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x18]
+; AVX10-X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
+; AVX10-X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT: retl # encoding: [0xc3]
+;
+; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; AVX10-X64: # %bb.0:
+; AVX10-X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; AVX10-X64-NEXT: vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x1f]
+; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
+; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X64-NEXT: retq # encoding: [0xc3]
%x2 = load <8 x i32>, ptr %x2p
%1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
%2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
index 8254e37e9aa9e..912c0799d1316 100644
--- a/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-32.txt
@@ -1,6 +1,1416 @@
# RUN: llvm-mc --disassemble %s -triple=i386 | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+# VNNI FP16
+
+# ATT: vdpphps %xmm4, %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmm4
+0x62,0xf2,0x64,0x08,0x52,0xd4
+
+# ATT: vdpphps %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vdpphps xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x52,0xd4
+
+# ATT: vdpphps %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x52,0xd4
+
+# ATT: vdpphps %ymm4, %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymm4
+0x62,0xf2,0x64,0x28,0x52,0xd4
+
+# ATT: vdpphps %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vdpphps ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x52,0xd4
+
+# ATT: vdpphps %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x52,0xd4
+
+# ATT: vdpphps %zmm4, %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x52,0xd4
+
+# ATT: vdpphps %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vdpphps zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x52,0xd4
+
+# ATT: vdpphps %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x52,0xd4
+
+# ATT: vdpphps 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x52,0x10
+
+# ATT: vdpphps -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vdpphps 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f
+
+# ATT: vdpphps -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x52,0x52,0x80
+
+# ATT: vdpphps 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x52,0x10
+
+# ATT: vdpphps -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vdpphps 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f
+
+# ATT: vdpphps -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x52,0x52,0x80
+
+# ATT: vdpphps 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x52,0x10
+
+# ATT: vdpphps -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vdpphps 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f
+
+# ATT: vdpphps -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x52,0x52,0x80
+
+# VNNI INT8
+
+# ATT: vpdpbssd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x50,0xd4
+
+# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x67,0x0f,0x50,0xd4
+
+# ATT: vpdpbssd %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x67,0x8f,0x50,0xd4
+
+# ATT: vpdpbssd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x50,0xd4
+
+# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x67,0x2f,0x50,0xd4
+
+# ATT: vpdpbssd %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x67,0xaf,0x50,0xd4
+
+# ATT: vpdpbssd %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmm4
+0x62,0xf2,0x67,0x48,0x50,0xd4
+
+# ATT: vpdpbssd %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x67,0x4f,0x50,0xd4
+
+# ATT: vpdpbssd %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x67,0xcf,0x50,0xd4
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x67,0x18,0x50,0x10
+
+# ATT: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssd 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f
+
+# ATT: vpdpbssd -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x67,0x9f,0x50,0x52,0x80
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x67,0x38,0x50,0x10
+
+# ATT: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssd 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f
+
+# ATT: vpdpbssd -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x67,0xbf,0x50,0x52,0x80
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x67,0x58,0x50,0x10
+
+# ATT: vpdpbssd -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbssd 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f
+
+# ATT: vpdpbssd -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x67,0xdf,0x50,0x52,0x80
+
+# ATT: vpdpbssds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x51,0xd4
+
+# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x67,0x0f,0x51,0xd4
+
+# ATT: vpdpbssds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x67,0x8f,0x51,0xd4
+
+# ATT: vpdpbssds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x51,0xd4
+
+# ATT: vpdpbssds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x67,0x2f,0x51,0xd4
+
+# ATT: vpdpbssds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x67,0xaf,0x51,0xd4
+
+# ATT: vpdpbssds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmm4
+0x62,0xf2,0x67,0x48,0x51,0xd4
+
+# ATT: vpdpbssds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x67,0x4f,0x51,0xd4
+
+# ATT: vpdpbssds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x67,0xcf,0x51,0xd4
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x67,0x18,0x51,0x10
+
+# ATT: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f
+
+# ATT: vpdpbssds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x67,0x9f,0x51,0x52,0x80
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x67,0x38,0x51,0x10
+
+# ATT: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f
+
+# ATT: vpdpbssds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x67,0xbf,0x51,0x52,0x80
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x67,0x58,0x51,0x10
+
+# ATT: vpdpbssds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbssds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f
+
+# ATT: vpdpbssds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x67,0xdf,0x51,0x52,0x80
+
+# ATT: vpdpbsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x50,0xd4
+
+# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0x50,0xd4
+
+# ATT: vpdpbsud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0x50,0xd4
+
+# ATT: vpdpbsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x50,0xd4
+
+# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0x50,0xd4
+
+# ATT: vpdpbsud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0x50,0xd4
+
+# ATT: vpdpbsud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0x50,0xd4
+
+# ATT: vpdpbsud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0x50,0xd4
+
+# ATT: vpdpbsud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0x50,0xd4
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0x50,0x10
+
+# ATT: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsud 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f
+
+# ATT: vpdpbsud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0x50,0x52,0x80
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0x50,0x10
+
+# ATT: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsud 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f
+
+# ATT: vpdpbsud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0x50,0x52,0x80
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0x50,0x10
+
+# ATT: vpdpbsud -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbsud 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f
+
+# ATT: vpdpbsud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0x50,0x52,0x80
+
+# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x51,0xd4
+
+# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0x51,0xd4
+
+# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0x51,0xd4
+
+# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x51,0xd4
+
+# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0x51,0xd4
+
+# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0x51,0xd4
+
+# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0x51,0xd4
+
+# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0x51,0xd4
+
+# ATT: vpdpbsuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0x51,0xd4
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0x51,0x10
+
+# ATT: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f
+
+# ATT: vpdpbsuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0x51,0x52,0x80
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0x51,0x10
+
+# ATT: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f
+
+# ATT: vpdpbsuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0x51,0x52,0x80
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0x51,0x10
+
+# ATT: vpdpbsuds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbsuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f
+
+# ATT: vpdpbsuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0x51,0x52,0x80
+
+# ATT: vpdpbuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x50,0xd4
+
+# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x50,0xd4
+
+# ATT: vpdpbuud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x50,0xd4
+
+# ATT: vpdpbuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x50,0xd4
+
+# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x50,0xd4
+
+# ATT: vpdpbuud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x50,0xd4
+
+# ATT: vpdpbuud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x50,0xd4
+
+# ATT: vpdpbuud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x50,0xd4
+
+# ATT: vpdpbuud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x50,0xd4
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x50,0x10
+
+# ATT: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuud 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f
+
+# ATT: vpdpbuud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x50,0x52,0x80
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x50,0x10
+
+# ATT: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuud 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f
+
+# ATT: vpdpbuud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x50,0x52,0x80
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x50,0x10
+
+# ATT: vpdpbuud -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbuud 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f
+
+# ATT: vpdpbuud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x50,0x52,0x80
+
+# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x51,0xd4
+
+# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0x51,0xd4
+
+# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0x51,0xd4
+
+# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x51,0xd4
+
+# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0x51,0xd4
+
+# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0x51,0xd4
+
+# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0x51,0xd4
+
+# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0x51,0xd4
+
+# ATT: vpdpbuuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0x51,0xd4
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0x51,0x10
+
+# ATT: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f
+
+# ATT: vpdpbuuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0x51,0x52,0x80
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0x51,0x10
+
+# ATT: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f
+
+# ATT: vpdpbuuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0x51,0x52,0x80
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0x51,0x10
+
+# ATT: vpdpbuuds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbuuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f
+
+# ATT: vpdpbuuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0x51,0x52,0x80
+
+# VNNI INT16
+
+# ATT: vpdpwsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd2,0xd4
+
+# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0xd2,0xd4
+
+# ATT: vpdpwsud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0xd2,0xd4
+
+# ATT: vpdpwsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd2,0xd4
+
+# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0xd2,0xd4
+
+# ATT: vpdpwsud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0xd2,0xd4
+
+# ATT: vpdpwsud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0xd2,0xd4
+
+# ATT: vpdpwsud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0xd2,0xd4
+
+# ATT: vpdpwsud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0xd2,0xd4
+
+# ATT: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0xd2,0x10
+
+# ATT: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsud 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f
+
+# ATT: vpdpwsud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80
+
+# ATT: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0xd2,0x10
+
+# ATT: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsud 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f
+
+# ATT: vpdpwsud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80
+
+# ATT: vpdpwsud 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0xd2,0x10
+
+# ATT: vpdpwsud -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsud 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f
+
+# ATT: vpdpwsud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80
+
+# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd3,0xd4
+
+# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x66,0x0f,0xd3,0xd4
+
+# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x66,0x8f,0xd3,0xd4
+
+# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd3,0xd4
+
+# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x66,0x2f,0xd3,0xd4
+
+# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x66,0xaf,0xd3,0xd4
+
+# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmm4
+0x62,0xf2,0x66,0x48,0xd3,0xd4
+
+# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x66,0x4f,0xd3,0xd4
+
+# ATT: vpdpwsuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x66,0xcf,0xd3,0xd4
+
+# ATT: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x66,0x18,0xd3,0x10
+
+# ATT: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f
+
+# ATT: vpdpwsuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80
+
+# ATT: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x66,0x38,0xd3,0x10
+
+# ATT: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f
+
+# ATT: vpdpwsuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80
+
+# ATT: vpdpwsuds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x66,0x58,0xd3,0x10
+
+# ATT: vpdpwsuds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f
+
+# ATT: vpdpwsuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80
+
+# ATT: vpdpwusd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd2,0xd4
+
+# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x65,0x0f,0xd2,0xd4
+
+# ATT: vpdpwusd %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x65,0x8f,0xd2,0xd4
+
+# ATT: vpdpwusd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd2,0xd4
+
+# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x65,0x2f,0xd2,0xd4
+
+# ATT: vpdpwusd %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x65,0xaf,0xd2,0xd4
+
+# ATT: vpdpwusd %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmm4
+0x62,0xf2,0x65,0x48,0xd2,0xd4
+
+# ATT: vpdpwusd %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x65,0x4f,0xd2,0xd4
+
+# ATT: vpdpwusd %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x65,0xcf,0xd2,0xd4
+
+# ATT: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x65,0x18,0xd2,0x10
+
+# ATT: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusd 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f
+
+# ATT: vpdpwusd -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80
+
+# ATT: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x65,0x38,0xd2,0x10
+
+# ATT: vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusd 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f
+
+# ATT: vpdpwusd -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80
+
+# ATT: vpdpwusd 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x65,0x58,0xd2,0x10
+
+# ATT: vpdpwusd -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusd 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f
+
+# ATT: vpdpwusd -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80
+
+# ATT: vpdpwusds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd3,0xd4
+
+# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x65,0x0f,0xd3,0xd4
+
+# ATT: vpdpwusds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x65,0x8f,0xd3,0xd4
+
+# ATT: vpdpwusds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd3,0xd4
+
+# ATT: vpdpwusds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x65,0x2f,0xd3,0xd4
+
+# ATT: vpdpwusds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x65,0xaf,0xd3,0xd4
+
+# ATT: vpdpwusds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmm4
+0x62,0xf2,0x65,0x48,0xd3,0xd4
+
+# ATT: vpdpwusds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x65,0x4f,0xd3,0xd4
+
+# ATT: vpdpwusds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x65,0xcf,0xd3,0xd4
+
+# ATT: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x65,0x18,0xd3,0x10
+
+# ATT: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f
+
+# ATT: vpdpwusds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80
+
+# ATT: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x65,0x38,0xd3,0x10
+
+# ATT: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f
+
+# ATT: vpdpwusds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80
+
+# ATT: vpdpwusds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x65,0x58,0xd3,0x10
+
+# ATT: vpdpwusds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f
+
+# ATT: vpdpwusds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80
+
+# ATT: vpdpwuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd2,0xd4
+
+# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0xd2,0xd4
+
+# ATT: vpdpwuud %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0xd2,0xd4
+
+# ATT: vpdpwuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd2,0xd4
+
+# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0xd2,0xd4
+
+# ATT: vpdpwuud %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0xd2,0xd4
+
+# ATT: vpdpwuud %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0xd2,0xd4
+
+# ATT: vpdpwuud %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0xd2,0xd4
+
+# ATT: vpdpwuud %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0xd2,0xd4
+
+# ATT: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0xd2,0x10
+
+# ATT: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuud 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f
+
+# ATT: vpdpwuud -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80
+
+# ATT: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0xd2,0x10
+
+# ATT: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuud 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f
+
+# ATT: vpdpwuud -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80
+
+# ATT: vpdpwuud 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0xd2,0x10
+
+# ATT: vpdpwuud -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuud 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f
+
+# ATT: vpdpwuud -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80
+
+# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd3,0xd4
+
+# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmm4
+0x62,0xf2,0x64,0x0f,0xd3,0xd4
+
+# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+0x62,0xf2,0x64,0x8f,0xd3,0xd4
+
+# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd3,0xd4
+
+# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymm4
+0x62,0xf2,0x64,0x2f,0xd3,0xd4
+
+# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+0x62,0xf2,0x64,0xaf,0xd3,0xd4
+
+# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmm4
+0x62,0xf2,0x64,0x48,0xd3,0xd4
+
+# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmm4
+0x62,0xf2,0x64,0x4f,0xd3,0xd4
+
+# ATT: vpdpwuuds %zmm4, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+0x62,0xf2,0x64,0xcf,0xd3,0xd4
+
+# ATT: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2 {%k7}
+# INTEL: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%eax){1to4}, %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+0x62,0xf2,0x64,0x18,0xd3,0x10
+
+# ATT: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuuds 2032(%ecx), %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f
+
+# ATT: vpdpwuuds -512(%edx){1to4}, %xmm3, %xmm2 {%k7} {z}
+# INTEL: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80
+
+# ATT: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2 {%k7}
+# INTEL: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%eax){1to8}, %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+0x62,0xf2,0x64,0x38,0xd3,0x10
+
+# ATT: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuuds 4064(%ecx), %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f
+
+# ATT: vpdpwuuds -512(%edx){1to8}, %ymm3, %ymm2 {%k7} {z}
+# INTEL: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80
+
+# ATT: vpdpwuuds 268435456(%esp,%esi,8), %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%edi,%eax,4), %zmm3, %zmm2 {%k7}
+# INTEL: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%eax){1to16}, %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+0x62,0xf2,0x64,0x58,0xd3,0x10
+
+# ATT: vpdpwuuds -2048(,%ebp,2), %zmm3, %zmm2
+# INTEL: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuuds 8128(%ecx), %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f
+
+# ATT: vpdpwuuds -512(%edx){1to16}, %zmm3, %zmm2 {%k7} {z}
+# INTEL: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80
+
# VMPSADBW
# ATT: vmpsadbw $123, %xmm4, %xmm3, %xmm2
diff --git a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
index 7f68e9d0da131..b5d25ee7e0c01 100644
--- a/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
+++ b/llvm/test/MC/Disassembler/X86/avx10_2ni-64.txt
@@ -1,6 +1,1416 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+# VNNI FP16
+
+# ATT: vdpphps %xmm24, %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x52,0xf0
+
+# ATT: vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vdpphps xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x52,0xf0
+
+# ATT: vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x52,0xf0
+
+# ATT: vdpphps %ymm24, %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x52,0xf0
+
+# ATT: vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vdpphps ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x52,0xf0
+
+# ATT: vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x52,0xf0
+
+# ATT: vdpphps %zmm24, %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x52,0xf0
+
+# ATT: vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vdpphps zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x52,0xf0
+
+# ATT: vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x52,0xf0
+
+# ATT: vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vdpphps xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vdpphps -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vdpphps xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vdpphps 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x52,0x71,0x7f
+
+# ATT: vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vdpphps xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x52,0x72,0x80
+
+# ATT: vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vdpphps ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vdpphps -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vdpphps ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vdpphps 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f
+
+# ATT: vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vdpphps ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x52,0x72,0x80
+
+# ATT: vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vdpphps zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vdpphps (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vdpphps -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vdpphps zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f
+
+# ATT: vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vdpphps zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x52,0x72,0x80
+
+# VNNI INT8
+
+# ATT: vpdpbssd %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmm24
+0x62,0x82,0x47,0x00,0x50,0xf0
+
+# ATT: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x47,0x07,0x50,0xf0
+
+# ATT: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x47,0x87,0x50,0xf0
+
+# ATT: vpdpbssd %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymm24
+0x62,0x82,0x47,0x20,0x50,0xf0
+
+# ATT: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x47,0x27,0x50,0xf0
+
+# ATT: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x47,0xa7,0x50,0xf0
+
+# ATT: vpdpbssd %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmm24
+0x62,0x82,0x47,0x40,0x50,0xf0
+
+# ATT: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x47,0x47,0x50,0xf0
+
+# ATT: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x47,0xc7,0x50,0xf0
+
+# ATT: vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssd -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbssd xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x47,0x87,0x50,0x71,0x7f
+
+# ATT: vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x47,0x97,0x50,0x72,0x80
+
+# ATT: vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbssd ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f
+
+# ATT: vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x47,0xb7,0x50,0x72,0x80
+
+# ATT: vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssd -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbssd zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f
+
+# ATT: vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x47,0xd7,0x50,0x72,0x80
+
+# ATT: vpdpbssds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmm24
+0x62,0x82,0x47,0x00,0x51,0xf0
+
+# ATT: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x47,0x07,0x51,0xf0
+
+# ATT: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x47,0x87,0x51,0xf0
+
+# ATT: vpdpbssds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymm24
+0x62,0x82,0x47,0x20,0x51,0xf0
+
+# ATT: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x47,0x27,0x51,0xf0
+
+# ATT: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x47,0xa7,0x51,0xf0
+
+# ATT: vpdpbssds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmm24
+0x62,0x82,0x47,0x40,0x51,0xf0
+
+# ATT: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x47,0x47,0x51,0xf0
+
+# ATT: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x47,0xc7,0x51,0xf0
+
+# ATT: vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbssds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbssds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x47,0x87,0x51,0x71,0x7f
+
+# ATT: vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbssds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x47,0x97,0x51,0x72,0x80
+
+# ATT: vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbssds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbssds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f
+
+# ATT: vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbssds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x47,0xb7,0x51,0x72,0x80
+
+# ATT: vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbssds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbssds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f
+
+# ATT: vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbssds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x47,0xd7,0x51,0x72,0x80
+
+# ATT: vpdpbsud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0x50,0xf0
+
+# ATT: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0x50,0xf0
+
+# ATT: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0x50,0xf0
+
+# ATT: vpdpbsud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0x50,0xf0
+
+# ATT: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0x50,0xf0
+
+# ATT: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0x50,0xf0
+
+# ATT: vpdpbsud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0x50,0xf0
+
+# ATT: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0x50,0xf0
+
+# ATT: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0x50,0xf0
+
+# ATT: vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsud -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbsud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0x50,0x71,0x7f
+
+# ATT: vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0x50,0x72,0x80
+
+# ATT: vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbsud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f
+
+# ATT: vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0x50,0x72,0x80
+
+# ATT: vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbsud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f
+
+# ATT: vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0x50,0x72,0x80
+
+# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0x51,0xf0
+
+# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0x51,0xf0
+
+# ATT: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0x51,0xf0
+
+# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0x51,0xf0
+
+# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0x51,0xf0
+
+# ATT: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0x51,0xf0
+
+# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0x51,0xf0
+
+# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0x51,0xf0
+
+# ATT: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0x51,0xf0
+
+# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbsuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0x51,0x71,0x7f
+
+# ATT: vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0x51,0x72,0x80
+
+# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f
+
+# ATT: vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbsuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0x51,0x72,0x80
+
+# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f
+
+# ATT: vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0x51,0x72,0x80
+
+# ATT: vpdpbuud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x50,0xf0
+
+# ATT: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x50,0xf0
+
+# ATT: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x50,0xf0
+
+# ATT: vpdpbuud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x50,0xf0
+
+# ATT: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x50,0xf0
+
+# ATT: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x50,0xf0
+
+# ATT: vpdpbuud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x50,0xf0
+
+# ATT: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x50,0xf0
+
+# ATT: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x50,0xf0
+
+# ATT: vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuud -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbuud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x50,0x71,0x7f
+
+# ATT: vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x50,0x72,0x80
+
+# ATT: vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbuud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f
+
+# ATT: vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x50,0x72,0x80
+
+# ATT: vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbuud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f
+
+# ATT: vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x50,0x72,0x80
+
+# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0x51,0xf0
+
+# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0x51,0xf0
+
+# ATT: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0x51,0xf0
+
+# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0x51,0xf0
+
+# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0x51,0xf0
+
+# ATT: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0x51,0xf0
+
+# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0x51,0xf0
+
+# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0x51,0xf0
+
+# ATT: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0x51,0xf0
+
+# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpbuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpbuuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0x51,0x71,0x7f
+
+# ATT: vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpbuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0x51,0x72,0x80
+
+# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpbuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuuds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpbuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f
+
+# ATT: vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpbuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0x51,0x72,0x80
+
+# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpbuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuuds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpbuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f
+
+# ATT: vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpbuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0x51,0x72,0x80
+
+# VNNI INT16
+
+# ATT: vpdpwsud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0xd2,0xf0
+
+# ATT: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0xd2,0xf0
+
+# ATT: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0xd2,0xf0
+
+# ATT: vpdpwsud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0xd2,0xf0
+
+# ATT: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0xd2,0xf0
+
+# ATT: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0xd2,0xf0
+
+# ATT: vpdpwsud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0xd2,0xf0
+
+# ATT: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0xd2,0xf0
+
+# ATT: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0xd2,0xf0
+
+# ATT: vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsud -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwsud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f
+
+# ATT: vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0xd2,0x72,0x80
+
+# ATT: vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwsud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f
+
+# ATT: vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80
+
+# ATT: vpdpwsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwsud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f
+
+# ATT: vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80
+
+# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmm24
+0x62,0x82,0x46,0x00,0xd3,0xf0
+
+# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x46,0x07,0xd3,0xf0
+
+# ATT: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x46,0x87,0xd3,0xf0
+
+# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymm24
+0x62,0x82,0x46,0x20,0xd3,0xf0
+
+# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x46,0x27,0xd3,0xf0
+
+# ATT: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x46,0xa7,0xd3,0xf0
+
+# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmm24
+0x62,0x82,0x46,0x40,0xd3,0xf0
+
+# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x46,0x47,0xd3,0xf0
+
+# ATT: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x46,0xc7,0xd3,0xf0
+
+# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwsuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwsuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f
+
+# ATT: vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwsuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x46,0x97,0xd3,0x72,0x80
+
+# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwsuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsuds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwsuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f
+
+# ATT: vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwsuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80
+
+# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwsuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwsuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f
+
+# ATT: vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwsuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80
+
+# ATT: vpdpwusd %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmm24
+0x62,0x82,0x45,0x00,0xd2,0xf0
+
+# ATT: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x45,0x07,0xd2,0xf0
+
+# ATT: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x45,0x87,0xd2,0xf0
+
+# ATT: vpdpwusd %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymm24
+0x62,0x82,0x45,0x20,0xd2,0xf0
+
+# ATT: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x45,0x27,0xd2,0xf0
+
+# ATT: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x45,0xa7,0xd2,0xf0
+
+# ATT: vpdpwusd %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmm24
+0x62,0x82,0x45,0x40,0xd2,0xf0
+
+# ATT: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x45,0x47,0xd2,0xf0
+
+# ATT: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x45,0xc7,0xd2,0xf0
+
+# ATT: vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusd xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusd -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwusd xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f
+
+# ATT: vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusd xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x45,0x97,0xd2,0x72,0x80
+
+# ATT: vpdpwusd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusd ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwusd ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f
+
+# ATT: vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusd ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80
+
+# ATT: vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusd zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwusd zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f
+
+# ATT: vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusd zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80
+
+# ATT: vpdpwusds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmm24
+0x62,0x82,0x45,0x00,0xd3,0xf0
+
+# ATT: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x45,0x07,0xd3,0xf0
+
+# ATT: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x45,0x87,0xd3,0xf0
+
+# ATT: vpdpwusds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymm24
+0x62,0x82,0x45,0x20,0xd3,0xf0
+
+# ATT: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x45,0x27,0xd3,0xf0
+
+# ATT: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x45,0xa7,0xd3,0xf0
+
+# ATT: vpdpwusds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmm24
+0x62,0x82,0x45,0x40,0xd3,0xf0
+
+# ATT: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x45,0x47,0xd3,0xf0
+
+# ATT: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x45,0xc7,0xd3,0xf0
+
+# ATT: vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwusds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwusds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f
+
+# ATT: vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwusds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x45,0x97,0xd3,0x72,0x80
+
+# ATT: vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwusds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwusds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f
+
+# ATT: vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwusds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80
+
+# ATT: vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwusds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwusds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f
+
+# ATT: vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwusds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80
+
+# ATT: vpdpwuud %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0xd2,0xf0
+
+# ATT: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0xd2,0xf0
+
+# ATT: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0xd2,0xf0
+
+# ATT: vpdpwuud %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0xd2,0xf0
+
+# ATT: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0xd2,0xf0
+
+# ATT: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0xd2,0xf0
+
+# ATT: vpdpwuud %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0xd2,0xf0
+
+# ATT: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0xd2,0xf0
+
+# ATT: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0xd2,0xf0
+
+# ATT: vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuud xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuud -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwuud xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f
+
+# ATT: vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuud xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0xd2,0x72,0x80
+
+# ATT: vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuud ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwuud ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f
+
+# ATT: vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuud ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80
+
+# ATT: vpdpwuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuud zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwuud zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f
+
+# ATT: vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuud zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80
+
+# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmm24
+0x62,0x82,0x44,0x00,0xd3,0xf0
+
+# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmm24
+0x62,0x82,0x44,0x07,0xd3,0xf0
+
+# ATT: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmm24
+0x62,0x82,0x44,0x87,0xd3,0xf0
+
+# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymm24
+0x62,0x82,0x44,0x20,0xd3,0xf0
+
+# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymm24
+0x62,0x82,0x44,0x27,0xd3,0xf0
+
+# ATT: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymm24
+0x62,0x82,0x44,0xa7,0xd3,0xf0
+
+# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmm24
+0x62,0x82,0x44,0x40,0xd3,0xf0
+
+# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmm24
+0x62,0x82,0x44,0x47,0xd3,0xf0
+
+# ATT: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmm24
+0x62,0x82,0x44,0xc7,0xd3,0xf0
+
+# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+# INTEL: vpdpwuuds xmm22 {k7}, xmm23, xmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%rip){1to4}, %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, dword ptr [rip]{1to4}
+0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22
+# INTEL: vpdpwuuds xmm22, xmm23, xmmword ptr [2*rbp - 512]
+0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, xmmword ptr [rcx + 2032]
+0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f
+
+# ATT: vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+# INTEL: vpdpwuuds xmm22 {k7} {z}, xmm23, dword ptr [rdx - 512]{1to4}
+0x62,0xe2,0x44,0x97,0xd3,0x72,0x80
+
+# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+# INTEL: vpdpwuuds ymm22 {k7}, ymm23, ymmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%rip){1to8}, %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, dword ptr [rip]{1to8}
+0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuuds -1024(,%rbp,2), %ymm23, %ymm22
+# INTEL: vpdpwuuds ymm22, ymm23, ymmword ptr [2*rbp - 1024]
+0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, ymmword ptr [rcx + 4064]
+0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f
+
+# ATT: vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+# INTEL: vpdpwuuds ymm22 {k7} {z}, ymm23, dword ptr [rdx - 512]{1to8}
+0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80
+
+# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [rbp + 8*r14 + 268435456]
+0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+# INTEL: vpdpwuuds zmm22 {k7}, zmm23, zmmword ptr [r8 + 4*rax + 291]
+0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%rip){1to16}, %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, dword ptr [rip]{1to16}
+0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22
+# INTEL: vpdpwuuds zmm22, zmm23, zmmword ptr [2*rbp - 2048]
+0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, zmmword ptr [rcx + 8128]
+0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f
+
+# ATT: vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+# INTEL: vpdpwuuds zmm22 {k7} {z}, zmm23, dword ptr [rdx - 512]{1to16}
+0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80
+
# VMPSADBW
# ATT: vmpsadbw $123, %xmm24, %xmm23, %xmm22
diff --git a/llvm/test/MC/X86/avx10_2ni-32-intel.s b/llvm/test/MC/X86/avx10_2ni-32-intel.s
index 5dbc1c226e67a..123f57411acb0 100644
--- a/llvm/test/MC/X86/avx10_2ni-32-intel.s
+++ b/llvm/test/MC/X86/avx10_2ni-32-intel.s
@@ -1,5 +1,1415 @@
// RUN: llvm-mc -triple i386 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+// VNNI FP16
+
+// CHECK: vdpphps xmm2, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0xd4]
+ vdpphps xmm2, xmm3, xmm4
+
+// CHECK: vdpphps xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0xd4]
+ vdpphps xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0xd4]
+ vdpphps xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vdpphps ymm2, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0xd4]
+ vdpphps ymm2, ymm3, ymm4
+
+// CHECK: vdpphps ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0xd4]
+ vdpphps ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0xd4]
+ vdpphps ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vdpphps zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0xd4]
+ vdpphps zmm2, zmm3, zmm4
+
+// CHECK: vdpphps zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x52,0xd4]
+ vdpphps zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0xd4]
+ vdpphps zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vdpphps xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+ vdpphps xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x52,0x10]
+ vdpphps xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0x62,0xf2,0x64,0x08,0x52,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vdpphps xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x52,0x51,0x7f]
+ vdpphps xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x52,0x52,0x80]
+ vdpphps xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vdpphps ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+ vdpphps ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x52,0x10]
+ vdpphps ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0x62,0xf2,0x64,0x28,0x52,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vdpphps ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x52,0x51,0x7f]
+ vdpphps ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x52,0x52,0x80]
+ vdpphps ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vdpphps zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x52,0x94,0x87,0x23,0x01,0x00,0x00]
+ vdpphps zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x52,0x10]
+ vdpphps zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x52,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vdpphps zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x52,0x51,0x7f]
+ vdpphps zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x52,0x52,0x80]
+ vdpphps zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// VNNI INT8
+
+// CHECK: vpdpbssd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4]
+ vpdpbssd xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0xd4]
+ vpdpbssd xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0xd4]
+ vpdpbssd xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbssd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4]
+ vpdpbssd ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0xd4]
+ vpdpbssd ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0xd4]
+ vpdpbssd ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbssd zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0xd4]
+ vpdpbssd zmm2, zmm3, zmm4
+
+// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0xd4]
+ vpdpbssd zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0xd4]
+ vpdpbssd zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x50,0x10]
+ vpdpbssd xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x50,0x51,0x7f]
+ vpdpbssd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x50,0x52,0x80]
+ vpdpbssd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x50,0x10]
+ vpdpbssd ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x50,0x51,0x7f]
+ vpdpbssd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x50,0x52,0x80]
+ vpdpbssd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x50,0x10]
+ vpdpbssd zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbssd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x50,0x51,0x7f]
+ vpdpbssd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x50,0x52,0x80]
+ vpdpbssd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbssds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4]
+ vpdpbssds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0xd4]
+ vpdpbssds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0xd4]
+ vpdpbssds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbssds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4]
+ vpdpbssds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0xd4]
+ vpdpbssds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0xd4]
+ vpdpbssds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbssds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0xd4]
+ vpdpbssds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0xd4]
+ vpdpbssds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0xd4]
+ vpdpbssds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x18,0x51,0x10]
+ vpdpbssds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x67,0x8f,0x51,0x51,0x7f]
+ vpdpbssds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x67,0x9f,0x51,0x52,0x80]
+ vpdpbssds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0x38,0x51,0x10]
+ vpdpbssds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x67,0xaf,0x51,0x51,0x7f]
+ vpdpbssds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x67,0xbf,0x51,0x52,0x80]
+ vpdpbssds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x67,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0x58,0x51,0x10]
+ vpdpbssds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x67,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbssds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x67,0xcf,0x51,0x51,0x7f]
+ vpdpbssds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x67,0xdf,0x51,0x52,0x80]
+ vpdpbssds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+ vpdpbsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0xd4]
+ vpdpbsud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0xd4]
+ vpdpbsud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+ vpdpbsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0xd4]
+ vpdpbsud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0xd4]
+ vpdpbsud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbsud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0xd4]
+ vpdpbsud zmm2, zmm3, zmm4
+
+// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0xd4]
+ vpdpbsud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0xd4]
+ vpdpbsud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x50,0x10]
+ vpdpbsud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x50,0x51,0x7f]
+ vpdpbsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x50,0x52,0x80]
+ vpdpbsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x50,0x10]
+ vpdpbsud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x50,0x51,0x7f]
+ vpdpbsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x50,0x52,0x80]
+ vpdpbsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x50,0x10]
+ vpdpbsud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x50,0x51,0x7f]
+ vpdpbsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x50,0x52,0x80]
+ vpdpbsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+ vpdpbsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0xd4]
+ vpdpbsuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0xd4]
+ vpdpbsuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+ vpdpbsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0xd4]
+ vpdpbsuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0xd4]
+ vpdpbsuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0xd4]
+ vpdpbsuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0xd4]
+ vpdpbsuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0xd4]
+ vpdpbsuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0x51,0x10]
+ vpdpbsuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0x51,0x51,0x7f]
+ vpdpbsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0x51,0x52,0x80]
+ vpdpbsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0x51,0x10]
+ vpdpbsuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0x51,0x51,0x7f]
+ vpdpbsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0x51,0x52,0x80]
+ vpdpbsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0x51,0x10]
+ vpdpbsuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0x51,0x51,0x7f]
+ vpdpbsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0x51,0x52,0x80]
+ vpdpbsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+ vpdpbuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0xd4]
+ vpdpbuud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0xd4]
+ vpdpbuud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+ vpdpbuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0xd4]
+ vpdpbuud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0xd4]
+ vpdpbuud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbuud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0xd4]
+ vpdpbuud zmm2, zmm3, zmm4
+
+// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0xd4]
+ vpdpbuud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0xd4]
+ vpdpbuud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x50,0x10]
+ vpdpbuud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x50,0x51,0x7f]
+ vpdpbuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x50,0x52,0x80]
+ vpdpbuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x50,0x10]
+ vpdpbuud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x50,0x51,0x7f]
+ vpdpbuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x50,0x52,0x80]
+ vpdpbuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x50,0x10]
+ vpdpbuud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x50,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x50,0x51,0x7f]
+ vpdpbuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x50,0x52,0x80]
+ vpdpbuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+ vpdpbuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x51,0xd4]
+ vpdpbuuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0xd4]
+ vpdpbuuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+ vpdpbuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0xd4]
+ vpdpbuuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0xd4]
+ vpdpbuuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0xd4]
+ vpdpbuuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0xd4]
+ vpdpbuuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0xd4]
+ vpdpbuuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0x51,0x10]
+ vpdpbuuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0x51,0x51,0x7f]
+ vpdpbuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0x51,0x52,0x80]
+ vpdpbuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0x51,0x10]
+ vpdpbuuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0x51,0x51,0x7f]
+ vpdpbuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0x51,0x52,0x80]
+ vpdpbuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0x51,0x10]
+ vpdpbuuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0x51,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0x51,0x51,0x7f]
+ vpdpbuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0x51,0x52,0x80]
+ vpdpbuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// VNNI INT16
+
+// CHECK: vpdpwsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+ vpdpwsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0xd4]
+ vpdpwsud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0xd4]
+ vpdpwsud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+ vpdpwsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0xd4]
+ vpdpwsud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0xd4]
+ vpdpwsud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwsud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0xd4]
+ vpdpwsud zmm2, zmm3, zmm4
+
+// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0xd4]
+ vpdpwsud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0xd4]
+ vpdpwsud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd2,0x10]
+ vpdpwsud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd2,0x51,0x7f]
+ vpdpwsud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd2,0x52,0x80]
+ vpdpwsud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd2,0x10]
+ vpdpwsud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd2,0x51,0x7f]
+ vpdpwsud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd2,0x52,0x80]
+ vpdpwsud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd2,0x10]
+ vpdpwsud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwsud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd2,0x51,0x7f]
+ vpdpwsud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd2,0x52,0x80]
+ vpdpwsud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+ vpdpwsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0xd4]
+ vpdpwsuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd3,0xd4]
+ vpdpwsuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+ vpdpwsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0xd4]
+ vpdpwsuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0xd4]
+ vpdpwsuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0xd4]
+ vpdpwsuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0xd4]
+ vpdpwsuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd3,0xd4]
+ vpdpwsuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x18,0xd3,0x10]
+ vpdpwsuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x66,0x8f,0xd3,0x51,0x7f]
+ vpdpwsuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x66,0x9f,0xd3,0x52,0x80]
+ vpdpwsuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0x38,0xd3,0x10]
+ vpdpwsuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x66,0xaf,0xd3,0x51,0x7f]
+ vpdpwsuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x66,0xbf,0xd3,0x52,0x80]
+ vpdpwsuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x66,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0x58,0xd3,0x10]
+ vpdpwsuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x66,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwsuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x66,0xcf,0xd3,0x51,0x7f]
+ vpdpwsuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x66,0xdf,0xd3,0x52,0x80]
+ vpdpwsuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwusd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+ vpdpwusd xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0xd4]
+ vpdpwusd xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0xd4]
+ vpdpwusd xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwusd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+ vpdpwusd ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0xd4]
+ vpdpwusd ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0xd4]
+ vpdpwusd ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwusd zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0xd4]
+ vpdpwusd zmm2, zmm3, zmm4
+
+// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd2,0xd4]
+ vpdpwusd zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0xd4]
+ vpdpwusd zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd2,0x10]
+ vpdpwusd xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd2,0x51,0x7f]
+ vpdpwusd xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd2,0x52,0x80]
+ vpdpwusd xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd2,0x10]
+ vpdpwusd ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd2,0x51,0x7f]
+ vpdpwusd ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd2,0x52,0x80]
+ vpdpwusd ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd2,0x10]
+ vpdpwusd zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwusd zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd2,0x51,0x7f]
+ vpdpwusd zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd2,0x52,0x80]
+ vpdpwusd zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwusds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+ vpdpwusds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0xd4]
+ vpdpwusds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0xd4]
+ vpdpwusds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwusds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+ vpdpwusds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0xd4]
+ vpdpwusds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd3,0xd4]
+ vpdpwusds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwusds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0xd4]
+ vpdpwusds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0xd4]
+ vpdpwusds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0xd4]
+ vpdpwusds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x18,0xd3,0x10]
+ vpdpwusds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x65,0x8f,0xd3,0x51,0x7f]
+ vpdpwusds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x65,0x9f,0xd3,0x52,0x80]
+ vpdpwusds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0x38,0xd3,0x10]
+ vpdpwusds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x65,0xaf,0xd3,0x51,0x7f]
+ vpdpwusds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x65,0xbf,0xd3,0x52,0x80]
+ vpdpwusds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x65,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0x58,0xd3,0x10]
+ vpdpwusds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x65,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwusds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x65,0xcf,0xd3,0x51,0x7f]
+ vpdpwusds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x65,0xdf,0xd3,0x52,0x80]
+ vpdpwusds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+ vpdpwuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0xd4]
+ vpdpwuud xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0xd4]
+ vpdpwuud xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+ vpdpwuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd2,0xd4]
+ vpdpwuud ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0xd4]
+ vpdpwuud ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwuud zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0xd4]
+ vpdpwuud zmm2, zmm3, zmm4
+
+// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0xd4]
+ vpdpwuud zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0xd4]
+ vpdpwuud zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd2,0x10]
+ vpdpwuud xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd2,0x51,0x7f]
+ vpdpwuud xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd2,0x52,0x80]
+ vpdpwuud xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd2,0x10]
+ vpdpwuud ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd2,0x51,0x7f]
+ vpdpwuud ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd2,0x52,0x80]
+ vpdpwuud ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd2,0x10]
+ vpdpwuud zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd2,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwuud zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd2,0x51,0x7f]
+ vpdpwuud zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd2,0x52,0x80]
+ vpdpwuud zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+ vpdpwuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0xd4]
+ vpdpwuuds xmm2 {k7}, xmm3, xmm4
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd3,0xd4]
+ vpdpwuuds xmm2 {k7} {z}, xmm3, xmm4
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+ vpdpwuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0xd4]
+ vpdpwuuds ymm2 {k7}, ymm3, ymm4
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0xd4]
+ vpdpwuuds ymm2 {k7} {z}, ymm3, ymm4
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0xd4]
+ vpdpwuuds zmm2, zmm3, zmm4
+
+// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0xd4]
+ vpdpwuuds zmm2 {k7}, zmm3, zmm4
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0xd4]
+ vpdpwuuds zmm2 {k7} {z}, zmm3, zmm4
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x0f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds xmm2 {k7}, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x18,0xd3,0x10]
+ vpdpwuuds xmm2, xmm3, dword ptr [eax]{1to4}
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0x62,0xf2,0x64,0x8f,0xd3,0x51,0x7f]
+ vpdpwuuds xmm2 {k7} {z}, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+// CHECK: encoding: [0x62,0xf2,0x64,0x9f,0xd3,0x52,0x80]
+ vpdpwuuds xmm2 {k7} {z}, xmm3, dword ptr [edx - 512]{1to4}
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x2f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds ymm2 {k7}, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0x38,0xd3,0x10]
+ vpdpwuuds ymm2, ymm3, dword ptr [eax]{1to8}
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0x62,0xf2,0x64,0xaf,0xd3,0x51,0x7f]
+ vpdpwuuds ymm2 {k7} {z}, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+// CHECK: encoding: [0x62,0xf2,0x64,0xbf,0xd3,0x52,0x80]
+ vpdpwuuds ymm2 {k7} {z}, ymm3, dword ptr [edx - 512]{1to8}
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds zmm2, zmm3, zmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0x62,0xf2,0x64,0x4f,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds zmm2 {k7}, zmm3, zmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0x58,0xd3,0x10]
+ vpdpwuuds zmm2, zmm3, dword ptr [eax]{1to16}
+
+// CHECK: vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+// CHECK: encoding: [0x62,0xf2,0x64,0x48,0xd3,0x14,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwuuds zmm2, zmm3, zmmword ptr [2*ebp - 2048]
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+// CHECK: encoding: [0x62,0xf2,0x64,0xcf,0xd3,0x51,0x7f]
+ vpdpwuuds zmm2 {k7} {z}, zmm3, zmmword ptr [ecx + 8128]
+
+// CHECK: vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+// CHECK: encoding: [0x62,0xf2,0x64,0xdf,0xd3,0x52,0x80]
+ vpdpwuuds zmm2 {k7} {z}, zmm3, dword ptr [edx - 512]{1to16}
+
// VMPSADBW
// CHECK: vmpsadbw xmm2, xmm3, xmm4, 123
diff --git a/llvm/test/MC/X86/avx10_2ni-64-att.s b/llvm/test/MC/X86/avx10_2ni-64-att.s
index 09566eb50ddad..4fa7c0a918528 100644
--- a/llvm/test/MC/X86/avx10_2ni-64-att.s
+++ b/llvm/test/MC/X86/avx10_2ni-64-att.s
@@ -1,5 +1,1415 @@
// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+// VNNI FP16
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x52,0xf0]
+ vdpphps %xmm24, %xmm23, %xmm22
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x52,0xf0]
+ vdpphps %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x52,0xf0]
+ vdpphps %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x52,0xf0]
+ vdpphps %ymm24, %ymm23, %ymm22
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x52,0xf0]
+ vdpphps %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x52,0xf0]
+ vdpphps %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x52,0xf0]
+ vdpphps %zmm24, %zmm23, %zmm22
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x52,0xf0]
+ vdpphps %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x52,0xf0]
+ vdpphps %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vdpphps 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vdpphps 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vdpphps (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x52,0x35,0x00,0x00,0x00,0x00]
+ vdpphps (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vdpphps -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vdpphps -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vdpphps 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x52,0x71,0x7f]
+ vdpphps 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x52,0x72,0x80]
+ vdpphps -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vdpphps 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vdpphps 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vdpphps (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x52,0x35,0x00,0x00,0x00,0x00]
+ vdpphps (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vdpphps -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vdpphps -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vdpphps 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x52,0x71,0x7f]
+ vdpphps 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x52,0x72,0x80]
+ vdpphps -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vdpphps 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vdpphps 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vdpphps (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x52,0x35,0x00,0x00,0x00,0x00]
+ vdpphps (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vdpphps -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x52,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vdpphps -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x52,0x71,0x7f]
+ vdpphps 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x52,0x72,0x80]
+ vdpphps -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// VNNI INT8
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x00,0x50,0xf0]
+ vpdpbssd %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x07,0x50,0xf0]
+ vpdpbssd %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0x87,0x50,0xf0]
+ vpdpbssd %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x47,0x20,0x50,0xf0]
+ vpdpbssd %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x27,0x50,0xf0]
+ vpdpbssd %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x50,0xf0]
+ vpdpbssd %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x40,0x50,0xf0]
+ vpdpbssd %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x47,0x50,0xf0]
+ vpdpbssd %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x50,0xf0]
+ vpdpbssd %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssd (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssd (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbssd -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x50,0x71,0x7f]
+ vpdpbssd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x50,0x72,0x80]
+ vpdpbssd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssd (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssd (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x50,0x71,0x7f]
+ vpdpbssd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x50,0x72,0x80]
+ vpdpbssd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssd (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssd (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbssd -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbssd -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x50,0x71,0x7f]
+ vpdpbssd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x50,0x72,0x80]
+ vpdpbssd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x00,0x51,0xf0]
+ vpdpbssds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x07,0x51,0xf0]
+ vpdpbssds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0x87,0x51,0xf0]
+ vpdpbssds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x47,0x20,0x51,0xf0]
+ vpdpbssds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x27,0x51,0xf0]
+ vpdpbssds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xa7,0x51,0xf0]
+ vpdpbssds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x47,0x40,0x51,0xf0]
+ vpdpbssds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x47,0x47,0x51,0xf0]
+ vpdpbssds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x47,0xc7,0x51,0xf0]
+ vpdpbssds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbssds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbssds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x87,0x51,0x71,0x7f]
+ vpdpbssds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0x97,0x51,0x72,0x80]
+ vpdpbssds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbssds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xa7,0x51,0x71,0x7f]
+ vpdpbssds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xb7,0x51,0x72,0x80]
+ vpdpbssds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x47,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x47,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbssds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbssds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbssds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x47,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbssds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xc7,0x51,0x71,0x7f]
+ vpdpbssds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x47,0xd7,0x51,0x72,0x80]
+ vpdpbssds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0x50,0xf0]
+ vpdpbsud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0x50,0xf0]
+ vpdpbsud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0x50,0xf0]
+ vpdpbsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0x50,0xf0]
+ vpdpbsud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0x50,0xf0]
+ vpdpbsud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x50,0xf0]
+ vpdpbsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0x50,0xf0]
+ vpdpbsud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0x50,0xf0]
+ vpdpbsud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x50,0xf0]
+ vpdpbsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsud (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsud (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbsud -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x50,0x71,0x7f]
+ vpdpbsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x50,0x72,0x80]
+ vpdpbsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsud (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsud (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x50,0x71,0x7f]
+ vpdpbsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x50,0x72,0x80]
+ vpdpbsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsud (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsud (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbsud -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x50,0x71,0x7f]
+ vpdpbsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x50,0x72,0x80]
+ vpdpbsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0x51,0xf0]
+ vpdpbsuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0x51,0xf0]
+ vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0x51,0xf0]
+ vpdpbsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0x51,0xf0]
+ vpdpbsuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0x51,0xf0]
+ vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0x51,0xf0]
+ vpdpbsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0x51,0xf0]
+ vpdpbsuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0x51,0xf0]
+ vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0x51,0xf0]
+ vpdpbsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbsuds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsuds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0x51,0x71,0x7f]
+ vpdpbsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0x51,0x72,0x80]
+ vpdpbsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbsuds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsuds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0x51,0x71,0x7f]
+ vpdpbsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0x51,0x72,0x80]
+ vpdpbsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbsuds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbsuds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbsuds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0x51,0x71,0x7f]
+ vpdpbsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0x51,0x72,0x80]
+ vpdpbsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x50,0xf0]
+ vpdpbuud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x50,0xf0]
+ vpdpbuud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x50,0xf0]
+ vpdpbuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x50,0xf0]
+ vpdpbuud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x50,0xf0]
+ vpdpbuud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x50,0xf0]
+ vpdpbuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x50,0xf0]
+ vpdpbuud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x50,0xf0]
+ vpdpbuud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x50,0xf0]
+ vpdpbuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuud (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuud (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbuud -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x50,0x71,0x7f]
+ vpdpbuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x50,0x72,0x80]
+ vpdpbuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuud (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuud (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x50,0x71,0x7f]
+ vpdpbuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x50,0x72,0x80]
+ vpdpbuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuud (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x50,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuud (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x50,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbuud -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x50,0x71,0x7f]
+ vpdpbuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x50,0x72,0x80]
+ vpdpbuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0x51,0xf0]
+ vpdpbuuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0x51,0xf0]
+ vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0x51,0xf0]
+ vpdpbuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0x51,0xf0]
+ vpdpbuuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0x51,0xf0]
+ vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0x51,0xf0]
+ vpdpbuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0x51,0xf0]
+ vpdpbuuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0x51,0xf0]
+ vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0x51,0xf0]
+ vpdpbuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpbuuds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuuds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0x51,0x71,0x7f]
+ vpdpbuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0x51,0x72,0x80]
+ vpdpbuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpbuuds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuuds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpbuuds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0x51,0x71,0x7f]
+ vpdpbuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0x51,0x72,0x80]
+ vpdpbuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpbuuds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0x51,0x35,0x00,0x00,0x00,0x00]
+ vpdpbuuds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpbuuds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0x51,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpbuuds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0x51,0x71,0x7f]
+ vpdpbuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0x51,0x72,0x80]
+ vpdpbuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// VNNI INT16
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd2,0xf0]
+ vpdpwsud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd2,0xf0]
+ vpdpwsud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd2,0xf0]
+ vpdpwsud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd2,0xf0]
+ vpdpwsud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd2,0xf0]
+ vpdpwsud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd2,0xf0]
+ vpdpwsud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd2,0xf0]
+ vpdpwsud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0xd2,0xf0]
+ vpdpwsud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd2,0xf0]
+ vpdpwsud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsud (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsud (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwsud -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd2,0x71,0x7f]
+ vpdpwsud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd2,0x72,0x80]
+ vpdpwsud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsud (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsud (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd2,0x71,0x7f]
+ vpdpwsud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd2,0x72,0x80]
+ vpdpwsud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsud (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsud (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwsud -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd2,0x71,0x7f]
+ vpdpwsud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd2,0x72,0x80]
+ vpdpwsud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x00,0xd3,0xf0]
+ vpdpwsuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x07,0xd3,0xf0]
+ vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0x87,0xd3,0xf0]
+ vpdpwsuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x46,0x20,0xd3,0xf0]
+ vpdpwsuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x27,0xd3,0xf0]
+ vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xa7,0xd3,0xf0]
+ vpdpwsuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x46,0x40,0xd3,0xf0]
+ vpdpwsuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x46,0x47,0xd3,0xf0]
+ vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x46,0xc7,0xd3,0xf0]
+ vpdpwsuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwsuds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsuds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x87,0xd3,0x71,0x7f]
+ vpdpwsuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0x97,0xd3,0x72,0x80]
+ vpdpwsuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwsuds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsuds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwsuds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xa7,0xd3,0x71,0x7f]
+ vpdpwsuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xb7,0xd3,0x72,0x80]
+ vpdpwsuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x46,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x46,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwsuds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwsuds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x46,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwsuds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xc7,0xd3,0x71,0x7f]
+ vpdpwsuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x46,0xd7,0xd3,0x72,0x80]
+ vpdpwsuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd2,0xf0]
+ vpdpwusd %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd2,0xf0]
+ vpdpwusd %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd2,0xf0]
+ vpdpwusd %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd2,0xf0]
+ vpdpwusd %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd2,0xf0]
+ vpdpwusd %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd2,0xf0]
+ vpdpwusd %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd2,0xf0]
+ vpdpwusd %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd2,0xf0]
+ vpdpwusd %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd2,0xf0]
+ vpdpwusd %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusd (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusd (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwusd -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x87,0xd2,0x71,0x7f]
+ vpdpwusd 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd2,0x72,0x80]
+ vpdpwusd -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwusd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusd (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusd (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd2,0x71,0x7f]
+ vpdpwusd 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd2,0x72,0x80]
+ vpdpwusd -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusd (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusd (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwusd -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd2,0x71,0x7f]
+ vpdpwusd 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd2,0x72,0x80]
+ vpdpwusd -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x00,0xd3,0xf0]
+ vpdpwusds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x07,0xd3,0xf0]
+ vpdpwusds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0x87,0xd3,0xf0]
+ vpdpwusds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x45,0x20,0xd3,0xf0]
+ vpdpwusds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x27,0xd3,0xf0]
+ vpdpwusds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xa7,0xd3,0xf0]
+ vpdpwusds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x45,0x40,0xd3,0xf0]
+ vpdpwusds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x45,0x47,0xd3,0xf0]
+ vpdpwusds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x45,0xc7,0xd3,0xf0]
+ vpdpwusds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwusds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwusds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x87,0xd3,0x71,0x7f]
+ vpdpwusds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0x97,0xd3,0x72,0x80]
+ vpdpwusds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwusds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xa7,0xd3,0x71,0x7f]
+ vpdpwusds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xb7,0xd3,0x72,0x80]
+ vpdpwusds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x45,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x45,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwusds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwusds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x45,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwusds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xc7,0xd3,0x71,0x7f]
+ vpdpwusds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x45,0xd7,0xd3,0x72,0x80]
+ vpdpwusds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd2,0xf0]
+ vpdpwuud %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd2,0xf0]
+ vpdpwuud %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd2,0xf0]
+ vpdpwuud %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd2,0xf0]
+ vpdpwuud %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd2,0xf0]
+ vpdpwuud %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd2,0xf0]
+ vpdpwuud %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd2,0xf0]
+ vpdpwuud %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0xd2,0xf0]
+ vpdpwuud %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd2,0xf0]
+ vpdpwuud %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuud (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuud (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwuud -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd2,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd2,0x71,0x7f]
+ vpdpwuud 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd2,0x72,0x80]
+ vpdpwuud -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuud (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuud (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd2,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd2,0x71,0x7f]
+ vpdpwuud 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd2,0x72,0x80]
+ vpdpwuud -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd2,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd2,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuud (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd2,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuud (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd2,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwuud -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd2,0x71,0x7f]
+ vpdpwuud 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd2,0x72,0x80]
+ vpdpwuud -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x00,0xd3,0xf0]
+ vpdpwuuds %xmm24, %xmm23, %xmm22
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x07,0xd3,0xf0]
+ vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0x87,0xd3,0xf0]
+ vpdpwuuds %xmm24, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0x82,0x44,0x20,0xd3,0xf0]
+ vpdpwuuds %ymm24, %ymm23, %ymm22
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x27,0xd3,0xf0]
+ vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xa7,0xd3,0xf0]
+ vpdpwuuds %ymm24, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0x82,0x44,0x40,0xd3,0xf0]
+ vpdpwuuds %zmm24, %zmm23, %zmm22
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0x82,0x44,0x47,0xd3,0xf0]
+ vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0x82,0x44,0xc7,0xd3,0xf0]
+ vpdpwuuds %zmm24, %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x00,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%rbp,%r14,8), %xmm23, %xmm22
+
+// CHECK: vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x07,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%r8,%rax,4), %xmm23, %xmm22 {%k7}
+
+// CHECK: vpdpwuuds (%rip){1to4}, %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x10,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuuds (%rip){1to4}, %xmm23, %xmm22
+
+// CHECK: vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x00,0xd3,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds -512(,%rbp,2), %xmm23, %xmm22
+
+// CHECK: vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x87,0xd3,0x71,0x7f]
+ vpdpwuuds 2032(%rcx), %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0x97,0xd3,0x72,0x80]
+ vpdpwuuds -512(%rdx){1to4}, %xmm23, %xmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x20,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%rbp,%r14,8), %ymm23, %ymm22
+
+// CHECK: vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x27,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%r8,%rax,4), %ymm23, %ymm22 {%k7}
+
+// CHECK: vpdpwuuds (%rip){1to8}, %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x30,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuuds (%rip){1to8}, %ymm23, %ymm22
+
+// CHECK: vpdpwuuds -1024(,%rbp,2), %ymm23, %ymm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x20,0xd3,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds -1024(,%rbp,2), %ymm23, %ymm22
+
+// CHECK: vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xa7,0xd3,0x71,0x7f]
+ vpdpwuuds 4064(%rcx), %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xb7,0xd3,0x72,0x80]
+ vpdpwuuds -512(%rdx){1to8}, %ymm23, %ymm22 {%k7} {z}
+
+// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xa2,0x44,0x40,0xd3,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%rbp,%r14,8), %zmm23, %zmm22
+
+// CHECK: vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+// CHECK: encoding: [0x62,0xc2,0x44,0x47,0xd3,0xb4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%r8,%rax,4), %zmm23, %zmm22 {%k7}
+
+// CHECK: vpdpwuuds (%rip){1to16}, %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x50,0xd3,0x35,0x00,0x00,0x00,0x00]
+ vpdpwuuds (%rip){1to16}, %zmm23, %zmm22
+
+// CHECK: vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22
+// CHECK: encoding: [0x62,0xe2,0x44,0x40,0xd3,0x34,0x6d,0x00,0xf8,0xff,0xff]
+ vpdpwuuds -2048(,%rbp,2), %zmm23, %zmm22
+
+// CHECK: vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xc7,0xd3,0x71,0x7f]
+ vpdpwuuds 8128(%rcx), %zmm23, %zmm22 {%k7} {z}
+
+// CHECK: vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+// CHECK: encoding: [0x62,0xe2,0x44,0xd7,0xd3,0x72,0x80]
+ vpdpwuuds -512(%rdx){1to16}, %zmm23, %zmm22 {%k7} {z}
+
// VMPSADBW
// CHECK: vmpsadbw $123, %xmm24, %xmm23, %xmm22
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index e85cde3140594..3b7caeff44e57 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -4175,6 +4175,9 @@ static const X86FoldTableEntry Table3[] = {
{X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0},
{X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0},
{X86::VDPBF16PSZr, X86::VDPBF16PSZm, 0},
+ {X86::VDPPHPSZ128r, X86::VDPPHPSZ128m, 0},
+ {X86::VDPPHPSZ256r, X86::VDPPHPSZ256m, 0},
+ {X86::VDPPHPSZr, X86::VDPPHPSZm, 0},
{X86::VEXP2PDZrk, X86::VEXP2PDZmk, 0},
{X86::VEXP2PSZrk, X86::VEXP2PSZmk, 0},
{X86::VEXPANDPDZ128rrk, X86::VEXPANDPDZ128rmk, TB_NO_REVERSE},
@@ -4913,12 +4916,24 @@ static const X86FoldTableEntry Table3[] = {
{X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0},
{X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0},
{X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0},
+ {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128m, 0},
+ {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256m, 0},
+ {X86::VPDPBSSDSZr, X86::VPDPBSSDSZm, 0},
{X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0},
{X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0},
+ {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128m, 0},
+ {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256m, 0},
+ {X86::VPDPBSSDZr, X86::VPDPBSSDZm, 0},
{X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0},
{X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0},
+ {X86::VPDPBSUDSZ128r, X86::VPDPBSUDSZ128m, 0},
+ {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256m, 0},
+ {X86::VPDPBSUDSZr, X86::VPDPBSUDSZm, 0},
{X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0},
{X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0},
+ {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128m, 0},
+ {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256m, 0},
+ {X86::VPDPBSUDZr, X86::VPDPBSUDZm, 0},
{X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0},
{X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0},
{X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0},
@@ -4931,8 +4946,14 @@ static const X86FoldTableEntry Table3[] = {
{X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0},
{X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0},
{X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0},
+ {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128m, 0},
+ {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256m, 0},
+ {X86::VPDPBUUDSZr, X86::VPDPBUUDSZm, 0},
{X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0},
{X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0},
+ {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128m, 0},
+ {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256m, 0},
+ {X86::VPDPBUUDZr, X86::VPDPBUUDZm, 0},
{X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0},
{X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0},
{X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0},
@@ -4945,16 +4966,34 @@ static const X86FoldTableEntry Table3[] = {
{X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0},
{X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0},
{X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0},
+ {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128m, 0},
+ {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256m, 0},
+ {X86::VPDPWSUDSZr, X86::VPDPWSUDSZm, 0},
{X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0},
{X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0},
+ {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128m, 0},
+ {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256m, 0},
+ {X86::VPDPWSUDZr, X86::VPDPWSUDZm, 0},
{X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0},
{X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0},
+ {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128m, 0},
+ {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256m, 0},
+ {X86::VPDPWUSDSZr, X86::VPDPWUSDSZm, 0},
{X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0},
{X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0},
+ {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128m, 0},
+ {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256m, 0},
+ {X86::VPDPWUSDZr, X86::VPDPWUSDZm, 0},
{X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0},
{X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0},
+ {X86::VPDPWUUDSZ128r, X86::VPDPWUUDSZ128m, 0},
+ {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256m, 0},
+ {X86::VPDPWUUDSZr, X86::VPDPWUUDSZm, 0},
{X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0},
{X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0},
+ {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128m, 0},
+ {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256m, 0},
+ {X86::VPDPWUUDZr, X86::VPDPWUUDZm, 0},
{X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0},
{X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0},
{X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0},
@@ -5628,6 +5667,12 @@ static const X86FoldTableEntry Table4[] = {
{X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mkz, 0},
{X86::VDPBF16PSZrk, X86::VDPBF16PSZmk, 0},
{X86::VDPBF16PSZrkz, X86::VDPBF16PSZmkz, 0},
+ {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mk, 0},
+ {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mkz, 0},
+ {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mk, 0},
+ {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mkz, 0},
+ {X86::VDPPHPSZrk, X86::VDPPHPSZmk, 0},
+ {X86::VDPPHPSZrkz, X86::VDPPHPSZmkz, 0},
{X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mk, 0},
{X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mkz, 0},
{X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mk, 0},
@@ -6226,6 +6271,30 @@ static const X86FoldTableEntry Table4[] = {
{X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0},
{X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0},
{X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0},
+ {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mk, 0},
+ {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mkz, 0},
+ {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mk, 0},
+ {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mkz, 0},
+ {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmk, 0},
+ {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmkz, 0},
+ {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mk, 0},
+ {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mkz, 0},
+ {X86::VPDPBSSDZ256rk, X86::VPDPBSSDZ256mk, 0},
+ {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mkz, 0},
+ {X86::VPDPBSSDZrk, X86::VPDPBSSDZmk, 0},
+ {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmkz, 0},
+ {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mk, 0},
+ {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mkz, 0},
+ {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mk, 0},
+ {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mkz, 0},
+ {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmk, 0},
+ {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmkz, 0},
+ {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mk, 0},
+ {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mkz, 0},
+ {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mk, 0},
+ {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mkz, 0},
+ {X86::VPDPBSUDZrk, X86::VPDPBSUDZmk, 0},
+ {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmkz, 0},
{X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mk, 0},
{X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mkz, 0},
{X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mk, 0},
@@ -6238,6 +6307,18 @@ static const X86FoldTableEntry Table4[] = {
{X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mkz, 0},
{X86::VPDPBUSDZrk, X86::VPDPBUSDZmk, 0},
{X86::VPDPBUSDZrkz, X86::VPDPBUSDZmkz, 0},
+ {X86::VPDPBUUDSZ128rk, X86::VPDPBUUDSZ128mk, 0},
+ {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mkz, 0},
+ {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mk, 0},
+ {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mkz, 0},
+ {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmk, 0},
+ {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmkz, 0},
+ {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mk, 0},
+ {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mkz, 0},
+ {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mk, 0},
+ {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mkz, 0},
+ {X86::VPDPBUUDZrk, X86::VPDPBUUDZmk, 0},
+ {X86::VPDPBUUDZrkz, X86::VPDPBUUDZmkz, 0},
{X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mk, 0},
{X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mkz, 0},
{X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mk, 0},
@@ -6250,6 +6331,42 @@ static const X86FoldTableEntry Table4[] = {
{X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mkz, 0},
{X86::VPDPWSSDZrk, X86::VPDPWSSDZmk, 0},
{X86::VPDPWSSDZrkz, X86::VPDPWSSDZmkz, 0},
+ {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mk, 0},
+ {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mkz, 0},
+ {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mk, 0},
+ {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mkz, 0},
+ {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmk, 0},
+ {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmkz, 0},
+ {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mk, 0},
+ {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mkz, 0},
+ {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mk, 0},
+ {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mkz, 0},
+ {X86::VPDPWSUDZrk, X86::VPDPWSUDZmk, 0},
+ {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmkz, 0},
+ {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mk, 0},
+ {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mkz, 0},
+ {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mk, 0},
+ {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mkz, 0},
+ {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmk, 0},
+ {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmkz, 0},
+ {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mk, 0},
+ {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mkz, 0},
+ {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mk, 0},
+ {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mkz, 0},
+ {X86::VPDPWUSDZrk, X86::VPDPWUSDZmk, 0},
+ {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmkz, 0},
+ {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mk, 0},
+ {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mkz, 0},
+ {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mk, 0},
+ {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mkz, 0},
+ {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmk, 0},
+ {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmkz, 0},
+ {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mk, 0},
+ {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mkz, 0},
+ {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mk, 0},
+ {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mkz, 0},
+ {X86::VPDPWUUDZrk, X86::VPDPWUUDZmk, 0},
+ {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmkz, 0},
{X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0},
{X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0},
{X86::VPERMBZrrk, X86::VPERMBZrmk, 0},
@@ -7892,6 +8009,9 @@ static const X86FoldTableEntry BroadcastTable3[] = {
{X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128mb, TB_BCAST_SS},
{X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256mb, TB_BCAST_SS},
{X86::VDPBF16PSZr, X86::VDPBF16PSZmb, TB_BCAST_SS},
+ {X86::VDPPHPSZ128r, X86::VDPPHPSZ128mb, TB_BCAST_SS},
+ {X86::VDPPHPSZ256r, X86::VDPPHPSZ256mb, TB_BCAST_SS},
+ {X86::VDPPHPSZr, X86::VDPPHPSZmb, TB_BCAST_SS},
{X86::VEXP2PDZrk, X86::VEXP2PDZmbk, TB_BCAST_SD},
{X86::VEXP2PSZrk, X86::VEXP2PSZmbk, TB_BCAST_SS},
{X86::VFCMADDCPHZ128r, X86::VFCMADDCPHZ128mb, TB_BCAST_SS},
@@ -8227,18 +8347,54 @@ static const X86FoldTableEntry BroadcastTable3[] = {
{X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmbk, TB_BCAST_Q},
{X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmbk, TB_BCAST_Q},
{X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmbk, TB_BCAST_Q},
+ {X86::VPDPBSSDSZ128r, X86::VPDPBSSDSZ128mb, TB_BCAST_D},
+ {X86::VPDPBSSDSZ256r, X86::VPDPBSSDSZ256mb, TB_BCAST_D},
+ {X86::VPDPBSSDSZr, X86::VPDPBSSDSZmb, TB_BCAST_D},
+ {X86::VPDPBSSDZ128r, X86::VPDPBSSDZ128mb, TB_BCAST_D},
+ {X86::VPDPBSSDZ256r, X86::VPDPBSSDZ256mb, TB_BCAST_D},
+ {X86::VPDPBSSDZr, X86::VPDPBSSDZmb, TB_BCAST_D},
+ {X86::VPDPBSUDSZ128r, X86::VPDPBSUDSZ128mb, TB_BCAST_D},
+ {X86::VPDPBSUDSZ256r, X86::VPDPBSUDSZ256mb, TB_BCAST_D},
+ {X86::VPDPBSUDSZr, X86::VPDPBSUDSZmb, TB_BCAST_D},
+ {X86::VPDPBSUDZ128r, X86::VPDPBSUDZ128mb, TB_BCAST_D},
+ {X86::VPDPBSUDZ256r, X86::VPDPBSUDZ256mb, TB_BCAST_D},
+ {X86::VPDPBSUDZr, X86::VPDPBSUDZmb, TB_BCAST_D},
{X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128mb, TB_BCAST_D},
{X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256mb, TB_BCAST_D},
{X86::VPDPBUSDSZr, X86::VPDPBUSDSZmb, TB_BCAST_D},
{X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128mb, TB_BCAST_D},
{X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256mb, TB_BCAST_D},
{X86::VPDPBUSDZr, X86::VPDPBUSDZmb, TB_BCAST_D},
+ {X86::VPDPBUUDSZ128r, X86::VPDPBUUDSZ128mb, TB_BCAST_D},
+ {X86::VPDPBUUDSZ256r, X86::VPDPBUUDSZ256mb, TB_BCAST_D},
+ {X86::VPDPBUUDSZr, X86::VPDPBUUDSZmb, TB_BCAST_D},
+ {X86::VPDPBUUDZ128r, X86::VPDPBUUDZ128mb, TB_BCAST_D},
+ {X86::VPDPBUUDZ256r, X86::VPDPBUUDZ256mb, TB_BCAST_D},
+ {X86::VPDPBUUDZr, X86::VPDPBUUDZmb, TB_BCAST_D},
{X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128mb, TB_BCAST_D},
{X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256mb, TB_BCAST_D},
{X86::VPDPWSSDSZr, X86::VPDPWSSDSZmb, TB_BCAST_D},
{X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128mb, TB_BCAST_D},
{X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256mb, TB_BCAST_D},
{X86::VPDPWSSDZr, X86::VPDPWSSDZmb, TB_BCAST_D},
+ {X86::VPDPWSUDSZ128r, X86::VPDPWSUDSZ128mb, TB_BCAST_D},
+ {X86::VPDPWSUDSZ256r, X86::VPDPWSUDSZ256mb, TB_BCAST_D},
+ {X86::VPDPWSUDSZr, X86::VPDPWSUDSZmb, TB_BCAST_D},
+ {X86::VPDPWSUDZ128r, X86::VPDPWSUDZ128mb, TB_BCAST_D},
+ {X86::VPDPWSUDZ256r, X86::VPDPWSUDZ256mb, TB_BCAST_D},
+ {X86::VPDPWSUDZr, X86::VPDPWSUDZmb, TB_BCAST_D},
+ {X86::VPDPWUSDSZ128r, X86::VPDPWUSDSZ128mb, TB_BCAST_D},
+ {X86::VPDPWUSDSZ256r, X86::VPDPWUSDSZ256mb, TB_BCAST_D},
+ {X86::VPDPWUSDSZr, X86::VPDPWUSDSZmb, TB_BCAST_D},
+ {X86::VPDPWUSDZ128r, X86::VPDPWUSDZ128mb, TB_BCAST_D},
+ {X86::VPDPWUSDZ256r, X86::VPDPWUSDZ256mb, TB_BCAST_D},
+ {X86::VPDPWUSDZr, X86::VPDPWUSDZmb, TB_BCAST_D},
+ {X86::VPDPWUUDSZ128r, X86::VPDPWUUDSZ128mb, TB_BCAST_D},
+ {X86::VPDPWUUDSZ256r, X86::VPDPWUUDSZ256mb, TB_BCAST_D},
+ {X86::VPDPWUUDSZr, X86::VPDPWUUDSZmb, TB_BCAST_D},
+ {X86::VPDPWUUDZ128r, X86::VPDPWUUDZ128mb, TB_BCAST_D},
+ {X86::VPDPWUUDZ256r, X86::VPDPWUUDZ256mb, TB_BCAST_D},
+ {X86::VPDPWUUDZr, X86::VPDPWUUDZmb, TB_BCAST_D},
{X86::VPERMDZ256rrkz, X86::VPERMDZ256rmbkz, TB_BCAST_D},
{X86::VPERMDZrrkz, X86::VPERMDZrmbkz, TB_BCAST_D},
{X86::VPERMI2DZ128rr, X86::VPERMI2DZ128rmb, TB_BCAST_D},
@@ -8632,6 +8788,12 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VDPBF16PSZ256rkz, X86::VDPBF16PSZ256mbkz, TB_BCAST_SS},
{X86::VDPBF16PSZrk, X86::VDPBF16PSZmbk, TB_BCAST_SS},
{X86::VDPBF16PSZrkz, X86::VDPBF16PSZmbkz, TB_BCAST_SS},
+ {X86::VDPPHPSZ128rk, X86::VDPPHPSZ128mbk, TB_BCAST_SS},
+ {X86::VDPPHPSZ128rkz, X86::VDPPHPSZ128mbkz, TB_BCAST_SS},
+ {X86::VDPPHPSZ256rk, X86::VDPPHPSZ256mbk, TB_BCAST_SS},
+ {X86::VDPPHPSZ256rkz, X86::VDPPHPSZ256mbkz, TB_BCAST_SS},
+ {X86::VDPPHPSZrk, X86::VDPPHPSZmbk, TB_BCAST_SS},
+ {X86::VDPPHPSZrkz, X86::VDPPHPSZmbkz, TB_BCAST_SS},
{X86::VFCMADDCPHZ128rk, X86::VFCMADDCPHZ128mbk, TB_BCAST_SS},
{X86::VFCMADDCPHZ128rkz, X86::VFCMADDCPHZ128mbkz, TB_BCAST_SS},
{X86::VFCMADDCPHZ256rk, X86::VFCMADDCPHZ256mbk, TB_BCAST_SS},
@@ -9079,6 +9241,30 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VPANDQZ128rrk, X86::VPANDQZ128rmbk, TB_BCAST_Q},
{X86::VPANDQZ256rrk, X86::VPANDQZ256rmbk, TB_BCAST_Q},
{X86::VPANDQZrrk, X86::VPANDQZrmbk, TB_BCAST_Q},
+ {X86::VPDPBSSDSZ128rk, X86::VPDPBSSDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPBSSDSZ128rkz, X86::VPDPBSSDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBSSDSZ256rk, X86::VPDPBSSDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPBSSDSZ256rkz, X86::VPDPBSSDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBSSDSZrk, X86::VPDPBSSDSZmbk, TB_BCAST_D},
+ {X86::VPDPBSSDSZrkz, X86::VPDPBSSDSZmbkz, TB_BCAST_D},
+ {X86::VPDPBSSDZ128rk, X86::VPDPBSSDZ128mbk, TB_BCAST_D},
+ {X86::VPDPBSSDZ128rkz, X86::VPDPBSSDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBSSDZ256rk, X86::VPDPBSSDZ256mbk, TB_BCAST_D},
+ {X86::VPDPBSSDZ256rkz, X86::VPDPBSSDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBSSDZrk, X86::VPDPBSSDZmbk, TB_BCAST_D},
+ {X86::VPDPBSSDZrkz, X86::VPDPBSSDZmbkz, TB_BCAST_D},
+ {X86::VPDPBSUDSZ128rk, X86::VPDPBSUDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPBSUDSZ128rkz, X86::VPDPBSUDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBSUDSZ256rk, X86::VPDPBSUDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPBSUDSZ256rkz, X86::VPDPBSUDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBSUDSZrk, X86::VPDPBSUDSZmbk, TB_BCAST_D},
+ {X86::VPDPBSUDSZrkz, X86::VPDPBSUDSZmbkz, TB_BCAST_D},
+ {X86::VPDPBSUDZ128rk, X86::VPDPBSUDZ128mbk, TB_BCAST_D},
+ {X86::VPDPBSUDZ128rkz, X86::VPDPBSUDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBSUDZ256rk, X86::VPDPBSUDZ256mbk, TB_BCAST_D},
+ {X86::VPDPBSUDZ256rkz, X86::VPDPBSUDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBSUDZrk, X86::VPDPBSUDZmbk, TB_BCAST_D},
+ {X86::VPDPBSUDZrkz, X86::VPDPBSUDZmbkz, TB_BCAST_D},
{X86::VPDPBUSDSZ128rk, X86::VPDPBUSDSZ128mbk, TB_BCAST_D},
{X86::VPDPBUSDSZ128rkz, X86::VPDPBUSDSZ128mbkz, TB_BCAST_D},
{X86::VPDPBUSDSZ256rk, X86::VPDPBUSDSZ256mbk, TB_BCAST_D},
@@ -9091,6 +9277,18 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VPDPBUSDZ256rkz, X86::VPDPBUSDZ256mbkz, TB_BCAST_D},
{X86::VPDPBUSDZrk, X86::VPDPBUSDZmbk, TB_BCAST_D},
{X86::VPDPBUSDZrkz, X86::VPDPBUSDZmbkz, TB_BCAST_D},
+ {X86::VPDPBUUDSZ128rk, X86::VPDPBUUDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPBUUDSZ128rkz, X86::VPDPBUUDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBUUDSZ256rk, X86::VPDPBUUDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPBUUDSZ256rkz, X86::VPDPBUUDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBUUDSZrk, X86::VPDPBUUDSZmbk, TB_BCAST_D},
+ {X86::VPDPBUUDSZrkz, X86::VPDPBUUDSZmbkz, TB_BCAST_D},
+ {X86::VPDPBUUDZ128rk, X86::VPDPBUUDZ128mbk, TB_BCAST_D},
+ {X86::VPDPBUUDZ128rkz, X86::VPDPBUUDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPBUUDZ256rk, X86::VPDPBUUDZ256mbk, TB_BCAST_D},
+ {X86::VPDPBUUDZ256rkz, X86::VPDPBUUDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPBUUDZrk, X86::VPDPBUUDZmbk, TB_BCAST_D},
+ {X86::VPDPBUUDZrkz, X86::VPDPBUUDZmbkz, TB_BCAST_D},
{X86::VPDPWSSDSZ128rk, X86::VPDPWSSDSZ128mbk, TB_BCAST_D},
{X86::VPDPWSSDSZ128rkz, X86::VPDPWSSDSZ128mbkz, TB_BCAST_D},
{X86::VPDPWSSDSZ256rk, X86::VPDPWSSDSZ256mbk, TB_BCAST_D},
@@ -9103,6 +9301,42 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VPDPWSSDZ256rkz, X86::VPDPWSSDZ256mbkz, TB_BCAST_D},
{X86::VPDPWSSDZrk, X86::VPDPWSSDZmbk, TB_BCAST_D},
{X86::VPDPWSSDZrkz, X86::VPDPWSSDZmbkz, TB_BCAST_D},
+ {X86::VPDPWSUDSZ128rk, X86::VPDPWSUDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPWSUDSZ128rkz, X86::VPDPWSUDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWSUDSZ256rk, X86::VPDPWSUDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPWSUDSZ256rkz, X86::VPDPWSUDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWSUDSZrk, X86::VPDPWSUDSZmbk, TB_BCAST_D},
+ {X86::VPDPWSUDSZrkz, X86::VPDPWSUDSZmbkz, TB_BCAST_D},
+ {X86::VPDPWSUDZ128rk, X86::VPDPWSUDZ128mbk, TB_BCAST_D},
+ {X86::VPDPWSUDZ128rkz, X86::VPDPWSUDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWSUDZ256rk, X86::VPDPWSUDZ256mbk, TB_BCAST_D},
+ {X86::VPDPWSUDZ256rkz, X86::VPDPWSUDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWSUDZrk, X86::VPDPWSUDZmbk, TB_BCAST_D},
+ {X86::VPDPWSUDZrkz, X86::VPDPWSUDZmbkz, TB_BCAST_D},
+ {X86::VPDPWUSDSZ128rk, X86::VPDPWUSDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPWUSDSZ128rkz, X86::VPDPWUSDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWUSDSZ256rk, X86::VPDPWUSDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPWUSDSZ256rkz, X86::VPDPWUSDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWUSDSZrk, X86::VPDPWUSDSZmbk, TB_BCAST_D},
+ {X86::VPDPWUSDSZrkz, X86::VPDPWUSDSZmbkz, TB_BCAST_D},
+ {X86::VPDPWUSDZ128rk, X86::VPDPWUSDZ128mbk, TB_BCAST_D},
+ {X86::VPDPWUSDZ128rkz, X86::VPDPWUSDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWUSDZ256rk, X86::VPDPWUSDZ256mbk, TB_BCAST_D},
+ {X86::VPDPWUSDZ256rkz, X86::VPDPWUSDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWUSDZrk, X86::VPDPWUSDZmbk, TB_BCAST_D},
+ {X86::VPDPWUSDZrkz, X86::VPDPWUSDZmbkz, TB_BCAST_D},
+ {X86::VPDPWUUDSZ128rk, X86::VPDPWUUDSZ128mbk, TB_BCAST_D},
+ {X86::VPDPWUUDSZ128rkz, X86::VPDPWUUDSZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWUUDSZ256rk, X86::VPDPWUUDSZ256mbk, TB_BCAST_D},
+ {X86::VPDPWUUDSZ256rkz, X86::VPDPWUUDSZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWUUDSZrk, X86::VPDPWUUDSZmbk, TB_BCAST_D},
+ {X86::VPDPWUUDSZrkz, X86::VPDPWUUDSZmbkz, TB_BCAST_D},
+ {X86::VPDPWUUDZ128rk, X86::VPDPWUUDZ128mbk, TB_BCAST_D},
+ {X86::VPDPWUUDZ128rkz, X86::VPDPWUUDZ128mbkz, TB_BCAST_D},
+ {X86::VPDPWUUDZ256rk, X86::VPDPWUUDZ256mbk, TB_BCAST_D},
+ {X86::VPDPWUUDZ256rkz, X86::VPDPWUUDZ256mbkz, TB_BCAST_D},
+ {X86::VPDPWUUDZrk, X86::VPDPWUUDZmbk, TB_BCAST_D},
+ {X86::VPDPWUUDZrkz, X86::VPDPWUUDZmbkz, TB_BCAST_D},
{X86::VPERMDZ256rrk, X86::VPERMDZ256rmbk, TB_BCAST_D},
{X86::VPERMDZrrk, X86::VPERMDZrmbk, TB_BCAST_D},
{X86::VPERMI2DZ128rrk, X86::VPERMI2DZ128rmbk, TB_BCAST_D},
diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
index f967344135553..60b1a48721653 100644
--- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
+++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
@@ -242,7 +242,8 @@ void X86InstrMappingEmitter::emitCompressEVEXTable(
auto It = llvm::find_if(Predicates, [](const Record *R) {
StringRef Name = R->getName();
return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" ||
- Name == "HasAVXIFMA";
+ Name == "HasAVXIFMA" || Name == "HasAVXVNNIINT8" ||
+ Name == "HasAVXVNNIINT16";
});
if (It != Predicates.end())
PredicateInsts[(*It)->getValueAsString("CondString")].push_back(NewInst);
More information about the cfe-commits
mailing list