[clang] 1c154bd - [X86] Add AVX-VNNI-INT16 instructions.

Freddy Ye via cfe-commits cfe-commits at lists.llvm.org
Wed Jul 19 23:31:37 PDT 2023


Author: Freddy Ye
Date: 2023-07-20T14:31:16+08:00
New Revision: 1c154bd755153b5c6ada4bbed58facf23f6abffc

URL: https://github.com/llvm/llvm-project/commit/1c154bd755153b5c6ada4bbed58facf23f6abffc
DIFF: https://github.com/llvm/llvm-project/commit/1c154bd755153b5c6ada4bbed58facf23f6abffc.diff

LOG: [X86] Add AVX-VNNI-INT16 instructions.

For more details about these instructions, please refer to the latest ISE document: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Reviewed By: pengfei, skan

Differential Revision: https://reviews.llvm.org/D155145

Added: 
    clang/lib/Headers/avxvnniint16intrin.h
    clang/test/CodeGen/X86/avxvnniint16-builtins.c
    llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
    llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
    llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
    llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
    llvm/test/MC/X86/avx-vnni-int16-32-att.s
    llvm/test/MC/X86/avx-vnni-int16-32-intel.s
    llvm/test/MC/X86/avx-vnni-int16-64-att.s
    llvm/test/MC/X86/avx-vnni-int16-64-intel.s

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/BuiltinsX86.def
    clang/include/clang/Driver/Options.td
    clang/lib/Basic/Targets/X86.cpp
    clang/lib/Basic/Targets/X86.h
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/immintrin.h
    clang/test/CodeGen/attr-target-x86.c
    clang/test/Driver/x86-target-features.c
    clang/test/Preprocessor/x86_target_features.c
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/include/llvm/TargetParser/X86TargetParser.def
    llvm/lib/Target/X86/X86.td
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/lib/Target/X86/X86InstrInfo.td
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/lib/TargetParser/Host.cpp
    llvm/lib/TargetParser/X86TargetParser.cpp
    llvm/test/TableGen/x86-fold-tables.inc

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2982810b67fa0c..55fa4cb27d0b61 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -824,6 +824,10 @@ X86 Support
 - Support ISA of ``SM4``.
   * Support intrinsic of ``_mm(256)_sm4key4_epi32``.
   * Support intrinsic of ``_mm(256)_sm4rnds4_epi32``.
+- Support ISA of ``AVX-VNNI-INT16``.
+  * Support intrinsic of ``_mm(256)_dpwsud(s)_epi32``.
+  * Support intrinsic of ``_mm(256)_dpwusd(s)_epi32``.
+  * Support intrinsic of ``_mm(256)_dpwuud(s)_epi32``.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^

diff  --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 48dd9cbb1ab7a4..10ac3b3c34efd2 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2116,6 +2116,20 @@ TARGET_HEADER_BUILTIN(__readgsword,  "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
 TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
 
+// AVX-VNNI-INT16
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+
 // AVX-NE-CONVERT
 TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
 TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0578bc0cba1214..dff5749b3481d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4960,6 +4960,8 @@ def mavxifma : Flag<["-"], "mavxifma">, Group<m_x86_Features_Group>;
 def mno_avxifma : Flag<["-"], "mno-avxifma">, Group<m_x86_Features_Group>;
 def mavxneconvert : Flag<["-"], "mavxneconvert">, Group<m_x86_Features_Group>;
 def mno_avxneconvert : Flag<["-"], "mno-avxneconvert">, Group<m_x86_Features_Group>;
+def mavxvnniint16 : Flag<["-"], "mavxvnniint16">, Group<m_x86_Features_Group>;
+def mno_avxvnniint16 : Flag<["-"], "mno-avxvnniint16">, Group<m_x86_Features_Group>;
 def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group<m_x86_Features_Group>;
 def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group<m_x86_Features_Group>;
 def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;

diff  --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index c89e1df4e52d2b..26b89619b2c94b 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -353,6 +353,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAVXNECONVERT= true;
     } else if (Feature == "+avxvnni") {
       HasAVXVNNI = true;
+    } else if (Feature == "+avxvnniint16") {
+      HasAVXVNNIINT16 = true;
     } else if (Feature == "+avxvnniint8") {
       HasAVXVNNIINT8 = true;
     } else if (Feature == "+serialize") {
@@ -836,6 +838,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AVXNECONVERT__");
   if (HasAVXVNNI)
     Builder.defineMacro("__AVXVNNI__");
+  if (HasAVXVNNIINT16)
+    Builder.defineMacro("__AVXVNNIINT16__");
   if (HasAVXVNNIINT8)
     Builder.defineMacro("__AVXVNNIINT8__");
   if (HasSERIALIZE)
@@ -964,6 +968,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("avxifma", true)
       .Case("avxneconvert", true)
       .Case("avxvnni", true)
+      .Case("avxvnniint16", true)
       .Case("avxvnniint8", true)
       .Case("bmi", true)
       .Case("bmi2", true)
@@ -1069,6 +1074,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("avxifma", HasAVXIFMA)
       .Case("avxneconvert", HasAVXNECONVERT)
       .Case("avxvnni", HasAVXVNNI)
+      .Case("avxvnniint16", HasAVXVNNIINT16)
       .Case("avxvnniint8", HasAVXVNNIINT8)
       .Case("bmi", HasBMI)
       .Case("bmi2", HasBMI2)

diff  --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index d5ee63833febd2..039c05893d2692 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -145,6 +145,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasPTWRITE = false;
   bool HasINVPCID = false;
   bool HasENQCMD = false;
+  bool HasAVXVNNIINT16 = false;
   bool HasAMXFP16 = false;
   bool HasCMPCCXADD = false;
   bool HasRAOINT = false;

diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 35c8b7de8db33a..f2b0c5cddcbbf8 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -160,6 +160,7 @@ set(x86_files
   avxifmaintrin.h
   avxintrin.h
   avxneconvertintrin.h
+  avxvnniint16intrin.h
   avxvnniint8intrin.h
   avxvnniintrin.h
   bmi2intrin.h

diff  --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
new file mode 100644
index 00000000000000..e4d342a8b45b1d
--- /dev/null
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -0,0 +1,473 @@
+/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVXVNNIINT16INTRIN_H
+#define __AVXVNNIINT16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"),   \
+                 __min_vector_width__(256)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [8 x short].
+/// \param __B
+///    A 128-bit vector of [8 x unsigned short].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [16 x short].
+/// \param __B
+///    A 256-bit vector of [16 x unsigned short].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [8 x short].
+/// \param __B
+///    A 128-bit vector of [8 x unsigned short].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [16 x short].
+/// \param __B
+///    A 256-bit vector of [16 x unsigned short].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [8 x unsigned short].
+/// \param __B
+///    A 128-bit vector of [8 x short].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [16 x unsigned short].
+/// \param __B
+///    A 256-bit vector of [16 x short].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [8 x unsigned short].
+/// \param __B
+///    A 128-bit vector of [8 x short].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+///    signed 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [16 x unsigned short].
+/// \param __B
+///    A 256-bit vector of [16 x short].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    unsigned 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x unsigned int].
+/// \param __A
+///    A 128-bit vector of [8 x unsigned short].
+/// \param __B
+///    A 128-bit vector of [8 x unsigned short].
+/// \returns
+///    A 128-bit vector of [4 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    unsigned 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x unsigned int].
+/// \param __A
+///    A 256-bit vector of [16 x unsigned short].
+/// \param __B
+///    A 256-bit vector of [16 x unsigned short].
+/// \returns
+///    A 256-bit vector of [8 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    unsigned 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with unsigned saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x unsigned int].
+/// \param __A
+///    A 128-bit vector of [8 x unsigned short].
+/// \param __B
+///    A 128-bit vector of [8 x unsigned short].
+/// \returns
+///    A 128-bit vector of [4 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+///    unsigned 32-bit results. Sum these 2 results with the corresponding
+///    32-bit integer in \a __W with unsigned saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x unsigned int].
+/// \param __A
+///    A 256-bit vector of [16 x unsigned short].
+/// \param __B
+///    A 256-bit vector of [16 x unsigned short].
+/// \returns
+///    A 256-bit vector of [8 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT16INTRIN_H

diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 1c9a50c7208dca..642602be14e60e 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -284,6 +284,11 @@
 #include <sm4intrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVXVNNIINT16__)
+#include <avxvnniint16intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDPID__)
 /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).

diff  --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
new file mode 100644
index 00000000000000..a10ca551a15146
--- /dev/null
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwsud_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwsud_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwsud_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwsud_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwsuds_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwsuds_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwusd_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwusd_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwusd_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwusd_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwusds_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwusds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwusds_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwusds_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwuud_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwuud_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwuud_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwuud_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_dpwuuds_epi32(
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_dpwuuds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_dpwuuds_epi32(
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_dpwuuds_epi32(__A, __B, __C);
+}

diff  --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index f2c79eda5d24dd..d2f09b67c7c3e4 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {}
 // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686"
 // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
 // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 // CHECK-NOT: tune-cpu
 // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"

diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index e387e2ca45361f..0eb55e93546a70 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -364,6 +364,11 @@
 // SM4: "-target-feature" "+sm4"
 // NO-SM4: "-target-feature" "-sm4"
 
+// RUN: %clang --target=i386 -mavxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVXVNNIINT16 %s
+// RUN: %clang --target=i386 -mno-avxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVXVNNIINT16 %s
+// AVXVNNIINT16: "-target-feature" "+avxvnniint16"
+// NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16"
+
 // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
 // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
 // CRC32: "-target-feature" "+crc32"

diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 20d96d072fa4a2..7a9c8eb0f8dddc 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -700,6 +700,20 @@
 // SM4NOAVX-NOT: #define __AVX__ 1
 // SM4NOAVX-NOT: #define __SM4__ 1
 
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavxvnniint16 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVXVNNIINT16 %s
+
+// AVXVNNIINT16: #define __AVX2__ 1
+// AVXVNNIINT16: #define __AVXVNNIINT16__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mno-avxvnniint16 -x c -E -dM -o - %s | FileCheck  -check-prefix=NOAVXVNNIINT16 %s
+
+// NOAVXVNNIINT16-NOT: #define __AVXVNNIINT16__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavxvnniint16 -mno-avx2 -x c -E -dM -o - %s | FileCheck  -check-prefix=AVXVNNIINT16NOAVX2 %s
+
+// AVXVNNIINT16NOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNIINT16NOAVX2-NOT: #define __AVXVNNIINT16__ 1
+
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s
 
 // CRC32: #define __CRC32__ 1

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 3264ec4ab51253..2431ff52d6e2ca 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -282,6 +282,7 @@ Changes to the X86 Backend
 * Support ISA of ``SHA512``.
 * Support ISA of ``SM3``.
 * Support ISA of ``SM4``.
+* Support ISA of ``AVX-VNNI-INT16``.
 
 Changes to the OCaml bindings
 -----------------------------

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 45aaee87fb608f..57cd1dc47bd9fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2053,6 +2053,67 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
                               [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
                               [IntrNoMem]>;
+
+  def int_x86_avx2_vpdpwsud_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwsud128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwsud_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwsud256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwsuds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwsuds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwusd_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwusd128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwusd_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwusd256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwusds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwusds128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwusds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwusds256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwuud_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwuud128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwuud_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwuud256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwuuds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">,
+        DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [IntrNoMem]>;
+  def int_x86_avx2_vpdpwuuds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">,
+        DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 32c7ffe4f23395..7964353045258f 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -230,6 +230,7 @@ X86_FEATURE       (AVXVNNIINT8,     "avxvnniint8")
 X86_FEATURE       (SHA512,          "sha512")
 X86_FEATURE       (SM3,             "sm3")
 X86_FEATURE       (SM4,             "sm4")
+X86_FEATURE       (AVXVNNIINT16,    "avxvnniint16")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8b33ad629ec5ea..a28fc83e4e4da9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -191,6 +191,10 @@ def FeatureAVXVNNIINT8  : SubtargetFeature<"avxvnniint8",
                              "HasAVXVNNIINT8", "true",
                              "Enable AVX-VNNI-INT8",
                              [FeatureAVX2]>;
+def FeatureAVXVNNIINT16 : SubtargetFeature<"avxvnniint16",
+                             "HasAVXVNNIINT16", "true",
+                             "Enable AVX-VNNI-INT16",
+                             [FeatureAVX2]>;
 def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                                [FeatureSSE2]>;

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 0b8bec8fb35fc0..10a0ccdcb02329 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2565,6 +2565,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDSYrr:
   case X86::VPDPWSSDSrr:
+  case X86::VPDPWUUDrr:
+  case X86::VPDPWUUDYrr:
+  case X86::VPDPWUUDSrr:
+  case X86::VPDPWUUDSYrr:
   case X86::VPDPBSSDSrr:
   case X86::VPDPBSSDSYrr:
   case X86::VPDPBSSDrr:

diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index e065a3169bd002..08e6e4e0627b77 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -924,6 +924,7 @@ def HasVNNI      : Predicate<"Subtarget->hasVNNI()">;
 def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
 def HasBF16      : Predicate<"Subtarget->hasBF16()">;
 def HasFP16      : Predicate<"Subtarget->hasFP16()">;
+def HasAVXVNNIINT16 : Predicate<"Subtarget->hasAVXVNNIINT16()">;
 def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">;
 def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
 def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 84e39b3107188d..6c57eceab37698 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8382,3 +8382,47 @@ defm VSM4KEY4  : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, V
 defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
 defm VSM4RNDS4  : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
 defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
+
+let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr  : I<opc, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src1, VR128:$src2, VR128:$src3),
+              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+              [(set VR128:$dst,
+                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+                        VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+              VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+  def rm  : I<opc, MRMSrcMem, (outs VR128:$dst),
+              (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+              [(set VR128:$dst,
+                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+                        VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
+              VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+  let isCommutable = IsCommutable in
+  def Yrr  : I<opc, MRMSrcReg, (outs VR256:$dst),
+               (ins VR256:$src1, VR256:$src2, VR256:$src3),
+               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+               [(set VR256:$dst,
+                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+                         VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+
+  def Yrm  : I<opc, MRMSrcMem, (outs VR256:$dst),
+               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+               [(set VR256:$dst,
+                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+                         VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
+               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPDPWSUD   : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
+defm VPDPWSUDS  : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
+defm VPDPWUSD   : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD;
+defm VPDPWUSDS  : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
+defm VPDPWUUD   : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS;
+defm VPDPWUUDS  : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;

diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0796a749bae44a..a1cedbb0308f8e 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1759,6 +1759,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave;
   Features["avxneconvert"] = HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave;
   Features["amx-complex"] = HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave;
+  Features["avxvnniint16"] = HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave;
   Features["prefetchi"]  = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
 
   bool HasLeafD = MaxLevel >= 0xd &&

diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index c1434edf043123..606370bfe57dc6 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -654,6 +654,7 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
 constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {};
 constexpr FeatureBitset ImpliedFeaturesRAOINT = {};
+constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT16 = FeatureAVX2;
 constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
 constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2;
 constexpr FeatureBitset ImpliedFeaturesAVXNECONVERT = FeatureAVX2;

diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
new file mode 100644
index 00000000000000..999c968fa80db5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)

diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
new file mode 100644
index 00000000000000..534352f3220019
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x71,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x75,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x71,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x75,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
+  ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop # encoding: [0x90]
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
+  ret <8 x i32> %ret
+}

diff  --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
new file mode 100644
index 00000000000000..099970430ea29b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
@@ -0,0 +1,339 @@
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:        vpdpwsud %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd2,0xd4
+
+# ATT:        vpdpwsud %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd2,0xd4
+
+# ATT:        vpdpwsud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwsud  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwsud  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0xd2,0x10
+
+# ATT:        vpdpwsud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwsud  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwsud  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwsud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwsud  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwsud  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0xd2,0x10
+
+# ATT:        vpdpwsud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwsud  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwsud  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vpdpwsuds %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd3,0xd4
+
+# ATT:        vpdpwsuds %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd3,0xd4
+
+# ATT:        vpdpwsuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwsuds  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwsuds  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0xd3,0x10
+
+# ATT:        vpdpwsuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwsuds  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwsuds  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwsuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwsuds  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwsuds  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0xd3,0x10
+
+# ATT:        vpdpwsuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwsuds  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwsuds  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vpdpwusd %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd2,0xd4
+
+# ATT:        vpdpwusd %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd2,0xd4
+
+# ATT:        vpdpwusd  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwusd  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwusd  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x65,0xd2,0x10
+
+# ATT:        vpdpwusd  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwusd  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwusd  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwusd  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwusd  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwusd  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x61,0xd2,0x10
+
+# ATT:        vpdpwusd  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwusd  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwusd  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vpdpwusds %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd3,0xd4
+
+# ATT:        vpdpwusds %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd3,0xd4
+
+# ATT:        vpdpwusds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwusds  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwusds  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x65,0xd3,0x10
+
+# ATT:        vpdpwusds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwusds  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwusds  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwusds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwusds  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwusds  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x61,0xd3,0x10
+
+# ATT:        vpdpwusds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwusds  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwusds  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vpdpwuud %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd2,0xd4
+
+# ATT:        vpdpwuud %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd2,0xd4
+
+# ATT:        vpdpwuud  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwuud  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwuud  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0xd2,0x10
+
+# ATT:        vpdpwuud  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwuud  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwuud  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwuud  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwuud  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwuud  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0xd2,0x10
+
+# ATT:        vpdpwuud  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwuud  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwuud  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT:        vpdpwuuds %ymm4, %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd3,0xd4
+
+# ATT:        vpdpwuuds %xmm4, %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd3,0xd4
+
+# ATT:        vpdpwuuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwuuds  291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwuuds  (%eax), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0xd3,0x10
+
+# ATT:        vpdpwuuds  -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:        vpdpwuuds  4064(%ecx), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT:        vpdpwuuds  -4096(%edx), %ymm3, %ymm2
+# INTEL:      vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT:        vpdpwuuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT:        vpdpwuuds  291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT:        vpdpwuuds  (%eax), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0xd3,0x10
+
+# ATT:        vpdpwuuds  -512(,%ebp,2), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:        vpdpwuuds  2032(%ecx), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT:        vpdpwuuds  -2048(%edx), %xmm3, %xmm2
+# INTEL:      vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff
+

diff  --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
new file mode 100644
index 00000000000000..55396db790c715
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
@@ -0,0 +1,339 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   vpdpwsud %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymm4
+0xc4,0x62,0x16,0xd2,0xe4
+
+# ATT:   vpdpwsud %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmm4
+0xc4,0x62,0x12,0xd2,0xe4
+
+# ATT:   vpdpwsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsud  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsud  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwsud  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsud  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsud  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsud  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsud  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwsud  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwsuds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymm4
+0xc4,0x62,0x16,0xd3,0xe4
+
+# ATT:   vpdpwsuds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmm4
+0xc4,0x62,0x12,0xd3,0xe4
+
+# ATT:   vpdpwsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsuds  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwsuds  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwsuds  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwsuds  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwsuds  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwsuds  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwsuds  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwsuds  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusd %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymm4
+0xc4,0x62,0x15,0xd2,0xe4
+
+# ATT:   vpdpwusd %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmm4
+0xc4,0x62,0x11,0xd2,0xe4
+
+# ATT:   vpdpwusd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusd  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusd  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwusd  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwusd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusd  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusd  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusd  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusd  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwusd  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwusds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymm4
+0xc4,0x62,0x15,0xd3,0xe4
+
+# ATT:   vpdpwusds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmm4
+0xc4,0x62,0x11,0xd3,0xe4
+
+# ATT:   vpdpwusds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusds  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwusds  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwusds  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwusds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwusds  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwusds  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwusds  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwusds  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwusds  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuud %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymm4
+0xc4,0x62,0x14,0xd2,0xe4
+
+# ATT:   vpdpwuud %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmm4
+0xc4,0x62,0x10,0xd2,0xe4
+
+# ATT:   vpdpwuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuud  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuud  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwuud  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuud  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuud  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuud  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuud  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwuud  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT:   vpdpwuuds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymm4
+0xc4,0x62,0x14,0xd3,0xe4
+
+# ATT:   vpdpwuuds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmm4
+0xc4,0x62,0x10,0xd3,0xe4
+
+# ATT:   vpdpwuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%rip), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuuds  -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT:   vpdpwuuds  4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT:   vpdpwuuds  -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT:   vpdpwuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   vpdpwuuds  291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   vpdpwuuds  (%rip), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT:   vpdpwuuds  -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT:   vpdpwuuds  2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT:   vpdpwuuds  -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff
+

diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-att.s b/llvm/test/MC/X86/avx-vnni-int16-32-att.s
new file mode 100644
index 00000000000000..63a082d2132860
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-32-att.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK:      vpdpwsud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+               vpdpwsud %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwsud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+               vpdpwsud %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsud  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsud  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10]
+               vpdpwsud  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwsud  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwsud  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwsud  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwsud  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsud  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsud  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10]
+               vpdpwsud  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwsud  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwsud  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwsud  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwsud  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+               vpdpwsuds %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+               vpdpwsuds %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsuds  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10]
+               vpdpwsuds  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwsuds  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwsuds  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwsuds  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwsuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsuds  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10]
+               vpdpwsuds  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwsuds  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwsuds  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwsuds  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwsuds  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+               vpdpwusd %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwusd %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+               vpdpwusd %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusd  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusd  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10]
+               vpdpwusd  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwusd  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwusd  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwusd  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwusd  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusd  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusd  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10]
+               vpdpwusd  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwusd  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwusd  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwusd  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwusd  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+               vpdpwusds %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwusds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+               vpdpwusds %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusds  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusds  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x10]
+               vpdpwusds  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwusds  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwusds  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwusds  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwusds  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusds  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusds  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10]
+               vpdpwusds  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwusds  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwusds  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwusds  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwusds  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+               vpdpwuud %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwuud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+               vpdpwuud %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuud  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuud  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10]
+               vpdpwuud  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwuud  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwuud  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwuud  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwuud  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuud  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuud  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10]
+               vpdpwuud  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwuud  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwuud  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwuud  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwuud  -2048(%edx), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+               vpdpwuuds %ymm4, %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+               vpdpwuuds %xmm4, %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuuds  268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuuds  291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10]
+               vpdpwuuds  (%eax), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwuuds  -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwuuds  4064(%ecx), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwuuds  -4096(%edx), %ymm3, %ymm2
+
+// CHECK:      vpdpwuuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuuds  268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuuds  291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10]
+               vpdpwuuds  (%eax), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwuuds  -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwuuds  2032(%ecx), %xmm3, %xmm2
+
+// CHECK:      vpdpwuuds  -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwuuds  -2048(%edx), %xmm3, %xmm2
+

diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-intel.s b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s
new file mode 100644
index 00000000000000..9a4b163c391f37
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+               vpdpwsud ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+               vpdpwsud xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10]
+               vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10]
+               vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+               vpdpwsuds ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+               vpdpwsuds xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+               vpdpwusd ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+               vpdpwusd xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10]
+               vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10]
+               vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+               vpdpwusds ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+               vpdpwusds xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x10]
+               vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10]
+               vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+               vpdpwuud ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+               vpdpwuud xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10]
+               vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10]
+               vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+               vpdpwuuds ymm2, ymm3, ymm4
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+               vpdpwuuds xmm2, xmm3, xmm4
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK:      vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff]
+               vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK:      vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff]
+               vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+

diff  --git a/llvm/test/MC/X86/avx-vnni-int16-64-att.s b/llvm/test/MC/X86/avx-vnni-int16-64-att.s
new file mode 100644
index 00000000000000..4616f0a5d5883f
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-64-att.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4]
+          vpdpwsud %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwsud %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4]
+          vpdpwsud %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsud  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsud  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwsud  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwsud  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwsud  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwsud  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsud  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwsud  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsud  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwsud  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwsud  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwsud  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwsud  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4]
+          vpdpwsuds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwsuds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4]
+          vpdpwsuds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsuds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsuds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwsuds  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwsuds  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsuds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsuds  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwsuds  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwsuds  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwusd %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4]
+          vpdpwusd %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwusd %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4]
+          vpdpwusd %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwusd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusd  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusd  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwusd  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwusd  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwusd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwusd  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwusd  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusd  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwusd  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusd  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwusd  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwusd  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwusd  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwusd  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwusds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4]
+          vpdpwusds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwusds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4]
+          vpdpwusds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwusds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwusds  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwusds  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwusds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwusds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwusds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwusds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusds  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwusds  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwusds  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwusds  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwusds  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwuud %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4]
+          vpdpwuud %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwuud %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xe4]
+          vpdpwuud %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuud  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuud  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwuud  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwuud  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwuud  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwuud  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuud  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwuud  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuud  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwuud  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwuud  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwuud  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwuud  -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4]
+          vpdpwuuds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwuuds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4]
+          vpdpwuuds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuuds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuuds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwuuds  4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwuuds  -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuuds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuuds  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwuuds  2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds  -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwuuds  -2048(%rdx), %xmm13, %xmm12
+

diff  --git a/llvm/test/MC/X86/avx-vnni-int16-64-intel.s b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s
new file mode 100644
index 00000000000000..a83a55d937b7df
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4]
+          vpdpwsud ymm12, ymm13, ymm4
+
+// CHECK: vpdpwsud xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4]
+          vpdpwsud xmm12, xmm13, xmm4
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4]
+          vpdpwsuds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4]
+          vpdpwsuds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4]
+          vpdpwusd ymm12, ymm13, ymm4
+
+// CHECK: vpdpwusd xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4]
+          vpdpwusd xmm12, xmm13, xmm4
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4]
+          vpdpwusds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwusds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4]
+          vpdpwusds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4]
+          vpdpwuud ymm12, ymm13, ymm4
+
+// CHECK: vpdpwuud xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xe4]
+          vpdpwuud xmm12, xmm13, xmm4
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4]
+          vpdpwuuds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4]
+          vpdpwuuds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+          vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+          vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+

diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 80d5d3b09c4d61..6168fdfd323c3b 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -4240,6 +4240,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
   {X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0},
   {X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0},
   {X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0},
+  {X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0},
+  {X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0},
+  {X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0},
+  {X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0},
+  {X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0},
+  {X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0},
+  {X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0},
+  {X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0},
+  {X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0},
+  {X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0},
+  {X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0},
+  {X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0},
   {X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0},
   {X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0},
   {X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0},


        


More information about the cfe-commits mailing list