[clang] 1c154bd - [X86] Add AVX-VNNI-INT16 instructions.
Freddy Ye via cfe-commits
cfe-commits at lists.llvm.org
Wed Jul 19 23:31:37 PDT 2023
Author: Freddy Ye
Date: 2023-07-20T14:31:16+08:00
New Revision: 1c154bd755153b5c6ada4bbed58facf23f6abffc
URL: https://github.com/llvm/llvm-project/commit/1c154bd755153b5c6ada4bbed58facf23f6abffc
DIFF: https://github.com/llvm/llvm-project/commit/1c154bd755153b5c6ada4bbed58facf23f6abffc.diff
LOG: [X86] Add AVX-VNNI-INT16 instructions.
For more details about these instructions, please refer to the latest ISE document: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
Reviewed By: pengfei, skan
Differential Revision: https://reviews.llvm.org/D155145
Added:
clang/lib/Headers/avxvnniint16intrin.h
clang/test/CodeGen/X86/avxvnniint16-builtins.c
llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
llvm/test/MC/X86/avx-vnni-int16-32-att.s
llvm/test/MC/X86/avx-vnni-int16-32-intel.s
llvm/test/MC/X86/avx-vnni-int16-64-att.s
llvm/test/MC/X86/avx-vnni-int16-64-intel.s
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/BuiltinsX86.def
clang/include/clang/Driver/Options.td
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/immintrin.h
clang/test/CodeGen/attr-target-x86.c
clang/test/Driver/x86-target-features.c
clang/test/Preprocessor/x86_target_features.c
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/include/llvm/TargetParser/X86TargetParser.def
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.td
llvm/lib/Target/X86/X86InstrSSE.td
llvm/lib/TargetParser/Host.cpp
llvm/lib/TargetParser/X86TargetParser.cpp
llvm/test/TableGen/x86-fold-tables.inc
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2982810b67fa0c..55fa4cb27d0b61 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -824,6 +824,10 @@ X86 Support
- Support ISA of ``SM4``.
* Support intrinsic of ``_mm(256)_sm4key4_epi32``.
* Support intrinsic of ``_mm(256)_sm4rnds4_epi32``.
+- Support ISA of ``AVX-VNNI-INT16``.
+ * Support intrinsic of ``_mm(256)_dpwsud(s)_epi32``.
+ * Support intrinsic of ``_mm(256)_dpwusd(s)_epi32``.
+ * Support intrinsic of ``_mm(256)_dpwuud(s)_epi32``.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 48dd9cbb1ab7a4..10ac3b3c34efd2 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2116,6 +2116,20 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX-VNNI-INT16
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
+TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
+
// AVX-NE-CONVERT
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0578bc0cba1214..dff5749b3481d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4960,6 +4960,8 @@ def mavxifma : Flag<["-"], "mavxifma">, Group<m_x86_Features_Group>;
def mno_avxifma : Flag<["-"], "mno-avxifma">, Group<m_x86_Features_Group>;
def mavxneconvert : Flag<["-"], "mavxneconvert">, Group<m_x86_Features_Group>;
def mno_avxneconvert : Flag<["-"], "mno-avxneconvert">, Group<m_x86_Features_Group>;
+def mavxvnniint16 : Flag<["-"], "mavxvnniint16">, Group<m_x86_Features_Group>;
+def mno_avxvnniint16 : Flag<["-"], "mno-avxvnniint16">, Group<m_x86_Features_Group>;
def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group<m_x86_Features_Group>;
def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group<m_x86_Features_Group>;
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index c89e1df4e52d2b..26b89619b2c94b 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -353,6 +353,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVXNECONVERT= true;
} else if (Feature == "+avxvnni") {
HasAVXVNNI = true;
+ } else if (Feature == "+avxvnniint16") {
+ HasAVXVNNIINT16 = true;
} else if (Feature == "+avxvnniint8") {
HasAVXVNNIINT8 = true;
} else if (Feature == "+serialize") {
@@ -836,6 +838,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVXNECONVERT__");
if (HasAVXVNNI)
Builder.defineMacro("__AVXVNNI__");
+ if (HasAVXVNNIINT16)
+ Builder.defineMacro("__AVXVNNIINT16__");
if (HasAVXVNNIINT8)
Builder.defineMacro("__AVXVNNIINT8__");
if (HasSERIALIZE)
@@ -964,6 +968,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avxifma", true)
.Case("avxneconvert", true)
.Case("avxvnni", true)
+ .Case("avxvnniint16", true)
.Case("avxvnniint8", true)
.Case("bmi", true)
.Case("bmi2", true)
@@ -1069,6 +1074,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avxifma", HasAVXIFMA)
.Case("avxneconvert", HasAVXNECONVERT)
.Case("avxvnni", HasAVXVNNI)
+ .Case("avxvnniint16", HasAVXVNNIINT16)
.Case("avxvnniint8", HasAVXVNNIINT8)
.Case("bmi", HasBMI)
.Case("bmi2", HasBMI2)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index d5ee63833febd2..039c05893d2692 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -145,6 +145,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasPTWRITE = false;
bool HasINVPCID = false;
bool HasENQCMD = false;
+ bool HasAVXVNNIINT16 = false;
bool HasAMXFP16 = false;
bool HasCMPCCXADD = false;
bool HasRAOINT = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 35c8b7de8db33a..f2b0c5cddcbbf8 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -160,6 +160,7 @@ set(x86_files
avxifmaintrin.h
avxintrin.h
avxneconvertintrin.h
+ avxvnniint16intrin.h
avxvnniint8intrin.h
avxvnniintrin.h
bmi2intrin.h
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
new file mode 100644
index 00000000000000..e4d342a8b45b1d
--- /dev/null
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -0,0 +1,473 @@
+/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVXVNNIINT16INTRIN_H
+#define __AVXVNNIINT16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
+ __min_vector_width__(256)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [8 x short].
+/// \param __B
+/// A 128-bit vector of [8 x unsigned short].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [16 x short].
+/// \param __B
+/// A 256-bit vector of [16 x unsigned short].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [8 x short].
+/// \param __B
+/// A 128-bit vector of [8 x unsigned short].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [16 x short].
+/// \param __B
+/// A 256-bit vector of [16 x unsigned short].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [8 x unsigned short].
+/// \param __B
+/// A 128-bit vector of [8 x short].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [16 x unsigned short].
+/// \param __B
+/// A 256-bit vector of [16 x short].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [8 x unsigned short].
+/// \param __B
+/// A 128-bit vector of [8 x short].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
+/// signed 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [16 x unsigned short].
+/// \param __B
+/// A 256-bit vector of [16 x short].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// unsigned 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x unsigned int].
+/// \param __A
+/// A 128-bit vector of [8 x unsigned short].
+/// \param __B
+/// A 128-bit vector of [8 x unsigned short].
+/// \returns
+/// A 128-bit vector of [4 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// unsigned 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x unsigned int].
+/// \param __A
+/// A 256-bit vector of [16 x unsigned short].
+/// \param __B
+/// A 256-bit vector of [16 x unsigned short].
+/// \returns
+/// A 256-bit vector of [8 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// unsigned 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x unsigned int].
+/// \param __A
+/// A 128-bit vector of [8 x unsigned short].
+/// \param __B
+/// A 128-bit vector of [8 x unsigned short].
+/// \returns
+/// A 128-bit vector of [4 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
+/// unsigned 16-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x unsigned int].
+/// \param __A
+/// A 256-bit vector of [16 x unsigned short].
+/// \param __B
+/// A 256-bit vector of [16 x unsigned short].
+/// \returns
+/// A 256-bit vector of [8 x unsigned int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
+/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT16INTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 1c9a50c7208dca..642602be14e60e 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -284,6 +284,11 @@
#include <sm4intrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXVNNIINT16__)
+#include <avxvnniint16intrin.h>
+#endif
+
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
new file mode 100644
index 00000000000000..a10ca551a15146
--- /dev/null
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +avxvnniint16 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwsud_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwsud_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwsud_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwsud_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwsuds_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwsuds_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwsuds_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwusd_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwusd_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwusd_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwusd_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwusds_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwusds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwusds_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwusds_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwuud_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwuud_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwuud_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwuud_epi32(__A, __B, __C);
+}
+
+__m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
+ // CHECK-LABEL: @test_mm_dpwuuds_epi32(
+ // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+ return _mm_dpwuuds_epi32(__A, __B, __C);
+}
+
+__m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
+ // CHECK-LABEL: @test_mm256_dpwuuds_epi32(
+ // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+ return _mm256_dpwuuds_epi32(__A, __B, __C);
+}
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index f2c79eda5d24dd..d2f09b67c7c3e4 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index e387e2ca45361f..0eb55e93546a70 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -364,6 +364,11 @@
// SM4: "-target-feature" "+sm4"
// NO-SM4: "-target-feature" "-sm4"
+// RUN: %clang --target=i386 -mavxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVXVNNIINT16 %s
+// RUN: %clang --target=i386 -mno-avxvnniint16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVXVNNIINT16 %s
+// AVXVNNIINT16: "-target-feature" "+avxvnniint16"
+// NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16"
+
// RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
// RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
// CRC32: "-target-feature" "+crc32"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 20d96d072fa4a2..7a9c8eb0f8dddc 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -700,6 +700,20 @@
// SM4NOAVX-NOT: #define __AVX__ 1
// SM4NOAVX-NOT: #define __SM4__ 1
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavxvnniint16 -x c -E -dM -o - %s | FileCheck -check-prefix=AVXVNNIINT16 %s
+
+// AVXVNNIINT16: #define __AVX2__ 1
+// AVXVNNIINT16: #define __AVXVNNIINT16__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mno-avxvnniint16 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAVXVNNIINT16 %s
+
+// NOAVXVNNIINT16-NOT: #define __AVXVNNIINT16__ 1
+
+// RUN: %clang -target i686-unknown-linux-gnu -march=atom -mavxvnniint16 -mno-avx2 -x c -E -dM -o - %s | FileCheck -check-prefix=AVXVNNIINT16NOAVX2 %s
+
+// AVXVNNIINT16NOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNIINT16NOAVX2-NOT: #define __AVXVNNIINT16__ 1
+
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s
// CRC32: #define __CRC32__ 1
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 3264ec4ab51253..2431ff52d6e2ca 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -282,6 +282,7 @@ Changes to the X86 Backend
* Support ISA of ``SHA512``.
* Support ISA of ``SM3``.
* Support ISA of ``SM4``.
+* Support ISA of ``AVX-VNNI-INT16``.
Changes to the OCaml bindings
-----------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 45aaee87fb608f..57cd1dc47bd9fc 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2053,6 +2053,67 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
[IntrNoMem]>;
+
+ def int_x86_avx2_vpdpwsud_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwsud128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwsud_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwsud256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwsuds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwsuds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwusd_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwusd128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwusd_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwusd256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwusds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwusds128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwusds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwusds256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwuud_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwuud128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwuud_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwuud256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwuuds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">,
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpwuuds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">,
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 32c7ffe4f23395..7964353045258f 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -230,6 +230,7 @@ X86_FEATURE (AVXVNNIINT8, "avxvnniint8")
X86_FEATURE (SHA512, "sha512")
X86_FEATURE (SM3, "sm3")
X86_FEATURE (SM4, "sm4")
+X86_FEATURE (AVXVNNIINT16, "avxvnniint16")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8b33ad629ec5ea..a28fc83e4e4da9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -191,6 +191,10 @@ def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8",
"HasAVXVNNIINT8", "true",
"Enable AVX-VNNI-INT8",
[FeatureAVX2]>;
+def FeatureAVXVNNIINT16 : SubtargetFeature<"avxvnniint16",
+ "HasAVXVNNIINT16", "true",
+ "Enable AVX-VNNI-INT16",
+ [FeatureAVX2]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 0b8bec8fb35fc0..10a0ccdcb02329 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2565,6 +2565,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPDPWSSDrr:
case X86::VPDPWSSDSYrr:
case X86::VPDPWSSDSrr:
+ case X86::VPDPWUUDrr:
+ case X86::VPDPWUUDYrr:
+ case X86::VPDPWUUDSrr:
+ case X86::VPDPWUUDSYrr:
case X86::VPDPBSSDSrr:
case X86::VPDPBSSDSYrr:
case X86::VPDPBSSDrr:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index e065a3169bd002..08e6e4e0627b77 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -924,6 +924,7 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">;
+def HasAVXVNNIINT16 : Predicate<"Subtarget->hasAVXVNNIINT16()">;
def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">;
def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 84e39b3107188d..6c57eceab37698 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8382,3 +8382,47 @@ defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, V
defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
+
+let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+ VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
+ VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ let isCommutable = IsCommutable in
+ def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+ VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+
+ def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
+ VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
+defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
+defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD;
+defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
+defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS;
+defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0796a749bae44a..a1cedbb0308f8e 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1759,6 +1759,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave;
Features["avxneconvert"] = HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave;
Features["amx-complex"] = HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave;
+ Features["avxvnniint16"] = HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave;
Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
bool HasLeafD = MaxLevel >= 0xd &&
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index c1434edf043123..606370bfe57dc6 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -654,6 +654,7 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {};
constexpr FeatureBitset ImpliedFeaturesRAOINT = {};
+constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT16 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVXNECONVERT = FeatureAVX2;
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
new file mode 100644
index 00000000000000..999c968fa80db5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
new file mode 100644
index 00000000000000..534352f3220019
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x71,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x75,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x71,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x75,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd2,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd2,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
+ ret <8 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128_commuted(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xf8,0x29,0x54,0x24,0xe8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x70,0xd3,0x44,0x24,0xe8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %C, <4 x i32> %B)
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256_commuted(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # encoding: [0xc5,0xfc,0x11,0x54,0x24,0xd8]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # encoding: [0xc4,0xe2,0x74,0xd3,0x44,0x24,0xd8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %C, <8 x i32> %B)
+ ret <8 x i32> %ret
+}
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
new file mode 100644
index 00000000000000..099970430ea29b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-32.txt
@@ -0,0 +1,339 @@
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i386-unknown-unknown --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: vpdpwsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd2,0xd4
+
+# ATT: vpdpwsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd2,0xd4
+
+# ATT: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%eax), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0xd2,0x10
+
+# ATT: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsud 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwsud -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%eax), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0xd2,0x10
+
+# ATT: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsud 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwsud -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0xd3,0xd4
+
+# ATT: vpdpwsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0xd3,0xd4
+
+# ATT: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0xd3,0x10
+
+# ATT: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsuds 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwsuds -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0xd3,0x10
+
+# ATT: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsuds 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwsuds -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd2,0xd4
+
+# ATT: vpdpwusd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd2,0xd4
+
+# ATT: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%eax), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x65,0xd2,0x10
+
+# ATT: vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusd 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwusd -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%eax), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x61,0xd2,0x10
+
+# ATT: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusd 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwusd -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymm4
+0xc4,0xe2,0x65,0xd3,0xd4
+
+# ATT: vpdpwusds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmm4
+0xc4,0xe2,0x61,0xd3,0xd4
+
+# ATT: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%eax), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x65,0xd3,0x10
+
+# ATT: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusds 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwusds -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%eax), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x61,0xd3,0x10
+
+# ATT: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusds 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwusds -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd2,0xd4
+
+# ATT: vpdpwuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd2,0xd4
+
+# ATT: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%eax), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0xd2,0x10
+
+# ATT: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuud 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwuud -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%eax), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0xd2,0x10
+
+# ATT: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuud 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwuud -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0xd3,0xd4
+
+# ATT: vpdpwuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0xd3,0xd4
+
+# ATT: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0xd3,0x10
+
+# ATT: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuuds 4064(%ecx), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwuuds -4096(%edx), %ymm3, %ymm2
+# INTEL: vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0xd3,0x10
+
+# ATT: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuuds 2032(%ecx), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwuuds -2048(%edx), %xmm3, %xmm2
+# INTEL: vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff
+
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
new file mode 100644
index 00000000000000..55396db790c715
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int16-64.txt
@@ -0,0 +1,339 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: vpdpwsud %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymm4
+0xc4,0x62,0x16,0xd2,0xe4
+
+# ATT: vpdpwsud %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmm4
+0xc4,0x62,0x12,0xd2,0xe4
+
+# ATT: vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%rip), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsud 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwsud -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsud (%rip), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsud -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsud 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwsud -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwsuds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymm4
+0xc4,0x62,0x16,0xd3,0xe4
+
+# ATT: vpdpwsuds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmm4
+0xc4,0x62,0x12,0xd3,0xe4
+
+# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%rip), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwsuds 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwsuds -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwsuds (%rip), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwsuds 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwsuds -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusd %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymm4
+0xc4,0x62,0x15,0xd2,0xe4
+
+# ATT: vpdpwusd %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmm4
+0xc4,0x62,0x11,0xd2,0xe4
+
+# ATT: vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%rip), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusd 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwusd -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusd (%rip), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusd -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusd 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwusd -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwusds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymm4
+0xc4,0x62,0x15,0xd3,0xe4
+
+# ATT: vpdpwusds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmm4
+0xc4,0x62,0x11,0xd3,0xe4
+
+# ATT: vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%rip), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwusds 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwusds -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwusds (%rip), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwusds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwusds 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwusds -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuud %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymm4
+0xc4,0x62,0x14,0xd2,0xe4
+
+# ATT: vpdpwuud %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmm4
+0xc4,0x62,0x10,0xd2,0xe4
+
+# ATT: vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%rip), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuud 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwuud -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuud (%rip), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuud -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuud 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwuud -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: vpdpwuuds %ymm4, %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymm4
+0xc4,0x62,0x14,0xd3,0xe4
+
+# ATT: vpdpwuuds %xmm4, %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmm4
+0xc4,0x62,0x10,0xd3,0xe4
+
+# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%rip), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpwuuds 4064(%rcx), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: vpdpwuuds -4096(%rdx), %ymm13, %ymm12
+# INTEL: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpwuuds (%rip), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpwuuds 2032(%rcx), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: vpdpwuuds -2048(%rdx), %xmm13, %xmm12
+# INTEL: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff
+
diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-att.s b/llvm/test/MC/X86/avx-vnni-int16-32-att.s
new file mode 100644
index 00000000000000..63a082d2132860
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-32-att.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+ vpdpwsud %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwsud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+ vpdpwsud %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwsud (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10]
+ vpdpwsud (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwsud 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwsud 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwsud -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwsud -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwsud (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10]
+ vpdpwsud (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwsud -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwsud 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwsud 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwsud -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwsud -2048(%edx), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+ vpdpwsuds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwsuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+ vpdpwsuds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10]
+ vpdpwsuds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwsuds 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwsuds -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10]
+ vpdpwsuds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwsuds 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwsuds -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwsuds -2048(%edx), %xmm3, %xmm2
+
+// CHECK: vpdpwusd %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+ vpdpwusd %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwusd %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+ vpdpwusd %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwusd (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10]
+ vpdpwusd (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwusd 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwusd 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwusd -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwusd -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwusd (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10]
+ vpdpwusd (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwusd -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwusd 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwusd 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwusd -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwusd -2048(%edx), %xmm3, %xmm2
+
+// CHECK: vpdpwusds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+ vpdpwusds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwusds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+ vpdpwusds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwusds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x10]
+ vpdpwusds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwusds 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwusds 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwusds -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwusds -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwusds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10]
+ vpdpwusds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwusds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwusds 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwusds 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwusds -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwusds -2048(%edx), %xmm3, %xmm2
+
+// CHECK: vpdpwuud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+ vpdpwuud %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwuud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+ vpdpwuud %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwuud (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10]
+ vpdpwuud (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwuud 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwuud 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwuud -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwuud -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwuud (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10]
+ vpdpwuud (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwuud -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwuud 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwuud 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwuud -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwuud -2048(%edx), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+ vpdpwuuds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpwuuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+ vpdpwuuds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10]
+ vpdpwuuds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwuuds 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwuuds -4096(%edx), %ymm3, %ymm2
+
+// CHECK: vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10]
+ vpdpwuuds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwuuds 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: vpdpwuuds -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwuuds -2048(%edx), %xmm3, %xmm2
+
diff --git a/llvm/test/MC/X86/avx-vnni-int16-32-intel.s b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s
new file mode 100644
index 00000000000000..9a4b163c391f37
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-32-intel.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0xd4]
+ vpdpwsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0xd4]
+ vpdpwsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x10]
+ vpdpwsud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwsud ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwsud ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x10]
+ vpdpwsud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwsud xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwsud xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0xd4]
+ vpdpwsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0xd4]
+ vpdpwsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x10]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x66,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwsuds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x10]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x62,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwsuds xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0xd4]
+ vpdpwusd ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0xd4]
+ vpdpwusd xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x10]
+ vpdpwusd ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwusd ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwusd ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x10]
+ vpdpwusd xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwusd xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwusd xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0xd4]
+ vpdpwusds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwusds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0xd4]
+ vpdpwusds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x10]
+ vpdpwusds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwusds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x65,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwusds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwusds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwusds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x10]
+ vpdpwusds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwusds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x61,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwusds xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0xd4]
+ vpdpwuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0xd4]
+ vpdpwuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x10]
+ vpdpwuud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwuud ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd2,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwuud ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x10]
+ vpdpwuud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwuud xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd2,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwuud xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0xd4]
+ vpdpwuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0xd4]
+ vpdpwuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x10]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x91,0xe0,0x0f,0x00,0x00]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x64,0xd3,0x92,0x00,0xf0,0xff,0xff]
+ vpdpwuuds ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x10]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x91,0xf0,0x07,0x00,0x00]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x60,0xd3,0x92,0x00,0xf8,0xff,0xff]
+ vpdpwuuds xmm2, xmm3, xmmword ptr [edx - 2048]
+
diff --git a/llvm/test/MC/X86/avx-vnni-int16-64-att.s b/llvm/test/MC/X86/avx-vnni-int16-64-att.s
new file mode 100644
index 00000000000000..4616f0a5d5883f
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-64-att.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4]
+ vpdpwsud %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwsud %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4]
+ vpdpwsud %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwsud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwsud 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwsud 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwsud -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwsud -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwsud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwsud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwsud 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwsud 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwsud -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwsud -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4]
+ vpdpwsuds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwsuds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4]
+ vpdpwsuds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwsuds 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwsuds -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwsuds 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwsuds -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwsuds -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwusd %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4]
+ vpdpwusd %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwusd %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4]
+ vpdpwusd %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwusd (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusd (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwusd 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwusd 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwusd -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwusd -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwusd (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusd (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwusd -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwusd 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwusd 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwusd -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwusd -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwusds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4]
+ vpdpwusds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwusds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4]
+ vpdpwusds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwusds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwusds 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwusds 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwusds -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwusds -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwusds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwusds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwusds 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwusds 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwusds -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwusds -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwuud %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4]
+ vpdpwuud %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwuud %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xe4]
+ vpdpwuud %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwuud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwuud 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwuud 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwuud -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwuud -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwuud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwuud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwuud 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwuud 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwuud -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwuud -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds %ymm4, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4]
+ vpdpwuuds %ymm4, %ymm13, %ymm12
+
+// CHECK: vpdpwuuds %xmm4, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4]
+ vpdpwuuds %xmm4, %xmm13, %xmm12
+
+// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwuuds 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwuuds -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwuuds 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: vpdpwuuds -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwuuds -2048(%rdx), %xmm13, %xmm12
+
diff --git a/llvm/test/MC/X86/avx-vnni-int16-64-intel.s b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s
new file mode 100644
index 00000000000000..a83a55d937b7df
--- /dev/null
+++ b/llvm/test/MC/X86/avx-vnni-int16-64-intel.s
@@ -0,0 +1,338 @@
+// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpwsud ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xe4]
+ vpdpwsud ymm12, ymm13, ymm4
+
+// CHECK: vpdpwsud xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xe4]
+ vpdpwsud xmm12, xmm13, xmm4
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwsud ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwsud ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwsud xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwsud xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xe4]
+ vpdpwsuds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xe4]
+ vpdpwsuds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x16,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwsuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x12,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwsuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xe4]
+ vpdpwusd ymm12, ymm13, ymm4
+
+// CHECK: vpdpwusd xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xe4]
+ vpdpwusd xmm12, xmm13, xmm4
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x15,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x15,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusd ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwusd ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwusd ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x11,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x11,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusd xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwusd xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwusd xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xe4]
+ vpdpwusds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwusds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xe4]
+ vpdpwusds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x15,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x15,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwusds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwusds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x15,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwusds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x11,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwusds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x11,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwusds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwusds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwusds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwusds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x11,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwusds xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xe4]
+ vpdpwuud ymm12, ymm13, ymm4
+
+// CHECK: vpdpwuud xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xe4]
+ vpdpwuud xmm12, xmm13, xmm4
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwuud ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd2,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwuud ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0xd2,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwuud xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd2,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwuud xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymm4
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xe4]
+ vpdpwuuds ymm12, ymm13, ymm4
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmm4
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xe4]
+ vpdpwuuds xmm12, xmm13, xmm4
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa1,0xe0,0x0f,0x00,0x00]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x14,0xd3,0xa2,0x00,0xf0,0xff,0xff]
+ vpdpwuuds ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0xd3,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x25,0x00,0x00,0x00,0x00]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa1,0xf0,0x07,0x00,0x00]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x10,0xd3,0xa2,0x00,0xf8,0xff,0xff]
+ vpdpwuuds xmm12, xmm13, xmmword ptr [rdx - 2048]
+
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 80d5d3b09c4d61..6168fdfd323c3b 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -4240,6 +4240,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0},
{X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0},
{X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0},
+ {X86::VPDPWSUDSYrr, X86::VPDPWSUDSYrm, 0},
+ {X86::VPDPWSUDSrr, X86::VPDPWSUDSrm, 0},
+ {X86::VPDPWSUDYrr, X86::VPDPWSUDYrm, 0},
+ {X86::VPDPWSUDrr, X86::VPDPWSUDrm, 0},
+ {X86::VPDPWUSDSYrr, X86::VPDPWUSDSYrm, 0},
+ {X86::VPDPWUSDSrr, X86::VPDPWUSDSrm, 0},
+ {X86::VPDPWUSDYrr, X86::VPDPWUSDYrm, 0},
+ {X86::VPDPWUSDrr, X86::VPDPWUSDrm, 0},
+ {X86::VPDPWUUDSYrr, X86::VPDPWUUDSYrm, 0},
+ {X86::VPDPWUUDSrr, X86::VPDPWUUDSrm, 0},
+ {X86::VPDPWUUDYrr, X86::VPDPWUUDYrm, 0},
+ {X86::VPDPWUUDrr, X86::VPDPWUUDrm, 0},
{X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0},
{X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0},
{X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0},
More information about the cfe-commits
mailing list